In [1]:
!pip install -q pandas numpy scikit-learn matplotlib seaborn joblib xgboost
print("Packages installed (or already present).")

Packages installed (or already present).


In [12]:
# Attach Google Drive to the Colab VM's filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [13]:
import os

# Folder under the Drive mount point that is meant to hold notebook outputs.
folder_path = "/content/drive/MyDrive/colab_outputs"

# Create it (including missing parents) unless it is already there.
if not os.path.isdir(folder_path):
    os.makedirs(folder_path, exist_ok=True)

print("Folder dibuat di:", folder_path)

Folder dibuat di: /content/drive/MyDrive/colab_outputs


In [15]:
# Upload a CSV from the local machine and load it into a DataFrame.
import io

import pandas as pd
from google.colab import files

uploaded = files.upload()

# If nothing was uploaded (e.g. the dialog was dismissed) the mapping is empty;
# fail with a clear message instead of an IndexError on the lookup below.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and choose a CSV file.")

# Only the first uploaded file is loaded; any additional files are ignored.
fn = next(iter(uploaded))
df = pd.read_csv(io.BytesIO(uploaded[fn]))
print("Loaded:", fn, "shape:", df.shape)

Saving forestfires.csv to forestfires (1).csv
Loaded: forestfires (1).csv shape: (517, 13)


In [20]:
# Quick structural look at the freshly loaded frame.
# A bare `df.head()` is only rendered when it is the last expression of a
# cell; here it is not, so it has to be displayed explicitly. `display` is
# made available by IPython in notebook sessions.
display(df.head())
df.info()
print("\nColumns:", list(df.columns))
print("\nMissing per column:\n", df.isna().sum())

# Run configuration.
# NOTE(review): the recorded column listing for this dataset contains no
# "label" column, so passing TARGET_COLUMN as `target` to run_pipeline would
# raise ValueError — confirm the intended target (None selects clustering).
TARGET_COLUMN = "label"
N_CLUSTERS = 3

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517 entries, 0 to 516
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   X       517 non-null    int64  
 1   Y       517 non-null    int64  
 2   month   517 non-null    object 
 3   day     517 non-null    object 
 4   FFMC    517 non-null    float64
 5   DMC     517 non-null    float64
 6   DC      517 non-null    float64
 7   ISI     517 non-null    float64
 8   temp    517 non-null    float64
 9   RH      517 non-null    int64  
 10  wind    517 non-null    float64
 11  rain    517 non-null    float64
 12  area    517 non-null    float64
dtypes: float64(8), int64(3), object(2)
memory usage: 52.6+ KB

Columns: ['X', 'Y', 'month', 'day', 'FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH', 'wind', 'rain', 'area']

Missing per column:
 X        0
Y        0
month    0
day      0
FFMC     0
DMC      0
DC       0
ISI      0
temp     0
RH       0
wind     0
rain     0
area     0
dtype: int64

In [21]:
import os, json
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report, mean_squared_error, r2_score, confusion_matrix
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import joblib, shutil

# Use seaborn's "whitegrid" style for the plots produced by the helpers below.
sns.set(style="whitegrid")

def ensure_dir(d):
    """Make sure directory ``d`` exists, creating missing parents as needed.

    Calling it on a directory that already exists is a no-op.
    """
    if not os.path.isdir(d):
        os.makedirs(d, exist_ok=True)

def save_fig(fig, path):
    """Write ``fig`` to ``path`` (tight bounding box, 150 dpi) and close it.

    The figure is closed even when saving fails, so a bad path or an
    unsupported format does not leave the figure open in pyplot.
    Any exception raised by ``fig.savefig`` still propagates to the caller.
    """
    try:
        fig.savefig(path, bbox_inches='tight', dpi=150)
    finally:
        plt.close(fig)

def basic_eda(df, outdir):
    """Write a JSON summary and exploratory plots for ``df`` into ``outdir``.

    Files produced:
      * ``eda_summary.json`` - shape, dtypes, missing counts and ``describe()``
      * ``missing_map.png``  - heatmap of the null mask
      * ``correlation.png``  - correlation matrix of the numeric columns
        (only when there is more than one numeric column)
      * ``dist_<col>.png``   - histogram with KDE for each of the first six
        numeric columns

    Returns the summary dict that was written to ``eda_summary.json``.
    """
    summary = {
        'shape': df.shape,
        'dtypes': df.dtypes.astype(str).to_dict(),
        'missing': df.isna().sum().to_dict(),
        'describe': df.describe(include='all').to_dict(),
    }
    summary_path = os.path.join(outdir, 'eda_summary.json')
    with open(summary_path, 'w') as fh:
        # default=str stringifies any value json cannot encode natively
        json.dump(summary, fh, default=str, indent=2)

    # Map of missing values across the whole frame
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.heatmap(df.isna(), cbar=False, ax=ax)
    ax.set_title('Missing values map')
    save_fig(fig, os.path.join(outdir, 'missing_map.png'))

    numeric = df.select_dtypes(include=[np.number])

    # A correlation matrix needs at least two numeric columns
    if numeric.shape[1] > 1:
        fig, ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(numeric.corr(), annot=True, fmt=".2f", ax=ax)
        ax.set_title('Correlation (numeric)')
        save_fig(fig, os.path.join(outdir, 'correlation.png'))

    # Distribution plots, capped at the first six numeric columns
    for column in numeric.columns[:6]:
        fig, ax = plt.subplots()
        sns.histplot(numeric[column].dropna(), kde=True, ax=ax)
        ax.set_title(f'Distribution: {column}')
        save_fig(fig, os.path.join(outdir, f'dist_{column}.png'))

    return summary

def prepare_pipeline(X):
    """Build an unfitted preprocessing ``ColumnTransformer`` for ``X``.

    Columns are split by dtype at call time:
      * numeric columns     -> median imputation, then standard scaling
      * all other columns   -> most-frequent imputation, then dense one-hot
        encoding that ignores categories not seen during fit

    Columns not covered by either group are dropped (``remainder='drop'``).

    Returns the unfitted ``ColumnTransformer``.
    """
    num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

    # scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
    # `sparse_output` and later releases removed the old name. Try the
    # current spelling first and fall back for older installations.
    try:
        ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

    num_pipe = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])
    cat_pipe = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ohe', ohe),
    ])
    preproc = ColumnTransformer(
        [('num', num_pipe, num_cols), ('cat', cat_pipe, cat_cols)],
        remainder='drop',
    )
    return preproc

def run_classification(X, y, outdir):
    """Train and evaluate two classifiers on an 80/20 split of ``X``/``y``.

    Models: logistic regression and a 200-tree random forest, each wrapped in
    a pipeline with its own preprocessing step from ``prepare_pipeline``.

    Files written into ``outdir``:
      * ``model_<name>.pkl`` for each fitted pipeline
      * ``confusion_matrix_rf.png`` and ``classification_report_rf.txt`` for
        the random forest
      * ``metrics_classification.json`` with accuracy and weighted F1

    Returns a dict ``{model_name: {'accuracy': float, 'f1_weighted': float}}``.
    """
    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, stratify=y, test_size=0.2, random_state=42)
    except ValueError as exc:
        # Stratification is impossible when a class is too rare to appear in
        # both splits; fall back to a plain random split instead of aborting.
        print(f"Stratified split failed ({exc}); falling back to a random split.")
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42)

    # Each pipeline gets its own preprocessor: Pipeline fits its steps in
    # place, so a shared instance would be refit by every model.
    pipes = {
        'logistic': Pipeline([
            ('pre', prepare_pipeline(X_train)),
            ('clf', LogisticRegression(max_iter=1000)),
        ]),
        'random_forest': Pipeline([
            ('pre', prepare_pipeline(X_train)),
            ('clf', RandomForestClassifier(n_estimators=200, random_state=42)),
        ]),
    }

    results = {}
    for name, pipe in pipes.items():
        pipe.fit(X_train, y_train)
        preds = pipe.predict(X_test)
        # Plain floats keep the metrics JSON-friendly, matching run_regression
        results[name] = {
            'accuracy': float(accuracy_score(y_test, preds)),
            'f1_weighted': float(f1_score(y_test, preds, average='weighted')),
        }
        joblib.dump(pipe, os.path.join(outdir, f'model_{name}.pkl'))

        # Detailed diagnostics are only produced for the random forest
        if name == 'random_forest':
            cm = confusion_matrix(y_test, preds)
            fig = plt.figure(figsize=(6, 5))
            sns.heatmap(cm, annot=True, fmt='d')
            plt.title('Confusion Matrix (Random Forest)')
            # confusion_matrix puts true classes on rows, predictions on columns
            plt.xlabel('Predicted')
            plt.ylabel('Actual')
            save_fig(fig, os.path.join(outdir, 'confusion_matrix_rf.png'))
            with open(os.path.join(outdir, 'classification_report_rf.txt'), 'w') as f:
                f.write(classification_report(y_test, preds))

    with open(os.path.join(outdir, 'metrics_classification.json'), 'w') as f:
        json.dump(results, f, indent=2)
    return results

def run_regression(X, y, outdir):
    """Train and evaluate two regressors on an 80/20 split of ``X``/``y``.

    Models: ordinary linear regression and a 200-tree random forest, each
    wrapped in a pipeline with its own preprocessing step from
    ``prepare_pipeline``.

    Files written into ``outdir``:
      * ``model_<name>.pkl`` for each fitted pipeline
      * ``metrics_regression.json`` with RMSE and R^2 on the test split

    Returns a dict ``{model_name: {'rmse': float, 'r2': float}}``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)

    # Each pipeline gets its own preprocessor: Pipeline fits its steps in
    # place, so a shared instance would be refit by every model.
    pipes = {
        'linear': Pipeline([
            ('pre', prepare_pipeline(X_train)),
            ('reg', LinearRegression()),
        ]),
        'random_forest': Pipeline([
            ('pre', prepare_pipeline(X_train)),
            ('reg', RandomForestRegressor(n_estimators=200, random_state=42)),
        ]),
    }

    results = {}
    for name, pipe in pipes.items():
        pipe.fit(X_train, y_train)
        preds = pipe.predict(X_test)
        # RMSE is derived from MSE so the metric is in the target's units
        rmse = np.sqrt(mean_squared_error(y_test, preds))
        r2 = r2_score(y_test, preds)
        results[name] = {'rmse': float(rmse), 'r2': float(r2)}
        joblib.dump(pipe, os.path.join(outdir, f'model_{name}.pkl'))

    with open(os.path.join(outdir, 'metrics_regression.json'), 'w') as f:
        json.dump(results, f, indent=2)
    return results

def run_clustering(df, outdir, n_clusters=3):
    """Cluster the numeric columns of ``df`` with k-means.

    Numeric columns that are entirely missing are discarded, remaining gaps
    are filled with the column median, and the features are standardised
    before fitting ``KMeans(n_clusters, random_state=42)``.

    Files written into ``outdir``:
      * ``clustered_data.csv``   - a copy of ``df`` with a ``cluster`` column
      * ``clustering_info.json`` - inertia, cluster count and silhouette

    Returns ``{'n_clusters', 'inertia', 'silhouette'}``; ``silhouette`` is
    ``None`` when it is not defined for the resulting labelling.

    Raises:
        ValueError: if ``df`` has no numeric column containing data.
    """
    num = df.select_dtypes(include=[np.number]).dropna(axis=1, how='all')
    if num.shape[1] == 0:
        # Without this check the scaler fails with a far less helpful message
        raise ValueError("Clustering requires at least one numeric column with data.")

    X = num.fillna(num.median())
    Xs = StandardScaler().fit_transform(X)

    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    labels = kmeans.fit_predict(Xs)

    df_out = df.copy()
    df_out['cluster'] = labels
    df_out.to_csv(os.path.join(outdir, 'clustered_data.csv'), index=False)

    # The silhouette score is only defined when the number of distinct labels
    # lies between 2 and n_samples - 1; outside that range it is left as None.
    sil = None
    n_found = len(set(labels))
    if 1 < n_found < len(labels):
        sil = float(silhouette_score(Xs, labels))

    inertia = float(kmeans.inertia_)
    with open(os.path.join(outdir, 'clustering_info.json'), 'w') as f:
        json.dump({'inertia': inertia, 'n_clusters': n_clusters, 'silhouette': sil}, f)
    # elbow (inertia) optional: omitted for brevity
    return {'n_clusters': n_clusters, 'inertia': inertia, 'silhouette': sil}

def run_pipeline(df, target=None, n_clusters=3, save_to_drive=None):
    """Run EDA plus either supervised modelling or clustering on ``df``.

    All artefacts go into a fresh ``outputs/<timestamp>`` directory.

    Args:
        df: input DataFrame.
        target: name of the target column. When falsy, k-means clustering is
            run instead of supervised modelling.
        n_clusters: number of clusters for the clustering branch.
        save_to_drive: optional directory; when given, the output directory
            is copied to ``<save_to_drive>/outputs_<timestamp>``.

    The task is treated as classification when the target is non-numeric or
    has at most 20 distinct values, and as regression otherwise.

    Returns:
        Path of the output directory.

    Raises:
        ValueError: if ``target`` is given but is not a column of ``df``.
    """
    # Validate before touching the filesystem so a typo in the target name
    # does not leave a half-populated output directory behind.
    if target and target not in df.columns:
        raise ValueError(f"Target {target} not found in dataset.")

    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    outdir = os.path.join('outputs', timestamp)
    ensure_dir(outdir)
    df.to_csv(os.path.join(outdir, 'dataset_copy.csv'), index=False)
    basic_eda(df, outdir)

    summary = {'timestamp': timestamp, 'rows': int(df.shape[0]), 'cols': int(df.shape[1])}
    if target:
        # Rows without a target value cannot be used for supervised learning
        labelled = df.dropna(subset=[target])
        n_dropped = len(df) - len(labelled)
        if n_dropped:
            print(f"Dropped {n_dropped} rows with missing target '{target}'.")
        y = labelled[target]
        X = labelled.drop(columns=[target])

        # Non-numeric or low-cardinality targets are treated as class labels
        prob = 'classification' if (y.dtype.kind not in 'ifu' or y.nunique() <= 20) else 'regression'
        summary['problem_type'] = prob
        runner = run_classification if prob == 'classification' else run_regression
        summary['metrics'] = runner(X, y, outdir)
    else:
        summary['clustering'] = run_clustering(df, outdir, n_clusters=n_clusters)

    with open(os.path.join(outdir, 'run_summary.json'), 'w') as f:
        json.dump(summary, f, indent=2)

    # optionally copy to Drive
    if save_to_drive:
        dst = os.path.join(save_to_drive, f'outputs_{timestamp}')
        shutil.copytree(outdir, dst)
        print("Copied outputs to Drive:", dst)
    print("Done. Outputs at:", outdir)
    return outdir

In [23]:
# Configuration for this run, passed to run_pipeline so that the constants
# and the executed call cannot drift apart.
# The recorded column listing for the loaded dataset has no "label" column,
# so no supervised target is set and the pipeline takes its clustering branch.
TARGET_COLUMN = None
N_CLUSTERS = 4
outdir = run_pipeline(df, target=TARGET_COLUMN, n_clusters=N_CLUSTERS, save_to_drive=None)
print("Outdir:", outdir)

Done. Outputs at: outputs/20250812_154408
Outdir: outputs/20250812_154408


In [24]:
# Bundle the run's output directory into a zip archive and hand it to the
# browser for download.
import shutil
from google.colab import files

zip_name = "analysis_outputs"
archive_file = f"{zip_name}.zip"

shutil.make_archive(zip_name, 'zip', outdir)
files.download(archive_file)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>