In [1]:
import time

import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder

In [2]:
# ==========================================
# 1. LOAD SAMPLED DATASET
# ==========================================
# Inputs a reader may want to tune, named here instead of being buried
# as magic values in the calls below.
DATA_PATH = 'vodafone_age_sample_30.csv'
TARGET_COL = 'target'
TEST_SIZE = 0.2   # 80% train / 20% test
SEED = 42

print("Loading the FULL cleaned dataset from Sprint 1...")
# Ensure you use the file from Sprint 1 without feature selection.
# NOTE(review): the file name suggests a row sample, so "FULL" presumably
# refers to the feature set rather than the row count - confirm.
df_full = pd.read_csv(DATA_PATH)

print(f"Full dataset shape: {df_full.shape}")

# Fail with a message that names the file if the label column is absent,
# instead of the generic KeyError that drop() would raise.
if TARGET_COL not in df_full.columns:
    raise KeyError(f"Column '{TARGET_COL}' not found in {DATA_PATH}")

# Features are every column except the label.
X_full = df_full.drop(columns=[TARGET_COL])
y_full = df_full[TARGET_COL]

# Stratify on the label so train and test keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X_full, y_full, test_size=TEST_SIZE, stratify=y_full, random_state=SEED
)

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

Loading the FULL cleaned dataset from Sprint 1...
Full dataset shape: (63000, 1288)
Training set shape: (50400, 1287)
Test set shape: (12600, 1287)


In [3]:
# ==========================================
# 2. ENCODING & PREPROCESSING (The Fix)
# ==========================================
print("\nSetting up ColumnTransformer for Encoding and Scaling...")

# Split the columns into a numeric group and a non-numeric group.
# 'number' matches every numeric dtype (int32, float32, uint8, ...), not only
# int64/float64, and everything else is treated as categorical. The two lists
# therefore cover every column, so ColumnTransformer's default
# remainder='drop' cannot silently discard a feature.
numeric_cols = X_train.select_dtypes(include='number').columns.tolist()
categorical_cols = X_train.select_dtypes(exclude='number').columns.tolist()

# Build the preprocessor:
# - numeric columns are standardised (StandardScaler)
# - non-numeric columns are mapped to integer codes (OrdinalEncoder);
#   categories not seen during fit are encoded as -1 instead of raising
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), categorical_cols)
    ]
)


Setting up ColumnTransformer for Encoding and Scaling...


In [4]:
# ==========================================
# 3. PIPELINES SETUP
# ==========================================
# Each pipeline gets its own unfitted copy of the preprocessor. Pipeline.fit
# fits its steps in place, so sharing one ColumnTransformer instance would
# make every pipeline point at whichever preprocessor state was fitted last.
#
# NOTE(review): in the recorded run Logistic Regression stopped at the
# iteration limit (max_iter=1000), so its scores come from a fit that had not
# converged - consider more iterations or a different solver.
pipelines_full = {
    "Logistic Regression (Full)": Pipeline([
        ('preprocessor', clone(preprocessor)),
        ('classifier', LogisticRegression(max_iter=1000, random_state=42, n_jobs=-1))
    ]),
    "Random Forest (Full)": Pipeline([
        ('preprocessor', clone(preprocessor)),
        ('classifier', RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1))
    ]),
    "LightGBM (Full)": Pipeline([
        ('preprocessor', clone(preprocessor)),
        ('classifier', LGBMClassifier(random_state=42, n_jobs=-1))
    ])
}

# Per-model metric rows, filled in by the training cell.
results_full = []

In [5]:
# ==========================================
# 4. TRAINING & EVALUATION (WITH TIMER)
# ==========================================
# Start from an empty list so that re-running this cell replaces the previous
# results instead of appending a duplicate row per model.
results_full = []

# Shared keyword arguments for the class-weighted metrics; zero_division=0
# scores a class with no predicted samples as 0 instead of warning.
weighted_kwargs = dict(average='weighted', zero_division=0)

print("\nStarting the massive training experiment...")
for name, pipeline in pipelines_full.items():
    print(f"Training {name}...")
    # perf_counter is monotonic, so the measured interval is not affected by
    # system clock adjustments. The timer covers both fit and predict.
    start_time = time.perf_counter()

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    execution_time = time.perf_counter() - start_time
    minutes, seconds = divmod(int(execution_time), 60)
    print(f"Done in {minutes}m {seconds}s")

    # One row per model; the keys become the results table's column names.
    results_full.append({
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision (W)": precision_score(y_test, y_pred, **weighted_kwargs),
        "Recall (W)": recall_score(y_test, y_pred, **weighted_kwargs),
        "F1 Score (W)": f1_score(y_test, y_pred, **weighted_kwargs),
        "Time (sec)": round(execution_time, 1)
    })


Starting the massive training experiment...
Training Logistic Regression (Full)...


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Done in 4m 20s
Training Random Forest (Full)...
Done in 0m 20s
Training LightGBM (Full)...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.219941 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 63138
[LightGBM] [Info] Number of data points in the train set: 50400, number of used features: 1263
[LightGBM] [Info] Start training from score -3.671570
[LightGBM] [Info] Start training from score -2.956054
[LightGBM] [Info] Start training from score -1.655939
[LightGBM] [Info] Start training from score -1.240752
[LightGBM] [Info] Start training from score -1.246532
[LightGBM] [Info] Start training from score -1.864586




Done in 0m 26s


In [6]:
# ==========================================
# 5. RESULTS TABLE
# ==========================================
# Turn the collected metric rows into a frame, one row per model.
unsorted_results = pd.DataFrame(results_full)

# Rank the models from best to worst weighted F1.
results_full_df = unsorted_results.sort_values(by="F1 Score (W)", ascending=False)

# Plain-text rendering without the index column.
rendered_table = results_full_df.to_string(index=False)
print("\n=== FULL DATASET EXPERIMENT RESULTS ===")
print(rendered_table)


=== FULL DATASET EXPERIMENT RESULTS ===
                     Model  Accuracy  Precision (W)  Recall (W)  F1 Score (W)  Time (sec)
           LightGBM (Full)  0.476587       0.479585    0.476587      0.476154        26.8
      Random Forest (Full)  0.437698       0.453320    0.437698      0.431442        20.8
Logistic Regression (Full)  0.401111       0.404712    0.401111      0.390435       260.9
