In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import duckdb
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import numpy as np

In [2]:
df = pd.read_csv('train_dataset_full.csv')

In [3]:
#impute
def fill_gender_age_based_on_user_group(row):
    if ((pd.isna(row['gender'])) and (pd.notna(row['user_group_id']))):
      row['gender'] = 'Male' if row['user_group_id'] <= 6 else 'Female'
    if ((pd.isna(row['age_level'])) and (pd.notna(row['user_group_id']))):
      row['age_level'] = row['user_group_id'] if row['user_group_id'] <= 6 else row['user_group_id'] - 6
    if ((pd.isna(row['user_group_id'])) and (pd.notna(row['age_level'])) and (pd.notna(row['gender']))):
        row['user_group_id'] = row['age_level'] if row['gender'] == 'Male' else row['age_level'] + 6
    return row

def impute(df):
  df.dropna(how='all',inplace=True)
  df.dropna(subset=['is_click'], inplace=True)

  df[df['webpage_id'] != 13787].webpage_id[df['campaign_id'].notna()] = df.groupby('campaign_id')['webpage_id'].transform(lambda x: x.fillna(method='ffill').fillna(method='bfill'))
  df.campaign_id[df['webpage_id'].notna()] = df.groupby('webpage_id')['campaign_id'].transform(lambda x: x.fillna(method='ffill').fillna(method='bfill'))

  col_lists = ['gender','age_level','user_depth','user_group_id','city_development_index']
  for col in col_lists:
    subset_filled = df.groupby('user_id')[col].agg(lambda x: x.mode()[0] if not x.mode().empty else np.nan)
    df[col] = df[col].fillna(df['user_id'].map(subset_filled))

  df = df.apply(fill_gender_age_based_on_user_group, axis=1)

  df.drop_duplicates(inplace=True)
  return df

In [4]:
#columns that must be without nulls before create new variables
def columns_cant_be_with_nulls(df,cols):
    for col in cols:
        df.dropna(subset=[col], inplace=True)
    return df

In [5]:
def create_new_variables(df):
  df.DateTime = pd.to_datetime(df.DateTime)
  df['hour'] = df.DateTime.dt.hour
  df['time_of_day'] = pd.cut(df['hour'],bins=[-np.inf, 6, 12, 18, np.inf],labels=['night', 'morning', 'afternoon', 'evening'])
  df['day_of_the_week'] = df.DateTime.dt.day_name()
  day_order = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
  df['day_of_the_week'] = pd.Categorical(df['day_of_the_week'], categories=day_order, ordered=True)


  df = df.sort_values(by=['user_id', 'DateTime'])
  df['exposures_so_far'] = df.groupby('user_id').cumcount()
  df['first_exposure'] = df['exposures_so_far'].apply(lambda x: 1 if x == 0 else 0)

  df['clicks_so_far'] = df.groupby('user_id')['is_click'].cumsum()

  df['clicked_before'] = df['clicks_so_far'].apply(lambda x: 0 if x == 0 else 1)

  df['clicks_divided_exposures'] = np.where(df['exposures_so_far'] == 0, 0, df['clicks_so_far'] / df['exposures_so_far'])
  return df


In [6]:
# columns to drop - session_id, user_id, product_category_2, datetime
def drop_columns(df,cols):
    df.drop(columns=cols, inplace=True)
    return df

In [7]:
def defualt_nan(df):
  defaults = {"gender": df["gender"].mode().iloc[0],
              "age_level": df["age_level"].mode().iloc[0],
              "user_depth": df["user_depth"].mode().iloc[0],
              #"user_group_id": df["user_group_id"].mode().iloc[0],
              "var_1": df["var_1"].mode().iloc[0]}
    
  for column, default in defaults.items():
      df[column] = df[column].fillna(default)

  return df

In [8]:
def create_dummies(df,cols):
  for col in cols:
      dummies = pd.get_dummies(df[col], prefix=col, drop_first=True)
      df = pd.concat([df, dummies], axis=1)
      df = df.drop(col, axis=1)
      for dummy in dummies:
          df[dummy] = df[dummy].astype(int)
  return df

In [9]:
df = impute(df)
df = columns_cant_be_with_nulls(df,['user_id','session_id','DateTime'])
df = create_new_variables(df)
df = drop_columns(df,['session_id','user_id','DateTime','product_category_2','city_development_index','hour','user_group_id'])
df = defualt_nan(df)
df = create_dummies(df,['gender','product','campaign_id','webpage_id','time_of_day','day_of_the_week','product_category_1'])
df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[df['webpage_id'] != 13787].webpage_id[df['campaign_id'].notna()] = df.groupby('campaign_id')['webpage_id'].transform(lambda x: x.fillna(method='ffill').fillna(method='bfill'))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.campaign_id[df['webpage_id'].notna()] = df.groupby('webpage_id')['campaign_id'].transform(lambda x: x.fillna(method='ffill').fillna(method='bfill'))


Unnamed: 0,age_level,user_depth,var_1,is_click,exposures_so_far,first_exposure,clicks_so_far,clicked_before,clicks_divided_exposures,gender_Male,...,day_of_the_week_Monday,day_of_the_week_Tuesday,day_of_the_week_Wednesday,day_of_the_week_Thursday,day_of_the_week_Friday,day_of_the_week_Saturday,product_category_1_2.0,product_category_1_3.0,product_category_1_4.0,product_category_1_5.0
189904,5.0,2.0,0.0,0.0,0,1,0.0,0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
99358,3.0,3.0,1.0,0.0,0,1,0.0,0,0.0,1,...,0,0,0,1,0,0,0,1,0,0
341084,0.0,3.0,1.0,0.0,0,1,0.0,0,0.0,1,...,0,0,1,0,0,0,1,0,0,0
364292,2.0,3.0,1.0,0.0,0,1,0.0,0,0.0,1,...,1,0,0,0,0,0,1,0,0,0
292983,4.0,2.0,1.0,0.0,0,1,0.0,0,0.0,1,...,0,0,0,1,0,0,0,0,0,1


In [10]:
df.isnull().sum(axis=0)

age_level                    0
user_depth                   0
var_1                        0
is_click                     0
exposures_so_far             0
first_exposure               0
clicks_so_far                0
clicked_before               0
clicks_divided_exposures     0
gender_Male                  0
product_B                    0
product_C                    0
product_D                    0
product_E                    0
product_F                    0
product_G                    0
product_H                    0
product_I                    0
product_J                    0
campaign_id_98970.0          0
campaign_id_105960.0         0
campaign_id_118601.0         0
campaign_id_359520.0         0
campaign_id_360936.0         0
campaign_id_396664.0         0
campaign_id_404347.0         0
campaign_id_405490.0         0
campaign_id_414149.0         0
webpage_id_6970.0            0
webpage_id_11085.0           0
webpage_id_13787.0           0
webpage_id_28529.0           0
webpage_

# models

In [13]:
!pip install dask dask-ml umap-learn lightgbm catboost

Collecting dask
  Downloading dask-2025.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting dask-ml
  Downloading dask_ml-2024.4.4-py3-none-any.whl.metadata (5.9 kB)
Collecting umap-learn
  Downloading umap_learn-0.5.7-py3-none-any.whl.metadata (21 kB)
Collecting lightgbm
  Downloading lightgbm-4.5.0-py3-none-manylinux_2_28_x86_64.whl.metadata (17 kB)
Collecting catboost
  Downloading catboost-1.2.7-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting cloudpickle>=3.0.0 (from dask)
  Downloading cloudpickle-3.1.1-py3-none-any.whl.metadata (7.1 kB)
Collecting fsspec>=2021.09.0 (from dask)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting partd>=1.4.0 (from dask)
  Downloading partd-1.4.2-py3-none-any.whl.metadata (4.6 kB)
Collecting toolz>=0.10.0 (from dask)
  Using cached toolz-1.0.0-py3-none-any.whl.metadata (5.1 kB)
Collecting dask-glm>=0.2.0 (from dask-ml)
  Downloading dask_glm-0.3.2-py2.py3-none-any.whl.metadata (1.5 kB)
Collecting distributed>

In [11]:
import numpy as np
import dask.array as da
from dask_ml.model_selection import train_test_split, GridSearchCV
from dask_ml.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA, FastICA, FactorAnalysis
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
#from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier, Lasso
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
#from umap import UMAP
from sklearn.manifold import TSNE, Isomap, LocallyLinearEmbedding
from sklearn.feature_selection import SelectKBest, mutual_info_classif, RFE
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import make_scorer, f1_score
import itertools
import time
import warnings
warnings.filterwarnings("ignore")
import pickle
import os

In [12]:

y = df.is_click
X = df.drop(columns = ['is_click'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle = True)

In [13]:

# Define Scalers
scalers = {
    "StandardScaler": StandardScaler()
}

# Define Classifiers
models = {
    "RandomForest": RandomForestClassifier(n_estimators=30,max_features=10,warm_start=True,max_depth = 15, n_jobs=-1),
    "ExtraTrees": ExtraTreesClassifier(n_estimators=100, n_jobs=-1),
    #"XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="logloss", n_jobs=-1),
    #"LightGBM": LGBMClassifier(n_jobs=-1),
    #"CatBoost": CatBoostClassifier(verbose=0),
    "GradientBoosting": GradientBoostingClassifier(),
    #"SVM_RBF": SVC(kernel="rbf"),
    #"SVM_Poly": SVC(kernel="poly"),
    #"SVM_Sigmoid": SVC(kernel="sigmoid"),
    "LogisticRegression": LogisticRegression(max_iter=500),
    #"SGDClassifier": SGDClassifier(),
    "NaiveBayes": GaussianNB(),
    "DecisionTree": DecisionTreeClassifier(),
    "KNN": KNeighborsClassifier(n_neighbors=5),
}

# Define Dimensionality Reduction Methods
dim_reductions = {
    "PCA": PCA(n_components=5),
    #"KPCA_RBF": KernelPCA(n_components=10, kernel="rbf", n_jobs=-1),
    #"KPCA_Poly": KernelPCA(n_components=10, kernel="poly", n_jobs=-1),
    #"UMAP": UMAP(n_components=10),
    #"t-SNE": TSNE(n_components=10),
    #"Isomap": Isomap(n_components=10),
    #"LaplacianEigenmaps": LocallyLinearEmbedding(n_components=10),
    #"FactorAnalysis": FactorAnalysis(n_components=10),
    #"ICA": FastICA(n_components=10),
}

# Define Feature Selection Methods
feature_selectors = {
    "MutualInfo": SelectKBest(score_func=mutual_info_classif, k=20),
    #"RFE (RandomForest)": RFE(estimator=RandomForestClassifier(n_estimators=50, n_jobs=-1), n_features_to_select=20),
    #"RFE (XGBoost)": RFE(estimator=XGBClassifier(use_label_encoder=False, eval_metric="logloss", n_jobs=-1), n_features_to_select=20),
    #"Lasso": SelectKBest(score_func=lambda X, y: np.abs(Lasso(alpha=0.01).fit(X, y).coef_), k=20)
}

# Define Feature Engineering Methods
feature_engineering = {
    "PolynomialFeatures": PolynomialFeatures(degree=2, include_bias=False),
    #"FeatureInteraction": FunctionTransformer(lambda X: X[:, :10] * X[:, 10:20]),  # Example interaction
}

# Calculate Total Combinations
total_combinations = (
    len(models) * len(dim_reductions) * len(scalers) * len(feature_selectors) * len(feature_engineering)
)
completed_tasks = 0  # Counter for completed tasks

# Run Experiments in Parallel with Progress Tracker
results = []
results_file = "results.pkl"
if os.path.exists(results_file):
    with open(results_file, "rb") as f:
        results = pickle.load(f)

existing_combinations = {entry[0] for entry in results}

param_grids = {
    "RandomForest": {
        "classifier__n_estimators": [10, 30, 50],
        "classifier__max_features": [5, 10],
        "classifier__max_depth": [10, 15, 20],
        "classifier__class_weight": ["balanced"],
    },
    "ExtraTrees": {
        "classifier__n_estimators": [50, 100, 150],
        "classifier__max_features": [5, 10],
        "classifier__max_depth": [10, 20, None],
    },
    "XGBoost": {
        "classifier__learning_rate": [0.01, 0.1, 0.3],
        "classifier__n_estimators": [50, 100, 150],
        "classifier__max_depth": [3, 6, 9],
    },
    "LightGBM": {
        "classifier__learning_rate": [0.01, 0.1],
        "classifier__n_estimators": [50, 100, 150],
        "classifier__num_leaves": [15, 31, 63],
    },
    "GradientBoosting": {
        "classifier__learning_rate": [0.01, 0.1],
        #"classifier__n_estimators": [50, 100, 150],
        #"classifier__max_depth": [3, 6, 9],
    },
    "SVM_RBF": {
        "classifier__C": [0.1, 1, 10],
        "classifier__gamma": ["scale", "auto"],
    },
    "SVM_Poly": {
        "classifier__C": [0.1, 1, 10],
        "classifier__degree": [2, 3, 4],
        "classifier__gamma": ["scale", "auto"],
    },
    "SVM_Sigmoid": {
        "classifier__C": [0.1, 1, 10],
        "classifier__gamma": ["scale", "auto"],
    },
    "LogisticRegression": {
        "classifier__C": [0.01, 0.1, 1, 10],
        "classifier__penalty": ["l2"],
    },
    "NaiveBayes": {},
    "DecisionTree": {
        "classifier__max_depth": [5, 10, 15, None],
        #"classifier__criterion": ["gini", "entropy"],
    },
    "KNN": {
        "classifier__n_neighbors": [3, 5, 7],
        #"classifier__weights": ["uniform", "distance"],
    },
}
#param_grids = {
 #   "RandomForest": {"classifier__class_weight": ["balanced"]},
  #  "GradientBoosting": {"classifier__learning_rate": [0.01, 0.1]},
   # "SVM_RBF": {"classifier__C": [0.1, 1, 10]},
    #"SVM_Poly": {"classifier__C": [0.1, 1, 10]},
    #"SVM_Sigmoid": {"classifier__C": [0.1, 1, 10]},
    #"LogisticRegression": {"classifier__C": [0.1, 1, 10]},
    #"NaiveBayes": {}, 
    #"DecisionTree": {"classifier__max_depth": [5, 10, 15]},
    #"KNN": {"classifier__n_neighbors": [3, 5, 7]},
#}
        
for model_name, model in models.items():
    for dr_name, dr in dim_reductions.items():
        for scaler_name, scaler in scalers.items():
                    combination = f"{model_name}-{dr_name}-{scaler_name}"
                    print(combination)
        
                    # Skip if the combination already exists
                    if combination in existing_combinations:
                        print(f"Skipping existing combination: {combination}")
                        continue
                
                    pipeline = Pipeline([
                        ("scaler", scaler),
                        ("dim_red", dr),
                        ("classifier", model),
                    ])
                    
                    param_grid = param_grids.get(model_name, {})
                    best_f1 = -1

                    # Dask-ML GridSearch (Parallel) - Now Optimizing for F1-score
                    for params in [dict(zip(param_grid, v)) for v in itertools.product(*param_grid.values())]:
                        pipeline.set_params(**params)
                        pipeline.fit(X_train, y_train)
                        y_pred = pipeline.predict(X_test)
                        f1 = f1_score(y_test, y_pred)

                        if f1 > best_f1:
                            best_f1 = f1
                            
                    results.append((f"{model_name}-{dr_name}-{scaler_name}", best_f1))

                    with open("results.pkl", "wb") as f:
                        pickle.dump(results, f)

                    # Update Progress
                    completed_tasks += 1
                    remaining_tasks = total_combinations - completed_tasks
                    print(f"Progress: {completed_tasks}/{total_combinations}")

# Sort and Display Best Results
results.sort(key=lambda x: x[1], reverse=True)
import pandas as pd
df_results = pd.DataFrame(results, columns=["Model", "F1 Score"])
print(df_results)

RandomForest-PCA-StandardScaler
Skipping existing combination: RandomForest-PCA-StandardScaler
ExtraTrees-PCA-StandardScaler
Skipping existing combination: ExtraTrees-PCA-StandardScaler
GradientBoosting-PCA-StandardScaler
Skipping existing combination: GradientBoosting-PCA-StandardScaler
LogisticRegression-PCA-StandardScaler
Skipping existing combination: LogisticRegression-PCA-StandardScaler
NaiveBayes-PCA-StandardScaler
Skipping existing combination: NaiveBayes-PCA-StandardScaler
DecisionTree-PCA-StandardScaler
Skipping existing combination: DecisionTree-PCA-StandardScaler
KNN-PCA-StandardScaler
Skipping existing combination: KNN-PCA-StandardScaler
                                    Model  F1 Score
0         RandomForest-PCA-StandardScaler  0.656374
1                  KNN-PCA-StandardScaler  0.652931
2           ExtraTrees-PCA-StandardScaler  0.636903
3           ExtraTrees-PCA-StandardScaler  0.635543
4           ExtraTrees-PCA-StandardScaler  0.635129
5           ExtraTrees-PCA-St

#