In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score, confusion_matrix, make_scorer, ConfusionMatrixDisplay, precision_score, recall_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
import numpy as np
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.ensemble import VotingClassifier, AdaBoostClassifier, BaggingClassifier, StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
#from xgboost import XGBClassifier

In [4]:
df = pd.read_csv('/Users/antoinebertin/Documents/jedha/full_stack/projects_full_stack/crc/train_v1.csv')
df.shape

(283211, 6)

In [5]:
X = df.drop('converted', axis=1)
y = df['converted']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, stratify=y, random_state=42)

In [7]:
X_train.dtypes

country                object
age                     int64
new_user                int64
source                 object
total_pages_visited     int64
dtype: object

In [8]:
# encoding and scaling
categorical_features = ['country', 'source']
numeric_features = ['age', 'new_user', 'total_pages_visited']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features)
    ]
)

# pipeline1: LogisticRegression()

In [11]:
pipeline1 = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression()) #LogisticRegression(class_weight='balanced')
    ]
)

pipeline1_model = pipeline1.fit(X_train, y_train) #fit_transform

pipeline1_y_pred_train = pipeline1_model.predict(X_train) #transform
pipeline1_y_pred_test = pipeline1_model.predict(X_test) #transform

print("f1 score from train pipeline1:", f1_score(y_train, pipeline1_y_pred_train))
print("f1 score from test pipeline1:", f1_score(y_test, pipeline1_y_pred_test))

f1 score from train pipeline1: 0.763334226748861
f1 score from test pipeline1: 0.766525166767738


# ❌ pipeline2

In [12]:
pipeline2 = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(penalty='l2', C=0.01)) #penalty='l1' => Lasso | penalty='l2' => Ridge | penalty='elasticnet' => ElasticNet
    ]
)

pipeline2_model = pipeline2.fit(X_train, y_train)

pipeline2_y_pred_train = pipeline2_model.predict(X_train)
pipeline2_y_pred_test = pipeline2_model.predict(X_test)

print("f1 score from train pipeline1:", f1_score(y_train, pipeline2_y_pred_train))
print("f1 score from test pipeline1:", f1_score(y_test, pipeline2_y_pred_test))

f1 score from train pipeline1: 0.7519824993163796
f1 score from test pipeline1: 0.7624922887106724


# pipeline3: LogisticRegression(C=10, penalty='l1', solver='saga')

In [20]:
pipeline3 = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(C=10, penalty='l1', solver='saga')) #penalty='l1' => Lasso | penalty='l2' => Ridge | penalty='elasticnet' => ElasticNet
    ]
)

pipeline3_model = pipeline3.fit(X_train, y_train)

pipeline3_y_pred_train = pipeline3_model.predict(X_train)
pipeline3_y_pred_test = pipeline3_model.predict(X_test)

print("f1 score from train pipeline1:", f1_score(y_train, pipeline3_y_pred_train))
print("f1 score from test pipeline1:", f1_score(y_test, pipeline3_y_pred_test))

f1 score from train pipeline1: 0.763334226748861
f1 score from test pipeline1: 0.766525166767738


# ❌ pipeline5 poly on important feature

In [11]:
# Define your features
categorical_features = ['country', 'source']
numeric_features = ['age']
important_feature = 'total_pages_visited'  # This is now a string, not a list

# Define a pipeline for the important feature: Apply PolynomialFeatures followed by StandardScaler
important_feature_pipeline = Pipeline([
    ('poly', PolynomialFeatures(degree=2)),
    ('scale', StandardScaler())
])

# Update the preprocessor to handle transformations correctly
preprocessor_poly = ColumnTransformer(
    transformers=[
        # Apply the pipeline to the important feature
        ('important_poly_scale', important_feature_pipeline, [important_feature]),
        # Scale the rest of the numeric features
        ('std_scaler', StandardScaler(), numeric_features),
        # Apply OneHotEncoder to the categorical features
        ('cat', OneHotEncoder(drop='first'), categorical_features)
    ]
)

In [12]:
# Define the pipeline with the updated preprocessor
pipeline5 = Pipeline([
    ('preprocessor', preprocessor_poly),
    ('classifier', LogisticRegression(C=10, penalty='l1', solver='saga'))
])

# Fit the model
pipeline5_model = pipeline5.fit(X_train, y_train)

# Make predictions
pipeline5_y_pred_train = pipeline5_model.predict(X_train)
pipeline5_y_pred_test = pipeline5_model.predict(X_test)

# Evaluate the model
print("F1 score from train pipeline5:", f1_score(y_train, pipeline5_y_pred_train))
print("F1 score from test pipeline5:", f1_score(y_test, pipeline5_y_pred_test))

F1 score from train pipeline5: 0.7316706525452683
F1 score from test pipeline5: 0.7473618870266916


# ❌ grid on pipeline1 for regularisation

In [None]:
param_grid_l1_l2_adjusted = {
    'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'classifier__penalty': ['l1', 'l2'],
    'classifier__solver': ['liblinear', 'saga'],  # 'liblinear' works well with 'l1' and 'l2'; 'saga' also supports 'l1' and 'l2'
    'classifier__class_weight': [None, 'balanced']  # Adding class weight adjustment for imbalance
}

In [None]:
grid_search = GridSearchCV(pipeline1, param_grid_l1_l2_adjusted, cv=5, scoring='f1', verbose=1, n_jobs=-1)

In [None]:
grid_search.fit(X_train, y_train)

In [None]:
# After fitting GridSearchCV
grid_search.best_estimator_.named_steps['classifier']

In [None]:
print("Best cross-validation score (Train only) F1:", grid_search.best_score_)

Best cross-validation score (Train only) F1: 0.7629758536462616


In [None]:
best_index = grid_search.best_index_ # index of the best std
std = grid_search.cv_results_['std_test_score'][best_index]
print(f'Best std: {std:.3f}')

0.007345604372722851

In [None]:
y_pred_train = grid_search.best_estimator_.predict(X_train)
y_pred_test = grid_search.best_estimator_.predict(X_test)

In [None]:
print("f1 score from train grid:", f1_score(y_train, y_pred_train))
print("f1 score from test grid:", f1_score(y_test, y_pred_test))

f1 score from train grid: 0.7629328957595648
f1 score from test grid: 0.7653429602888087


# ❌ SMOTE

In [None]:
# Assuming 'preprocessor' is defined elsewhere in your code
# Adjust your pipeline to include SMOTE
pipeline1_smote = ImbPipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('smote', SMOTE(random_state=42)),
            # Add the SMOTE step here
        ('classifier', LogisticRegression(C=10, penalty='l1', solver='saga'))  # You can still specify class_weight if needed
    ]
)

# Fit the pipeline on the original training data
pipeline1_smote_model = pipeline1_smote.fit(X_train, y_train)

# Predict on training and test sets
pipeline1_smote_y_pred_train = pipeline1_smote_model.predict(X_train)
pipeline1_smote_y_pred_test = pipeline1_smote_model.predict(X_test)

# Calculate F1 scores
print("f1 score from train pipeline1:", f1_score(y_train, pipeline1_smote_y_pred_train))
print("f1 score from test pipeline1:", f1_score(y_test, pipeline1_smote_y_pred_test))

f1 score from train pipeline1: 0.5123601242394588
f1 score from test pipeline1: 0.5071554597137816


# pipeline6: Bagging of -> DecisionTreeClassifier(max_depth=10, min_samples_leaf=7, min_samples_split=10, random_state=42)

In [10]:
dt = DecisionTreeClassifier(max_depth=10, min_samples_leaf=7, min_samples_split=10) #hyperparameters found in a grid performed on the cloud
bagging_model = BaggingClassifier(estimator=dt, n_estimators = 20)

# Save predictions in CSV

In [42]:
test = pd.read_csv('/Users/antoinebertin/Documents/jedha/full_stack/projects_full_stack/crc/conversion_data_test.csv')

In [43]:
predictions = pipeline3_model.predict(test)
predictions_df = pd.DataFrame(predictions, columns=['Prediction'])

In [45]:
predictions_df.to_csv('conversion_data_test_predictions_ANTOINE-logreg.csv', index=False)