<a href="https://www.kaggle.com/code/omarmostafataha/ad-click-prediction-98-f1-score?scriptVersionId=143401129" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Importing Libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import plotly.express as px

import os
# Print the full path of every file under the Kaggle input directory so the
# dataset location is visible before loading it.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# Data Overview

In [None]:
df = pd.read_csv('/kaggle/input/advertising/advertising.csv')
df.head()

In [None]:
df.info()

In [None]:
df['Timestamp'] = pd.to_datetime(df['Timestamp'])

In [None]:
df.isna().sum()

In [None]:
df.duplicated().sum()

In [None]:
df.select_dtypes('object').nunique()

In [None]:
df.drop(columns=['Ad Topic Line','City'],inplace=True)

In [None]:
df.describe()

# Features Distribution

In [None]:
# Numeric predictors only; the target column is excluded from the feature list.
num_features = df.select_dtypes('number').columns.drop('Clicked on Ad')
# Size the grid from the number of features instead of hard-coding 5, so the
# cell keeps working if a numeric column is added or removed upstream.
n_plots = len(num_features)
fig,ax = plt.subplots(1,n_plots,figsize=(5*n_plots,5),squeeze=False)
ax = ax.ravel()  # squeeze=False always yields a 2-D array; flatten it for indexing
for i,col in enumerate(num_features):
    # Histogram with a KDE overlay for each numeric feature.
    sns.histplot(data=df,x=col,ax=ax[i],kde=True)
    ax[i].set_title(f'{col} Distribution')

In [None]:
# Row count per country, drawn on a very wide canvas with rotated tick labels.
fig_country, ax_country = plt.subplots(figsize=(50,15))
sns.countplot(x='Country',data=df,ax=ax_country)
plt.setp(ax_country.get_xticklabels(),rotation=60);

# Target Distribution

In [None]:
# Number of rows in each class of the target column.
sns.countplot(x='Clicked on Ad',data=df);

The target classes are balanced.

# Features Correlation with Target

In [None]:
# One panel per numeric feature, split by the target so class separation is visible.
# The grid is sized from `num_features` rather than a hard-coded 5.
n_plots = len(num_features)
fig,ax = plt.subplots(1,n_plots,figsize=(5*n_plots,5),squeeze=False)
ax = ax.ravel()  # squeeze=False always yields a 2-D array; flatten it for indexing
for i,col in enumerate(num_features):
    sns.histplot(data=df,x=col,ax=ax[i],kde=True,hue='Clicked on Ad')
    ax[i].set_title(f'{col} Distribution')

In [None]:
# Correlate numeric columns only: `df` still contains the text column `Country`
# and the datetime column `Timestamp`, and pandas >= 2.0 raises when
# DataFrame.corr() is given non-numeric data.
sns.heatmap(df.select_dtypes('number').corr(),annot=True,cmap='Blues');

# Model Building

In [None]:
def model_evaluation(model, X_test, y_test, color='Blues'):
    """
    Evaluate a fitted classifier on the held-out test set.

    Prints the classification report and draws the confusion matrix as an
    annotated heatmap.

    Args:
        model: The trained machine learning model (must expose `predict`).
        X_test: The test data.
        y_test: The true labels for the test data.
        color: The color map to be used for plotting the confusion matrix.

    Returns:
        None
    """
    # Make predictions on the test set
    y_pred_test = model.predict(X_test)

    # Classification report
    print('--------------------------------------------------------------')
    print("Classification Report:")
    print(classification_report(y_test, y_pred_test))
    print('--------------------------------------------------------------')
    # Confusion matrix
    plt.figure(figsize=(5, 4))
    # fmt='d' prints the cells as plain integers; seaborn's default '.2g'
    # format renders counts of 100 or more in scientific notation (e.g. 1e+02).
    sns.heatmap(confusion_matrix(y_test, y_pred_test), annot=True, fmt='d', cmap=color)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')
    plt.show()

In [None]:
def model_tunning(model,X_train,y_train,parameters):
    """
    Tune a model with a 5-fold grid search scored by F1.

    The best parameter combination and its mean cross-validated F1 score
    are printed before the winning estimator is handed back.

    Args:
        model: The machine learning model to be tuned.
        X_train: The training data.
        y_train: The target labels.
        parameters: The grid of hyperparameters to search over.

    Returns:
        The best estimator found during the grid search.
    """
    searcher = GridSearchCV(
        estimator=model,
        param_grid=parameters,
        scoring='f1',
        cv=5,
    )
    searcher.fit(X_train,y_train)

    print("Best parameters are: ",searcher.best_params_)
    print('Mean cross-validated f1 score of the best estimator is: ',searcher.best_score_)

    tuned_model = searcher.best_estimator_
    return tuned_model

In [None]:
def cross_validation(model,X_train,y_train,n):
    """
    Score a model with shuffled, stratified k-fold cross-validation.

    Prints the per-fold F1 scores together with their mean and standard
    deviation.

    Args:
        model: The machine learning model to be evaluated.
        X_train: The training data.
        y_train: The target labels.
        n: The number of stratified folds.

    Returns:
        None
    """
    # Fixed seed so the folds are the same for every model being compared.
    folds = StratifiedKFold(n_splits=n,shuffle=True,random_state=42)
    fold_scores = cross_val_score(model,X_train,y_train,scoring='f1',cv=folds)

    print('Scoring Metric: f1')
    print('Cross Validation Scores: ',fold_scores)
    print('Scores Mean: ',fold_scores.mean())
    print('Scores Standard Deviation: ',fold_scores.std())

In [None]:
from sklearn.model_selection import train_test_split , GridSearchCV , StratifiedKFold , cross_val_score , cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler 
from category_encoders import BinaryEncoder
from sklearn.metrics import confusion_matrix , classification_report , roc_auc_score , roc_curve ,f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier , plot_tree
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier , RandomForestClassifier , AdaBoostClassifier , GradientBoostingClassifier
from xgboost import XGBClassifier

import pickle

# Data Splitting and Preprocessing

In [None]:
features = df.columns.drop(['Clicked on Ad','Timestamp','Country'])
target = 'Clicked on Ad'

X = df[features]
y = df[target]

In [None]:
X_train , X_test , y_train , y_test = train_test_split(X,y,test_size=0.2,random_state=42,stratify=y)

In [None]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# 1. Logistic Regression

In [None]:
cross_validation(LogisticRegression(),X_train,y_train,5)

In [None]:
param_grid_log_reg = {
    'C': [0.001,0.01,0.1, 1, 10,100],   # Inverse of regularization strength
    'penalty': ['l1', 'l2'],  # Regularization penalty type
    'solver': ['liblinear', 'saga'],  # Solver algorithm
}
log_reg = model_tunning(LogisticRegression(),X_train,y_train,param_grid_log_reg)

In [None]:
model_evaluation(log_reg,X_test,y_test,'Reds')

# 2. Support Vector Classifier

In [None]:
cross_validation(SVC(),X_train,y_train,5)

In [None]:
param_grid_svc = {
    'C': [0.01,0.1, 1, 10,100],  # Regularization parameter
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],  # Kernel function
    'degree': [2, 3, 4, 5],  # Degree of the polynomial kernel (for 'poly' kernel)
    'gamma': ['scale', 'auto'],  # Kernel coefficient for 'rbf', 'poly', and 'sigmoid' kernels
}
svc = model_tunning(SVC(probability=True),X_train,y_train,param_grid_svc)

In [None]:
model_evaluation(svc,X_test,y_test,'Greens')

# 3. KNN Classifier

In [None]:
cross_validation(KNeighborsClassifier(),X_train,y_train,5)

In [None]:
param_grid_knn = {
    'n_neighbors':np.arange(1,21,2)
}
knn = model_tunning(KNeighborsClassifier(),X_train,y_train,param_grid_knn)

In [None]:
model_evaluation(knn,X_test,y_test,'Blues')

# 4. Decision Tree

In [None]:
cross_validation(DecisionTreeClassifier(),X_train,y_train,5)

In [None]:
param_grid_dec = {
    'criterion': ['gini', 'entropy'],  # The function to measure the quality of a split
    'max_depth': np.arange(1,30),  # The maximum depth of the tree
    'min_samples_split': np.arange(2,10),  # The minimum number of samples required to split an internal node
}
dec_tree = model_tunning(DecisionTreeClassifier(),X_train,y_train,param_grid_dec)

In [None]:
model_evaluation(dec_tree,X_test,y_test,'Reds')

In [None]:
# Draw the tuned decision tree on a large canvas so node text stays readable.
plt.figure(figsize=(50,25))
plot_tree(
    dec_tree,
    # Pass a plain list: some scikit-learn releases only accept a list here
    # and reject a pandas Index with an InvalidParameterError.
    feature_names=list(X.columns),
    class_names=['Not Clicked','Clicked'],
    filled=True,
    rounded=True,
    fontsize=16
);

In [None]:
# Decision-tree feature importances, largest first.
tree_importances = pd.Series(data=dec_tree.feature_importances_,index=X.columns)
feat_imp = tree_importances.sort_values(ascending=False)
ax_imp = sns.barplot(x=feat_imp,y=feat_imp.index)
ax_imp.set_title('Feature Importances');

# 5. Voting Classifier

In [None]:
vot_clf = VotingClassifier(
    estimators=
    [
    ('lr',LogisticRegression()),
    ('svc',SVC(probability=True)),
    ('knn',KNeighborsClassifier()),
    ('dt',DecisionTreeClassifier())                       
    ],
    voting='soft'
)
vot_clf

In [None]:
cross_validation(vot_clf,X_train,y_train,5)

In [None]:
vot_clf.fit(X_train,y_train)

In [None]:
model_evaluation(vot_clf,X_test,y_test,'Greens')

# 6. Random Forest Classifier

In [None]:
rfc = RandomForestClassifier(n_estimators=100,n_jobs=-1,random_state=42)

In [None]:
cross_validation(rfc,X_train,y_train,5)

In [None]:
rfc.fit(X_train,y_train)

In [None]:
model_evaluation(rfc,X_test,y_test)

In [None]:
feat_imp = pd.Series(rfc.feature_importances_,index=X.columns).sort_values(ascending=False)
sns.barplot(x=feat_imp,y=feat_imp.index);
plt.title('Feature Importances');

# 7. XGBoost Classifier

In [None]:
xgb_clf = XGBClassifier(n_estimators = 100 ,n_jobs = -1 ,random_state = 42)

In [None]:
cross_validation(xgb_clf,X_train,y_train,5)

In [None]:
xgb_clf.fit(X_train,y_train)

In [None]:
model_evaluation(xgb_clf,X_test,y_test,'Reds')

In [None]:
feat_imp = pd.Series(xgb_clf.feature_importances_,index=X.columns).sort_values(ascending=False)
sns.barplot(x=feat_imp,y=feat_imp.index);
plt.title('Feature Importances');

# 8. AdaBoost Classifier

In [None]:
# Boost depth-1 stumps rather than unrestricted trees: a fully grown tree
# typically fits the training data with zero error, at which point AdaBoost
# stops after the first round and the "ensemble" is just one decision tree.
ada_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),n_estimators=100,random_state=42)

In [None]:
cross_validation(ada_clf,X_train,y_train,5)

In [None]:
ada_clf.fit(X_train,y_train)

In [None]:
model_evaluation(ada_clf,X_test,y_test,'Greens')

In [None]:
feat_imp = pd.Series(ada_clf.feature_importances_,index=X.columns).sort_values(ascending=False)
sns.barplot(x=feat_imp,y=feat_imp.index);
plt.title('Feature Importances');

# Conclusion

<ul>
<li>The best estimator is the Support Vector Classifier, with a mean cross-validated F1 score of 0.967 on the training set and an F1 score of 0.98 on the test set.</li>
<li>Daily internet usage and daily time spent on site have a very strong negative correlation with the target.</li>
<li>Age and area income have a moderate positive correlation with the target.</li>
<li>Gender has no effect on the target.</li>
</ul>

In [None]:
# Persist the tuned SVC, the best-performing model in this notebook.
with open("best_estimator.pkl",'wb') as file:
    pickle.dump(svc,file)

# The SVC was trained on standardized features, so the fitted scaler is saved
# alongside it; without the scaler the pickled model cannot score raw inputs.
with open("scaler.pkl",'wb') as file:
    pickle.dump(scaler,file)