# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('whitegrid')

import warnings
warnings.filterwarnings('ignore')

Dataset link: https://www.kaggle.com/datasets/gabrielsantello/advertisement-click-on-ad

In [None]:
# Read the raw CSV and work on a copy, so `ads` keeps the data exactly as loaded.
ads = pd.read_csv('advertising.csv')
df = ads.copy()
# Preview the first rows of the working frame.
df.head()

# Exploratory Data Analysis

In [None]:
# Overview of the working frame's columns (names, dtypes, non-null counts).
df.info()

In [None]:
# Parse 'Timestamp' into datetimes; linePlot() later reads month/day/hour from it.
df['Timestamp'] = pd.to_datetime(df['Timestamp'])

In [None]:
# Summary statistics, transposed so that each described column becomes a row.
df.describe().T

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:
# How many rows fall into each value of the target column.
df['Clicked on Ad'].value_counts()

In [None]:
# Bar chart of the target counts.
sns.countplot(x='Clicked on Ad', data=df)

In [None]:
# Pairwise plots coloured by the target; the 'Male' column is left out of the grid.
sns.pairplot(data = df.drop("Male",axis=1), hue='Clicked on Ad')

In [None]:
# Time spent on site against age, coloured by whether the ad was clicked.
sns.scatterplot(x='Daily Time Spent on Site', y='Age', hue='Clicked on Ad', data=df)

In [None]:
# Columns iterated by boxPlot(), distPlot() and cal() below (one 2x2 panel / report section each).
numerical_cols= ['Daily Time Spent on Site','Daily Internet Usage','Area Income','Age']


In [None]:
def boxPlot(df):
    """Draw one box plot per ``numerical_cols`` entry, split by 'Clicked on Ad'.

    The panels are placed on a 2x2 grid of the current matplotlib figure and
    the spacing between them is widened afterwards.
    """
    for position, column in enumerate(numerical_cols, start=1):
        plt.subplot(2, 2, position)
        sns.boxplot(data=df, x='Clicked on Ad', y=column, palette='coolwarm_r')

    plt.subplots_adjust(left=0.1, right=0.9, wspace=0.4, hspace=0.4)
 
 
def distPlot(df):
    """Plot the distribution of every ``numerical_cols`` column on a 2x2 grid.

    Each panel is a 20-bin density histogram with a KDE curve overlaid, drawn
    on the current matplotlib figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column listed in ``numerical_cols``.
    """
    for i, col in enumerate(numerical_cols, start=1):
        plt.subplot(2, 2, i)
        # sns.distplot is deprecated and scheduled for removal; histplot with
        # kde=True and stat='density' is its documented replacement.
        sns.histplot(df[col], bins=20, kde=True, stat='density')
    plt.subplots_adjust(left=0.1, right=0.9, wspace=0.4, hspace=0.4)
    
def linePlot(df):
    """Plot the total number of ad clicks per month, per hour and per day of month.

    Side effect: adds 'Months', 'Days' and 'Hours' columns to ``df`` in place,
    derived from 'Timestamp'. Later cells of the notebook refer to these
    columns, so the mutation is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Clicked on Ad' and a datetime-typed 'Timestamp' column
        (the ``.dt`` accessor requires the column to be parsed beforehand).
    """
    timestamps = df['Timestamp']
    # Vectorised .dt accessors instead of a Python-level lambda per row.
    df['Months'] = timestamps.dt.month
    df['Days'] = timestamps.dt.day
    df['Hours'] = timestamps.dt.hour

    date_columns = ['Months','Hours','Days']

    for i, col in enumerate(date_columns, start=1):
        plt.subplot(2, 2, i)
        # Sum of the target per calendar unit; the group key becomes the x axis.
        clicks = df.groupby(col)['Clicked on Ad'].sum()
        sns.lineplot(data=clicks)
        plt.xlabel(col, size=14)
        plt.ylabel('Clicked on Ad', size=14)
        plt.xticks(size=12)
        plt.yticks(size=12)

    plt.suptitle('Sum of Clicked on Ad', size=16)
    plt.subplots_adjust(left=0.1, right=0.9, wspace=0.4, hspace=0.4)
    
def cal(df, cols=None):
    """Print the ad click rate (in %) for rows below and above each column's mean.

    For every column four subsets are reported: values <= mean - std,
    <= mean, >= mean and >= mean + std. The rate is the mean of
    'Clicked on Ad' inside the subset times 100, rounded to two decimals;
    an empty subset is printed as ``nan``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Clicked on Ad' and every column in ``cols``.
    cols : iterable of str, optional
        Columns to report on. Defaults to the notebook-level ``numerical_cols``.
    """
    if cols is None:
        cols = numerical_cols

    print('Clicked on Ad Rates'.center(50,'_'))

    clicked = df['Clicked on Ad']
    for col in cols:
        values = df[col]
        # Compute the column statistics once instead of once per printed line.
        mean = values.mean()
        std = values.std()

        def rate(mask):
            # Share of clicked rows inside the selected subset, as a percentage.
            return round(clicked[mask].mean() * 100, 2)

        print('\n')
        print(f'{col} <= mean - std: {rate(values <= mean - std)}%')
        print(f'{col} <= mean: {rate(values <= mean)}%')
        print(f'{col} >= mean: {rate(values >= mean)}%')
        print(f'{col} >= mean + std: {rate(values >= mean + std)}%')


In [None]:
# Box plots of the numerical columns, split by the target.
boxPlot(df)

In [None]:
# Click rates for rows below / above each numerical column's mean.
cal(df)

In [None]:
# Distribution of each numerical column.
distPlot(df)

In [None]:
# Clicks summed per month / hour / day. Note: linePlot() also adds the
# 'Months', 'Days' and 'Hours' columns to df, which later cells refer to.
linePlot(df)

In [None]:
#sns.heatmap(df.corr(), annot=True, cmap='coolwarm')

In [None]:
# openpyxl is not referenced directly below; importing it here makes a missing
# Excel writer fail early (presumably it is the engine pandas uses for .xlsx — confirm).
import openpyxl

# Derive the calendar parts from 'Timestamp' in this cell instead of relying on
# linePlot(df) having already added 'Months', 'Days' and 'Hours' to df.
timestamps = df['Timestamp']
calendar = df.assign(Months=timestamps.dt.month, Days=timestamps.dt.day, Hours=timestamps.dt.hour)

# Clicks summed per (day, hour) row and month column; combinations without data become 0.
pv_timestamp = calendar.pivot_table(values='Clicked on Ad', index=['Days','Hours'],columns='Months', aggfunc='sum').fillna(0)
pv_timestamp.to_excel('timestamp.xlsx')
pv_timestamp

# Data Preprocessing

### - Create Independent and Dependent Variables

In [None]:
# Columns kept out of the feature matrix: the target itself, the text and
# location fields, the raw timestamp and the calendar columns derived from it.
excluded_columns = ['Clicked on Ad','Ad Topic Line','City','Country','Timestamp', 'Months','Days', 'Hours']

X = df.drop(columns=excluded_columns)  # feature matrix
y = df['Clicked on Ad'].to_numpy()     # target vector

In [None]:
# Preview the feature matrix that remains after the drop.
X.head()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Fit the scaler on the training split only, then reuse the fitted scaler for the
# test split. Both names are overwritten with their scaled versions.
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
from sklearn.model_selection import cross_val_score,GridSearchCV
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score , f1_score
import pandas as pd

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
#from lightgbm import LGBMClassifier


# Model Building

In [None]:
def random_forest():
    """Return a 100-tree, entropy-criterion random forest fitted on the notebook-level X_train / y_train."""
    model = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=0)
    model.fit(X_train, y_train)
    return model

In [None]:
def decision_tree():
    """Return a gini decision tree (max depth 4, min split 4) fitted on the notebook-level X_train / y_train."""
    settings = {
        'criterion': 'gini',
        'max_depth': 4,
        'min_samples_split': 4,
        'random_state': 0,
    }
    model = DecisionTreeClassifier(**settings)
    model.fit(X_train, y_train)
    return model

In [None]:
def knn():
    """Return a 9-nearest-neighbours classifier fitted on the notebook-level X_train / y_train."""
    model = KNeighborsClassifier(n_neighbors=9)
    model.fit(X_train, y_train)
    return model

In [None]:
def naive_bayes():
    """Return a Gaussian naive Bayes classifier fitted on the notebook-level X_train / y_train."""
    model = GaussianNB()
    model.fit(X_train, y_train)
    return model

In [None]:
def mlp():
    """Return an MLP classifier (batch size 32, early stopping on) fitted on the notebook-level X_train / y_train."""
    model = MLPClassifier(batch_size=32, early_stopping=True, random_state=0)
    model.fit(X_train, y_train)
    return model

In [None]:
def logistic_reg():
    """Return a logistic regression (C=1) fitted on the notebook-level X_train / y_train."""
    model = LogisticRegression(C=1, random_state=0)
    model.fit(X_train, y_train)
    return model

In [None]:
def xgboost():
    """Return an XGBoost classifier fitted on the notebook-level X_train / y_train.

    Uses 100 trees of depth 4, learning rate 0.1 and a 0.8 row subsample.
    """
    settings = {
        'learning_rate': 0.1,
        'max_depth': 4,
        'n_estimators': 100,
        'subsample': 0.8,
    }
    model = XGBClassifier(**settings)
    model.fit(X_train, y_train)
    return model

In [None]:
# Fit every model once and keep the fitted estimators keyed by display name;
# the same keys index the `parameters` grids.
classifiers = {
    'Random Forest': random_forest(),
    'Decision Tree': decision_tree(),
    'KNN': knn(),
    'Naive Bayes': naive_bayes(),
    'MLP': mlp(),
    'Logistic Regression': logistic_reg(),
    'XGBoost': xgboost(),
}

In [None]:
# Hyper-parameter grids handed to GridSearchCV, keyed by the same names as `classifiers`.
parameters = {
    'Random Forest': {
        "max_depth": [None,5,8,10],
        "n_estimators": [100,500,1000],
        'criterion': ['entropy','gini'],
    },
    'Decision Tree': {
        'criterion': ['entropy','gini'],
        "max_depth": range(1,10),
        "min_samples_split": list(range(2,50)),
    },
    'KNN': {
        'n_neighbors': np.arange(1,20,step = 2),
    },
    'Logistic Regression': {
        'C': [1.0,2.0,3.0,4.0,5.0],
    },
    'XGBoost': {
        'n_estimators': [100, 500, 1000],
        'subsample': [0.6, 0.8, 1.0],
        'max_depth': [4, 5, 6],
        'learning_rate': [0.1, 0.01, 0.02],
    },
    'MLP': {
        'batch_size': [16,32,64],
        'early_stopping': [True,False],
    },
    # Empty grid: no parameter is varied for this model.
    'Naive Bayes': {},
}

In [None]:
def grid_search(classifiers,parameters):
    """Run a 10-fold, accuracy-scored grid search for every classifier.

    Each estimator in ``classifiers`` is searched over ``parameters[name]``
    on the notebook-level X_train / y_train; the best parameters and the best
    cross-validated accuracy are printed per model.

    Returns a dict mapping each classifier name to its best parameter set.
    """
    best_params = {}

    for name, estimator in classifiers.items():
        # Configure the search for this model.
        search = GridSearchCV(
            estimator=estimator,
            param_grid=parameters[name],
            scoring='accuracy',
            cv=10,
            n_jobs=-1,
            verbose=1,
            return_train_score=True,
        )

        # Evaluate the whole grid.
        search.fit(X_train, y_train)

        # Record and report the winner.
        best_params[name] = search.best_params_
        print(f"{name} best parameters: {search.best_params_}")
        print(f"{name} best accuracy: {search.best_score_}")

    return best_params


In [None]:
print("This process may take a while, please wait...")

# grid_search() already iterates over every classifier, so it is called exactly
# once. Wrapping it in another per-classifier loop repeated the complete search
# once per key and threw the returned best parameters away.
print("Grid Search is started")
best_params = grid_search(classifiers, parameters)
print("Grid Search is completed")
print("\n")
    
    
    
    
    


# Model Evaluation

In [None]:
def evaluate(classifiers):
    """Score every fitted classifier on the test split and summarise the results.

    For each model the function reports accuracy, precision, recall and F1 on
    the notebook-level X_test / y_test, the mean and standard deviation (in %)
    of a 10-fold cross-validation on X_train / y_train, and the four
    confusion-matrix cells.

    Parameters
    ----------
    classifiers : dict
        Maps a display name to a fitted estimator exposing ``predict``.

    Returns
    -------
    pandas.DataFrame
        One row per classifier, in the order of ``classifiers``.
    """
    columns = ['Model','Accuracy','Precision',
               'Recall', 'F1 Score' , 'CVS (Mean)' , 'CVS (STD)%',
               'TP','TN','FP','FN']
    rows = []
    for name, model in classifiers.items():
        y_pred = model.predict(X_test)

        # confusion_matrix puts true labels on rows and predictions on columns,
        # with labels sorted, so a binary matrix unravels as TN, FP, FN, TP.
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

        # Cross-validate once and reuse the scores for both summary statistics.
        cv_scores = cross_val_score(model, X_train, y_train, cv=10)

        rows.append({
            'Model': name,
            'Accuracy': accuracy_score(y_test, y_pred),
            'Precision': precision_score(y_test, y_pred),
            'Recall': recall_score(y_test, y_pred),
            'F1 Score': f1_score(y_test, y_pred),
            'CVS (Mean)': cv_scores.mean(),
            'CVS (STD)%': cv_scores.std() * 100,
            'TP': tp,
            'TN': tn,
            'FP': fp,
            'FN': fn,
        })

    # DataFrame.append was removed in pandas 2.0; build the frame once from the records.
    return pd.DataFrame(rows, columns=columns)


In [None]:

# Build the comparison table for all fitted models and display it.
results = evaluate(classifiers)
results


# Model Comparison

In [None]:
## Per-model classification report on the test split

from sklearn.metrics import classification_report

for name, model in classifiers.items():
    report = classification_report(y_test, model.predict(X_test))
    print(f"{name} classification report: \n{report}")
    

# Prediction

In [None]:
# df.describe().transpose()

In [None]:
# predictions = [[enter_values_of_each_column]]
# predictions = sc.transform(predictions)

# print(classifiers['enter_algorithm_name'].predict(predictions)[0])

# Save Model

In [None]:
# import pickle
# try:
#     with open('your_file_name', 'wb') as file:  
#         pickle.dump(classifiers['enter_algorithm_name'], file)
#     print('Model Saved')
# except:
#     print('Invalid Algorithm Name!')

# Load Model

In [None]:
# try:
#     with open('your_file_name', 'rb') as file:  
#         my_model = pickle.load(file)
#     print('Model Loaded')
# except:
#     print('Invalid Filename!')