## AI CEP

In [46]:
#First we will import required packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
from sklearnex import patch_sklearn # Imported to use Intel proccessor optimizations for scikit-learn library
patch_sklearn()
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from codecarbon import EmissionsTracker
import os

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

Extension for Scikit-learn* enabled (https://github.com/uxlfoundation/scikit-learn-intelex)


In [None]:
# Dataset loading and preprcessing steps:
try:
    df = pd.read_csv("Ad_click_prediction_train (1).csv")
except FileNotFoundError:
    print("Error: 'Ad_click_prediction_train (1).csv' not found.")
    exit()

num_missing = df.isnull().sum()
print(f"amount of missing entries in dataset: {num_missing}")
#dropping prodduct_category_2 and city development_index due to high missing entries
df = df.drop('city_development_index', axis=1)
df = df.drop('product_category_2', axis=1)
df = df.drop('DateTime', axis = 1)
df = df.dropna() #excluding lines with missing entries

num_missing = df.isnull().sum()
print(f"amount of missing entries in dataset after removing missing entries: {num_missing}")

LE = LabelEncoder()
df['gender'] = LE.fit_transform(df['gender'])
df['product'] = LE.fit_transform(df['product'])
df.info()

x = df.drop('is_click', axis=1)
y = df['is_click']

# class distribution at start
print("Class distribution before undersampling:", Counter(y))

# using undersampling to redduce dataset size and make it easier to process compared to oversampling
undersampler = RandomUnderSampler(sampling_strategy='auto', random_state=90)

# performing undersampling to x and y dataframes;
x_undersample, y_undersampled = undersampler.fit_resample(x, y)

# class distribution after undersampling:
print("Class distribution after undersampling:", Counter(y_undersampled))

scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size=0.3, shuffle=True, random_state=42)

amount of missing entries in dataset: session_id                     0
DateTime                       0
user_id                        0
product                        0
campaign_id                    0
webpage_id                     0
product_category_1             0
product_category_2        365854
user_group_id              18243
gender                     18243
age_level                  18243
user_depth                 18243
city_development_index    125129
var_1                          0
is_click                       0
dtype: int64
amount of missing entries in dataset after removing missing entries: session_id            0
user_id               0
product               0
campaign_id           0
webpage_id            0
product_category_1    0
user_group_id         0
gender                0
age_level             0
user_depth            0
var_1                 0
is_click              0
dtype: int64
Class distribution before undersampling: Counter({0: 414991, 1: 30057})
Class distri

In [None]:
# Plotting dataset paramters:

In [None]:
# setting up classification recording:
def evaluate_classifier(name, model, x_train = x_train, y_train = y_train, x_test = x_test, y_test = y_test, output_dir="emission_logs"):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    emtracker = EmissionsTracker(project_name=name, output_dir=output_dir, log_level='ERROR')

    emtracker.start_task(f"Training model using {name}")
    training_start_time = time.time()
    model.fit(x_train, y_train)
    training_time = time.time() - training_start_time
    training_emissions = emtracker.stop_task()

    emtracker.start_task(f"Inference on trained model using {name}")
    infernece_start_time = time.time()
    y_pred = model.predict(x_test)
    inference_time = time.time() - infernece_start_time
    inference_emissions = emtracker.stop_task()

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    print(f"Model: {name}")
    print(f"Training time: {training_time}")
    print(f"Training emissions: {training_emissions.emissions}")
    print(f"Inference time: {inference_time}")
    print(f"Inference emissions: {inference_emissions.emissions}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("")

    return {
        'Model': name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1_score,
        'Training Time (s)': training_time,
        'TrainingeEmissions ': training_emissions.emissions,
        'Inference Time (s)' : inference_time,
        'Inference emissions' : inference_emissions.emissions
    }

classifiers = {
    "LogisticRegression": LogisticRegression(random_state=42, max_iter=1000),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "NaiveBayes": GaussianNB(),
    "SVM": SVC(probability=True, random_state=42)
}

classifiers_results = {}

In [None]:
#implementing classifiers
for name, model in classifiers.items():
    evaluate_classifier(name=name, classifier_model= model, output_dir="test1")

Model: LogisticRegression
Training time: 0.2197582721710205
Training emissions: 1.5576464165983525e-06
Inference time: 0.002202749252319336
Inference emissions: 1.0266600371353927e-06
Accuracy: 0.9318
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000
Model: KNN
Training time: 0.2424306869506836
Training emissions: 1.5085905897395994e-06
Inference time: 3.9327476024627686
Inference emissions: 3.322114067437935e-05
Accuracy: 0.9286
Precision: 0.1103
Recall: 0.0066
F1 Score: 0.0124
Model: NaiveBayes
Training time: 0.05916929244995117
Training emissions: 1.1418168105125682e-06
Inference time: 0.01430201530456543
Inference emissions: 1.0500865484389465e-06
Accuracy: 0.9318
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000
