<a href="https://colab.research.google.com/github/mariambabarkhan/cookie-classifier/blob/main/cookie_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing Necessary Libraries

In [1]:
import joblib
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
data=pd.read_csv('/content/processed_labelled_data.csv')

In [3]:
data = data.dropna(subset=['name'])

Data Splitting

In [110]:
df = data.copy()

In [111]:
df.rename(columns={'cookie_feature': 'combined_feature'}, inplace=True)

In [112]:
df

Unnamed: 0,name,domain,label,combined_feature
0,adrum bt,okta-emea.com,1.0,adrum bt okta-emea.com
1,adrum bta,okta-emea.com,1.0,adrum bta okta-emea.com
2,ide,doubleclick.net,3.0,ide doubleclick.net
3,tpc,adform.net,3.0,tpc adform.net
4,cfduid,instana.io,2.0,cfduid instana.io
...,...,...,...,...
616241,~api analytic,paper.li,3.0,~api analytic paper.li
616242,~u,mediaalpha.com,3.0,~u mediaalpha.com
616243,~u,mediaalpha.com,3.0,~u mediaalpha.com
616244,cf bm,marketo.com,0.0,cf bm marketo.com


In [113]:
df_no_duplicates = df.drop_duplicates()
df_no_duplicates

Unnamed: 0,name,domain,label,combined_feature
0,adrum bt,okta-emea.com,1.0,adrum bt okta-emea.com
1,adrum bta,okta-emea.com,1.0,adrum bta okta-emea.com
2,ide,doubleclick.net,3.0,ide doubleclick.net
3,tpc,adform.net,3.0,tpc adform.net
4,cfduid,instana.io,2.0,cfduid instana.io
...,...,...,...,...
616240,~g$q fn fn]fnwfnvfb fy$+sx}q c > z mhg k b ~ o...,addthis.com,3.0,~g$q fn fn]fnwfnvfb fy$+sx}q c > z mhg k b ~ o...
616241,~api analytic,paper.li,3.0,~api analytic paper.li
616242,~u,mediaalpha.com,3.0,~u mediaalpha.com
616244,cf bm,marketo.com,0.0,cf bm marketo.com


In [114]:
df = df_no_duplicates

In [115]:
feature_encoder = LabelEncoder()
df['combined_feature_encoded'] = feature_encoder.fit_transform(df['combined_feature'])
df['name_encoded'] = feature_encoder.fit_transform(df['name'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['combined_feature_encoded'] = feature_encoder.fit_transform(df['combined_feature'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['name_encoded'] = feature_encoder.fit_transform(df['name'])


In [116]:
X1 = df['name_encoded'].values.reshape(-1, 1)
X2 = df['combined_feature']
y = df['label']

In [117]:
X_train_name, X_test_name, y_train, y_test = train_test_split(X1, y, test_size=0.2, random_state=42)

In [118]:
X_train_combined, X_test_combined, y_train, y_test = train_test_split(X2, y, test_size=0.2, random_state=42)

In [119]:
vectorizer_cv = CountVectorizer(
    token_pattern=r'\b\w[\w.~:@-]*\b',
)
X_train_combined_cv = vectorizer_cv.fit_transform(X_train_combined)
X_test_combined_cv = vectorizer_cv.transform(X_test_combined)

In [120]:
vectorizer_tfidf = TfidfVectorizer(
    token_pattern=r'\b\w[\w.~:@-]*\b',
    max_df=0.95,
    min_df=2,
    ngram_range=(1, 3),
    sublinear_tf=True
)
X_train_combined_tfidf = vectorizer_tfidf.fit_transform(X_train_combined)
X_test_combined_tfidf = vectorizer_tfidf.transform(X_test_combined)

##Initializing Models

Logistic Regression

In [98]:
lr_classifier1 = LogisticRegression(max_iter=1000, class_weight='balanced')

In [99]:
lr_classifier2 = LogisticRegression(max_iter=1000, class_weight='balanced')

Multinomial Naive Bayes

In [83]:
mnb_classifier = MultinomialNB()

Random Forest

In [84]:
rf_classifier = RandomForestClassifier(random_state=42)

##Training the Models

Logistic Regression

In [100]:
lr_classifier1.fit(X_train_combined_tfidf, y_train)

KeyboardInterrupt: 

In [None]:
lr_classifier2.fit(X_train_combined_cv, y_train)

Naive Bayes

In [None]:
mnb_classifier.fit(X_train_combined_tfidf, y_train)

Random Forest

In [None]:
rf_classifier.fit(X_train_name, y_train)

## Predictions on Test Data

In [None]:
y_pred_lr1 = lr_classifier1.predict(X_test_combined_tfidf)

In [None]:
y_pred_lr2 = lr_classifier2.predict(X_test_combined_cv)

In [None]:
y_pred_mnb = mnb_classifier.predict(X_test_combined_tfidf)

In [None]:
y_pred_rf = rf_classifier.predict(X_test_name)

##Performance Metrics

In [None]:
accuracy_lr_combined = accuracy_score(y_test, y_pred_lr1)
report_lr_combined = classification_report(y_test, y_pred_lr1)
confusion_matrix_lr_combined = confusion_matrix(y_test, y_pred_lr1)

print(f"Logistic Regression Accuracy (feature: combined_tfidf_ad): {accuracy_lr_combined}")
print(f"\nLogistic Regression Classification Report:\n{report_lr_combined}")
print(f"Confusion Matrix:\n{confusion_matrix_lr_combined}")

In [None]:
accuracy_lr_combined_cv = accuracy_score(y_test, y_pred_lr2)
report_lr_combined_cv = classification_report(y_test, y_pred_lr2)
confusion_matrix_lr_combined_cv = confusion_matrix(y_test, y_pred_lr2)

print(f"Logistic Regression Accuracy (feature: combined_cv): {accuracy_lr_combined_cv}")
print(f"\nLogistic Regression Classification Report:\n{report_lr_combined_cv}")
print(f"Confusion Matrix:\n{confusion_matrix_lr_combined_cv}")

In [None]:
accuracy_mnb = accuracy_score(y_test, y_pred_mnb)
report_mnb = classification_report(y_test, y_pred_mnb)
confusion_matrix_mnb = confusion_matrix(y_test, y_pred_mnb)

print(f"Naive Bayes Accuracy (feature: combined_tfidf): {accuracy_mnb}")
print(f"\nNaive Bayes Classification Report:\n{report_mnb}")
print(f"Confusion Matrix:\n{confusion_matrix_mnb}")

In [None]:
accuracy_rf = accuracy_score(y_test, y_pred_rf)
report_rf = classification_report(y_test, y_pred_rf)
confusion_matrix_rf = confusion_matrix(y_test, y_pred_rf)

print(f"Random Forest Accuracy (feature: name): {accuracy_rf}")
print(f"\nRandom Forest Classification Report:\n{report_rf}")
print(f"Confusion Matrix:\n{confusion_matrix_rf}")

##Saving the Model

In [None]:
joblib.dump(lr_classifier1, 'logistic_regression_model.pkl')
joblib.dump(mnb_classifier, 'naive_bayes_model.pkl')
joblib.dump(rf_classifier, 'random_forest_model_name.pkl')

## Classifiying New Data

In [None]:
lr_classifier_trained = joblib.load('logistic_regression_model.pkl')

In [None]:
file_path_new = '/content/cleaned_cookies.csv'
cookies_df = pd.read_csv(file_path_new)

In [None]:
cookies_df.to_csv('/content/cleaned_cookies.csv', index=False)

In [None]:
X_new = cookies_df['combined_feature']

In [None]:
X_new_tfidf = vectorizer_tfidf.transform(X_new)

In [None]:
y_pred_new = lr_classifier_trained.predict(X_new_tfidf)

In [None]:
cookies_df['label'] = y_pred_new

In [None]:
cookies_df

In [None]:
cookies_df.to_csv('/content/classified_cookies_lr.csv', index=False)