<a href="https://colab.research.google.com/github/mariambabarkhan/cookie-classifier/blob/main/cookie_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing Necessary Libraries

In [138]:
import joblib
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [139]:
# Load the labelled cookie dataset that every later training/evaluation cell uses.
labelled_csv_path = '/content/processed_labelled_data.csv'
data = pd.read_csv(labelled_csv_path)

In [140]:
# Keep only rows that actually carry a cookie name; no other column is checked here.
has_name = data['name'].notna()
data = data[has_name]

Data Splitting

In [141]:
# Work on an independent copy so the loaded frame `data` is left untouched.
df = data.copy(deep=True)

In [142]:
# Rename the text feature column to the name the rest of the notebook expects.
df = df.rename(columns={'cookie_feature': 'combined_feature'})

In [143]:
# Show the working frame as the cell output (pandas elides the middle rows).
df

Unnamed: 0,name,domain,label,combined_feature
0,adrum bt,okta-emea.com,1.0,adrum bt okta-emea.com
1,adrum bta,okta-emea.com,1.0,adrum bta okta-emea.com
2,ide,doubleclick.net,3.0,ide doubleclick.net
3,tpc,adform.net,3.0,tpc adform.net
4,cfduid,instana.io,2.0,cfduid instana.io
...,...,...,...,...
616241,~api analytic,paper.li,3.0,~api analytic paper.li
616242,~u,mediaalpha.com,3.0,~u mediaalpha.com
616243,~u,mediaalpha.com,3.0,~u mediaalpha.com
616244,cf bm,marketo.com,0.0,cf bm marketo.com


In [144]:
# Map every distinct cookie name to an integer code.
# NOTE(review): the encoder is fitted on the whole frame, i.e. before the
# train/test split performed further down, and LabelEncoder is documented for
# target labels rather than input features - confirm this is intended.
name_encoder = LabelEncoder().fit(df['name'])
df['name_encoded'] = name_encoder.transform(df['name'])

In [145]:
# Map every distinct domain to an integer code (fitted on the whole frame).
domain_encoder = LabelEncoder().fit(df['domain'])
df['domain_encoded'] = domain_encoder.transform(df['domain'])

In [146]:
# Map every distinct combined "name domain" string to an integer code
# (fitted on the whole frame).
feature_encoder = LabelEncoder().fit(df['combined_feature'])
df['combined_feature_encoded'] = feature_encoder.transform(df['combined_feature'])

In [147]:
# Target column used by the label-encoded train/test split.
y = df.loc[:, 'label']

In [148]:
# Hold out 20% of the rows for testing the label-encoded feature set.
encoded_feature_columns = ['name_encoded', 'domain_encoded', 'combined_feature_encoded']
(
    X_train_name_encoded,
    X_test_name_encoded,
    y_train_name_encoded,
    y_test_name_encoded,
) = train_test_split(
    df[encoded_feature_columns],
    y,
    test_size=0.2,
    random_state=42,
)

In [150]:
# Split the raw combined text with the same test_size and random_state as the
# label-encoded split.
text_split = train_test_split(
    df['combined_feature'],
    df['label'],
    test_size=0.2,
    random_state=42,
)
X_train_combined, X_test_combined, y_train_combined, y_test_combined = text_split

In [151]:
# Bag-of-words counts over the combined text. A token starts with a word
# character and may continue with word characters or any of . ~ : @ -
# The vocabulary is learned from the training split only.
# NOTE(review): no model cell visible in this notebook consumes these matrices.
token_regex = r'\b\w[\w.~:@-]*\b'
vectorizer_cv = CountVectorizer(token_pattern=token_regex)
X_train_combined_cv = vectorizer_cv.fit_transform(X_train_combined)
X_test_combined_cv = vectorizer_cv.transform(X_test_combined)

In [152]:
# TF-IDF features over the combined text; these feed the logistic-regression
# and naive-Bayes models below.
tfidf_options = dict(
    token_pattern=r'\b\w[\w.~:@-]*\b',  # same token shape as the count vectorizer
    max_df=0.95,                        # ignore terms present in >95% of documents
    min_df=2,                           # ignore terms present in fewer than 2 documents
    ngram_range=(1, 3),                 # unigrams, bigrams and trigrams
    sublinear_tf=True,                  # use 1 + log(tf) instead of raw tf
)
vectorizer_tfidf_ad = TfidfVectorizer(**tfidf_options)

# Learn vocabulary and IDF weights from the training split only, then reuse
# them for the test split.
X_train_combined_tfidf_ad = vectorizer_tfidf_ad.fit_transform(X_train_combined)
X_test_combined_tfidf_ad = vectorizer_tfidf_ad.transform(X_test_combined)

## Initializing Models

Logistic Regression

In [167]:
# Logistic regression with a raised iteration cap for the solver.
lr_max_iterations = 1000
lr_classifier = LogisticRegression(max_iter=lr_max_iterations)

Multinomial Naive Bayes

In [154]:
# Multinomial naive Bayes with default settings; fitted later on the TF-IDF matrix.
mnb_classifier = MultinomialNB()

Random Forest

In [155]:
# Random forest with a fixed seed so repeated runs build the same trees.
rf_seed = 42
rf_classifier = RandomForestClassifier(random_state=rf_seed)

## Training the Models

Logistic Regression

In [168]:
# Fit logistic regression on the TF-IDF training matrix.
lr_classifier.fit(X=X_train_combined_tfidf_ad, y=y_train_combined)

Naive Bayes

In [158]:
# Fit naive Bayes on the same TF-IDF training matrix as logistic regression.
mnb_classifier.fit(X=X_train_combined_tfidf_ad, y=y_train_combined)

Random Forest

In [163]:
# Fit the random forest on the three label-encoded columns
# (name, domain, combined feature) rather than on the TF-IDF matrix.
rf_classifier.fit(X=X_train_name_encoded, y=y_train_name_encoded)

## Predictions on Test Data

In [169]:
# Logistic-regression predictions for the held-out TF-IDF rows.
y_pred_lr = lr_classifier.predict(X=X_test_combined_tfidf_ad)

In [170]:
# Naive-Bayes predictions for the held-out TF-IDF rows.
y_pred_mnb = mnb_classifier.predict(X=X_test_combined_tfidf_ad)

In [171]:
# Random-forest predictions for the held-out label-encoded rows.
y_pred_rf = rf_classifier.predict(X=X_test_name_encoded)

## Performance Metrics

In [172]:
# Score the logistic-regression predictions against the held-out labels and
# print each metric right after computing it.
accuracy_lr_combined_ad = accuracy_score(y_test_combined, y_pred_lr)
print(f"Logistic Regression Accuracy (feature: combined_tfidf_ad): {accuracy_lr_combined_ad}")

report_lr_combined_ad = classification_report(y_test_combined, y_pred_lr)
print(f"Logistic Regression Classification Report:\n{report_lr_combined_ad}")

confusion_matrix_lr_combined_ad = confusion_matrix(y_test_combined, y_pred_lr)
print(f"Confusion Matrix:\n{confusion_matrix_lr_combined_ad}")

Logistic Regression Accuracy (feature: combined_tfidf_ad): 0.9219141737458316
Logistic Regression Classification Report:
              precision    recall  f1-score   support

         0.0       0.89      0.90      0.90     24309
         1.0       0.91      0.84      0.87     16422
         2.0       0.90      0.93      0.91     29910
         3.0       0.95      0.95      0.95     52608

    accuracy                           0.92    123249
   macro avg       0.91      0.91      0.91    123249
weighted avg       0.92      0.92      0.92    123249

Confusion Matrix:
[[21897   575  1111   726]
 [ 1303 13773   731   615]
 [  662   396 27742  1110]
 [  751   454  1190 50213]]


In [173]:
# Score the naive-Bayes predictions against the held-out labels.
print()  # blank line before the report

accuracy_mnb_ad = accuracy_score(y_test_combined, y_pred_mnb)
print(f"Naive Bayes Accuracy (feature: combined_tfidf_ad): {accuracy_mnb_ad}")

report_mnb_ad = classification_report(y_test_combined, y_pred_mnb)
print(f"Naive Bayes Classification Report:\n{report_mnb_ad}")

confusion_matrix_mnb_ad = confusion_matrix(y_test_combined, y_pred_mnb)
print(f"Confusion Matrix:\n{confusion_matrix_mnb_ad}")


Naive Bayes Accuracy (feature: combined_tfidf_ad): 0.90010466616362
Naive Bayes Classification Report:
              precision    recall  f1-score   support

         0.0       0.86      0.88      0.87     24309
         1.0       0.94      0.76      0.84     16422
         2.0       0.85      0.93      0.89     29910
         3.0       0.94      0.94      0.94     52608

    accuracy                           0.90    123249
   macro avg       0.90      0.88      0.88    123249
weighted avg       0.90      0.90      0.90    123249

Confusion Matrix:
[[21378   343  1592   996]
 [ 1729 12453  1180  1060]
 [  807   208 27708  1187]
 [  807   256  2147 49398]]


In [174]:
# Score the random-forest predictions against the held-out labels of the
# label-encoded split.
accuracy_rf = accuracy_score(y_test_name_encoded, y_pred_rf)
report_rf = classification_report(y_test_name_encoded, y_pred_rf)
confusion_matrix_rf = confusion_matrix(y_test_name_encoded, y_pred_rf)

# The forest is trained on three encoded columns, not on the name alone, so the
# heading lists all of them to keep the printed result from being misread.
print(f"Random Forest Accuracy (features: name_encoded, domain_encoded, combined_feature_encoded): {accuracy_rf}")
print(f"Random Forest Classification Report:\n{report_rf}")
print(f"Confusion Matrix:\n{confusion_matrix_rf}")

Random Forest Accuracy (feature: name): 0.8983034345106249
Random Forest Classification Report:
              precision    recall  f1-score   support

         0.0       0.87      0.87      0.87     24309
         1.0       0.85      0.83      0.84     16422
         2.0       0.89      0.89      0.89     29910
         3.0       0.94      0.94      0.94     52608

    accuracy                           0.90    123249
   macro avg       0.88      0.88      0.88    123249
weighted avg       0.90      0.90      0.90    123249

Confusion Matrix:
[[21240  1026  1098   945]
 [ 1258 13593   718   853]
 [ 1040   648 26617  1605]
 [ 1012   753  1578 49265]]


## Saving the Models

In [176]:
# Persist the fitted TF-IDF vectorizer together with the models. The
# logistic-regression and naive-Bayes models only accept input transformed
# with this exact vocabulary and these IDF weights, so without the vectorizer
# file the saved models cannot be reused outside the current kernel session.
joblib.dump(vectorizer_tfidf_ad, 'tfidf_vectorizer.pkl')

# Persist the three trained classifiers. The random-forest dump stays last so
# the cell's displayed result is unchanged.
joblib.dump(lr_classifier, 'logistic_regression_model.pkl')
joblib.dump(mnb_classifier, 'naive_bayes_model.pkl')
joblib.dump(rf_classifier, 'random_forest_model.pkl')

['random_forest_model.pkl']

## Classifying New Data

In [177]:
# Reload the logistic-regression model from the file written by the save cell.
lr_model_path = 'logistic_regression_model.pkl'
lr_classifier_trained = joblib.load(lr_model_path)

In [178]:
# Read the unlabelled cookies that are to be classified.
file_path_new = '/content/cleaned_cookies.csv'
cookies_df = pd.read_csv(filepath_or_buffer=file_path_new)

In [179]:
# Text column to classify; it carries the same column name as the training text.
X_new = cookies_df.loc[:, 'combined_feature']

In [180]:
# Vectorize the new text with the TF-IDF vectorizer fitted on the training
# split. This uses the in-memory object, so the fitting cell must have been run
# in this session.
X_new_tfidf = vectorizer_tfidf_ad.transform(raw_documents=X_new)

In [181]:
# Predict a label for every new cookie with the reloaded logistic-regression model.
y_pred_new = lr_classifier_trained.predict(X=X_new_tfidf)

In [182]:
# Attach the predicted labels as a new `label` column.
cookies_df = cookies_df.assign(label=y_pred_new)

In [183]:
# Show the classified frame as the cell output (pandas elides the middle rows).
cookies_df

Unnamed: 0,name,domain,tokens,combined_feature,label
0,csrfToken,www.1024tera.com,csrf token,csrf token www.1024tera.com,0.0
1,browserid,.1024tera.com,browserid,browserid .1024tera.com,0.0
2,__bid_n,.1024tera.com,bid n,bid n .1024tera.com,3.0
3,ndut_fmt,www.1024tera.com,ndut fmt,ndut fmt www.1024tera.com,3.0
4,ab_jid,.ymg-api.terabox.com,ab jid,ab jid .ymg-api.terabox.com,3.0
...,...,...,...,...,...
141105,_ga,.zus.pl,ga,ga .zus.pl,2.0
141106,TS51554626027,www.zus.pl,ts,ts www.zus.pl,0.0
141107,LFR_SESSION_STATE_10159,www.zus.pl,lfr session state,lfr session state www.zus.pl,0.0
141108,AWSALB,www.zyxel.com,awsalb,awsalb www.zyxel.com,0.0


In [185]:
# Remove the helper text columns before exporting. `errors='ignore'` keeps the
# cell re-runnable: on a second run the columns are already gone from
# `cookies_df`, and a plain drop would raise KeyError.
cookies_df = cookies_df.drop(columns=['tokens', 'combined_feature'], errors='ignore')
cookies_df

Unnamed: 0,name,domain,label
0,csrfToken,www.1024tera.com,0.0
1,browserid,.1024tera.com,0.0
2,__bid_n,.1024tera.com,3.0
3,ndut_fmt,www.1024tera.com,3.0
4,ab_jid,.ymg-api.terabox.com,3.0
...,...,...,...
141105,_ga,.zus.pl,2.0
141106,TS51554626027,www.zus.pl,0.0
141107,LFR_SESSION_STATE_10159,www.zus.pl,0.0
141108,AWSALB,www.zyxel.com,0.0


In [186]:
# Write the classified cookies to CSV without the DataFrame index.
classified_csv_path = '/content/classified_cookies_lr.csv'
cookies_df.to_csv(classified_csv_path, index=False)