<a href="https://colab.research.google.com/github/mariambabarkhan/cookie-classifier/blob/main/cookie_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing Necessary Libraries

In [1]:
import re
import joblib
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
!gdown https://raw.githubusercontent.com/shaoormunir/purl/main/Pipeline/labelling_scripts/declared_cookie_labels.csv

Downloading...
From: https://raw.githubusercontent.com/shaoormunir/purl/main/Pipeline/labelling_scripts/declared_cookie_labels.csv
To: /content/declared_cookie_labels.csv
33.4MB [00:00, 80.4MB/s]


In [3]:
file_path = '/content/declared_cookie_labels.csv'
declared_cookie_labels_df = pd.read_csv(file_path)
data = declared_cookie_labels_df.iloc[:, 1:]

Preprocessing the Labelled Data


In [4]:
def tokenize_cookie_name(name):
    name = re.sub(r'\d+', '', name)
    tokens = re.split(r'[_\-%\.,\s]', name)
    tokens = [re.sub(r'([a-z])([A-Z])', r'\1 \2', token) for token in tokens]
    tokens_text = ' '.join(tokens)
    return tokens_text

In [5]:
data['tokens'] = data['name'].apply(tokenize_cookie_name)
print(data[['name', 'tokens']])

                  name          tokens
0           ADRUM_BT1        ADRUM BT 
1            ADRUM_BTa       ADRUM BTa
2                  IDE             IDE
3                  TPC             TPC
4             __cfduid          cfduid
...                ...             ...
708883  ~api/analytics  ~api/analytics
708884              ~u              ~u
708885              ~u              ~u
708886         __cf_bm           cf bm
708887            ouid            ouid

[708888 rows x 2 columns]


In [7]:
print("Missing values before preprocessing:\n", data.isna().sum())

Missing values before preprocessing:
 name              0
domain            0
declared_label    0
tokens            0
dtype: int64


In [8]:
data = data.dropna(subset=['declared_label'])

In [9]:
print("Missing values after preprocessing:\n", data.isna().sum())

Missing values after preprocessing:
 name              0
domain            0
declared_label    0
tokens            0
dtype: int64


Data Splitting

In [10]:
data

Unnamed: 0,name,domain,declared_label,tokens
0,ADRUM_BT1,okta-emea.com,1.0,ADRUM BT
1,ADRUM_BTa,okta-emea.com,1.0,ADRUM BTa
2,IDE,doubleclick.net,3.0,IDE
3,TPC,adform.net,3.0,TPC
4,__cfduid,instana.io,2.0,cfduid
...,...,...,...,...
708883,~api/analytics,paper.li,3.0,~api/analytics
708884,~u,mediaalpha.com,3.0,~u
708885,~u,mediaalpha.com,3.0,~u
708886,__cf_bm,marketo.com,0.0,cf bm


In [11]:
df = pd.DataFrame({
    'combined_feature': data['tokens'] + ' ' + data['domain'],
    'declared_label': data['declared_label']
})
df

Unnamed: 0,combined_feature,declared_label
0,ADRUM BT okta-emea.com,1.0
1,ADRUM BTa okta-emea.com,1.0
2,IDE doubleclick.net,3.0
3,TPC adform.net,3.0
4,cfduid instana.io,2.0
...,...,...
708883,~api/analytics paper.li,3.0
708884,~u mediaalpha.com,3.0
708885,~u mediaalpha.com,3.0
708886,cf bm marketo.com,0.0


In [43]:
feature_encoder = LabelEncoder()
df['combined_feature_encoded'] = feature_encoder.fit_transform(df['combined_feature'])
df

Unnamed: 0,combined_feature,declared_label,combined_feature_encoded
0,ADRUM BT okta-emea.com,1.0,1108
1,ADRUM BTa okta-emea.com,1.0,1109
2,IDE doubleclick.net,3.0,1303
3,TPC adform.net,3.0,2498
4,cfduid instana.io,2.0,8
...,...,...,...
708883,~api/analytics paper.li,3.0,420286
708884,~u mediaalpha.com,3.0,420287
708885,~u mediaalpha.com,3.0,420287
708886,cf bm marketo.com,0.0,44


In [38]:
X_train, X_test, y_train, y_test = train_test_split(
    df['combined_feature_encoded'].values.reshape(-1, 1),
    df['declared_label'].values.reshape(-1, 1),
    test_size=0.2,
    random_state=42
)

In [44]:
X_train_og, X_test_og, y_train_og, y_test_og = train_test_split(df['combined_feature_encoded'], df['declared_label'], test_size=0.2, random_state=42)
X_train_og = df['combined_feature_encoded'].values.reshape(-1, 1)
y_train_og = df['declared_label'].values.reshape(-1, 1)
X_test_og = df['combined_feature_encoded'].values.reshape(-1, 1)
y_test_og = df['declared_label'].values.reshape(-1, 1)

In [14]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(df['combined_feature'], df['declared_label'], test_size=0.2, random_state=42)

In [15]:
vectorizer = TfidfVectorizer()
X_train2 = vectorizer.fit_transform(X_train2)
X_test2 = vectorizer.transform(X_test2)

##Initializing Models

Logistic Regression

In [20]:
lr = LogisticRegression(max_iter=1000)

Multinomial Naive Bayes

In [18]:
mnb = MultinomialNB()

Random Forest

In [39]:
rf_classifier = RandomForestClassifier(random_state=42)

XGBoost

In [21]:
xgb_classifier = xgb.XGBClassifier(objective='multi:softmax', num_class=4, random_state=42)

##Training the Models

Logistic Regression

In [23]:
lr.fit(X_train2, y_train2)

Random Forest

In [45]:
rf_classifier.fit(X_train_og, y_train_og)

  return fit_method(estimator, *args, **kwargs)


Naive Bayes

In [26]:
mnb.fit(X_train2, y_train2)

XGBoost

In [48]:
xgb_classifier.fit(X_train_og, y_train_og)

## Predictions on Test Data

In [24]:
y_pred_lr = lr.predict(X_test2)

In [27]:
y_pred_nb = mnb.predict(X_test2)

In [46]:
y_pred_rf = rf_classifier.predict(X_test_og)

In [49]:
y_pred_xgb = xgb_classifier.predict(X_test_og)

##Performance Metrics

In [47]:
accuracy_rf = accuracy_score(y_test_og, y_pred_rf)
report_rf = classification_report(y_test_og, y_pred_rf)
confusion_matrix_rf = confusion_matrix(y_test_og, y_pred_rf)

print(f"Random Forest Accuracy: {accuracy_rf}")
print(f"Random Forest Classification Report:\n{report_rf}")
print(f"Confusion Matrix:\n{confusion_matrix_rf}")

Random Forest Accuracy: 0.9929847755047154
Random Forest Classification Report:
              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99    120563
         1.0       0.99      0.98      0.99     82002
         2.0       0.99      0.99      0.99    150299
         3.0       1.00      1.00      1.00    264365

    accuracy                           0.99    617229
   macro avg       0.99      0.99      0.99    617229
weighted avg       0.99      0.99      0.99    617229

Confusion Matrix:
[[119563    316    346    338]
 [   598  80763    242    399]
 [   218    181 149351    549]
 [   294    333    516 263222]]


In [35]:
accuracy_mnb = accuracy_score(y_test2, y_pred_nb)
report_mnb = classification_report(y_test2, y_pred_nb)
confusion_matrix_mnb = confusion_matrix(y_test2, y_pred_nb)

print(f"Naive Bayes Accuracy: {accuracy_mnb}")
print(f"Naive Bayes Classification Report:\n{report_mnb}")
print(f"Confusion Matrix:\n{confusion_matrix_mnb}")

Naive Bayes Accuracy: 0.8796963854640896
Naive Bayes Classification Report:
              precision    recall  f1-score   support

         0.0       0.83      0.86      0.85     24029
         1.0       0.95      0.70      0.80     16419
         2.0       0.82      0.91      0.86     30195
         3.0       0.92      0.93      0.93     52803

    accuracy                           0.88    123446
   macro avg       0.88      0.85      0.86    123446
weighted avg       0.88      0.88      0.88    123446

Confusion Matrix:
[[20779   220  1711  1319]
 [ 2270 11421  1527  1201]
 [ 1030   165 27438  1562]
 [  962   223  2661 48957]]


In [50]:
accuracy_xgb = accuracy_score(y_test_og, y_pred_xgb)
report_xgb = classification_report(y_test_og, y_pred_xgb)
confusion_matrix_xgb = confusion_matrix(y_test_og, y_pred_xgb)

print(f"XGBoost Accuracy: {accuracy_xgb}")
print(f"XGBoost Classification Report:\n{report_xgb}")
print(f"Confusion Matrix:\n{confusion_matrix_xgb}")

XGBoost Accuracy: 0.7785813693134963
XGBoost Classification Report:
              precision    recall  f1-score   support

         0.0       0.68      0.73      0.70    120563
         1.0       0.81      0.60      0.69     82002
         2.0       0.80      0.78      0.79    150299
         3.0       0.80      0.86      0.83    264365

    accuracy                           0.78    617229
   macro avg       0.78      0.74      0.75    617229
weighted avg       0.78      0.78      0.78    617229

Confusion Matrix:
[[ 87445   3465   8117  21536]
 [ 12042  49285   6155  14520]
 [ 10651   2703 117133  19812]
 [ 18364   5116  14185 226700]]


In [37]:
accuracy_lr = accuracy_score(y_test2, y_pred_lr)
report_lr = classification_report(y_test2, y_pred_lr)
confusion_matrix_lr = confusion_matrix(y_test2, y_pred_lr)

print(f"Logistic Regression Accuracy: {accuracy_lr}")
print(f"Logistic Regression Classification Report:\n{report_lr}")
print(f"Confusion Matrix:\n{confusion_matrix_lr}")

Logistic Regression Accuracy: 0.9137841647359979
Logistic Regression Classification Report:
              precision    recall  f1-score   support

         0.0       0.88      0.89      0.89     24029
         1.0       0.89      0.83      0.86     16419
         2.0       0.90      0.91      0.91     30195
         3.0       0.95      0.95      0.95     52803

    accuracy                           0.91    123446
   macro avg       0.90      0.90      0.90    123446
weighted avg       0.91      0.91      0.91    123446

Confusion Matrix:
[[21505   658  1053   813]
 [ 1336 13595   812   676]
 [  813   451 27598  1333]
 [  864   504  1330 50105]]


##Saving the Model

In [51]:
joblib.dump(feature_encoder, 'feature_encoder.pkl')

['feature_encoder.pkl']

In [52]:
joblib.dump(rf_classifier, 'random_forest_model.pkl')

['random_forest_model.pkl']

## Classifiying New Data

In [53]:
rf_classifier_trained = joblib.load('random_forest_model.pkl')
feature_encoder = joblib.load('feature_encoder.pkl')

In [54]:
file_path_new = '/content/cookies_details.csv'
cookies_df = pd.read_csv(file_path_new)

In [55]:
print("Unique values in 'cookie name':", cookies_df['name'].nunique())
print("Unique values in 'cookie domain':", cookies_df['domain'].nunique())

Unique values in 'cookie name': 23079
Unique values in 'cookie domain': 12603


In [56]:
print("Missing values before preprocessing:\n", cookies_df.isna().sum())

Missing values before preprocessing:
 name      52
domain     0
dtype: int64


In [57]:
cookies_df = cookies_df.dropna(subset=['name'])

In [58]:
print("Missing values after preprocessing:\n", cookies_df.isna().sum())

Missing values after preprocessing:
 name      0
domain    0
dtype: int64


In [59]:
cookies_df['tokens'] = cookies_df['name'].apply(tokenize_cookie_name)
print(cookies_df[['name', 'tokens']])

                           name                 tokens
0                     csrfToken             csrf Token
1                     browserid              browserid
2                       __bid_n                  bid n
3                      ndut_fmt               ndut fmt
4                        ab_jid                 ab jid
...                         ...                    ...
141157                      _ga                     ga
141158            TS51554626027                     TS
141159  LFR_SESSION_STATE_10159     LFR SESSION STATE 
141160                   AWSALB                 AWSALB
141161      CookieScriptConsent  Cookie Script Consent

[141110 rows x 2 columns]


In [64]:
cookies_df['combined_feature'] = cookies_df['tokens'] + ' ' + cookies_df['domain']
cookies_df

Unnamed: 0,name,domain,tokens,combined_feature
0,csrfToken,www.1024tera.com,csrf Token,csrf Token www.1024tera.com
1,browserid,.1024tera.com,browserid,browserid .1024tera.com
2,__bid_n,.1024tera.com,bid n,bid n .1024tera.com
3,ndut_fmt,www.1024tera.com,ndut fmt,ndut fmt www.1024tera.com
4,ab_jid,.ymg-api.terabox.com,ab jid,ab jid .ymg-api.terabox.com
...,...,...,...,...
141157,_ga,.zus.pl,ga,ga .zus.pl
141158,TS51554626027,www.zus.pl,TS,TS www.zus.pl
141159,LFR_SESSION_STATE_10159,www.zus.pl,LFR SESSION STATE,LFR SESSION STATE www.zus.pl
141160,AWSALB,www.zyxel.com,AWSALB,AWSALB www.zyxel.com


In [65]:
cookies_df['combined_feature_encoded'] = feature_encoder.fit_transform(cookies_df['combined_feature'])
cookies_df

Unnamed: 0,name,domain,tokens,combined_feature,combined_feature_encoded
0,csrfToken,www.1024tera.com,csrf Token,csrf Token www.1024tera.com,48447
1,browserid,.1024tera.com,browserid,browserid .1024tera.com,45881
2,__bid_n,.1024tera.com,bid n,bid n .1024tera.com,615
3,ndut_fmt,www.1024tera.com,ndut fmt,ndut fmt www.1024tera.com,55801
4,ab_jid,.ymg-api.terabox.com,ab jid,ab jid .ymg-api.terabox.com,42647
...,...,...,...,...,...
141157,_ga,.zus.pl,ga,ga .zus.pl,15055
141158,TS51554626027,www.zus.pl,TS,TS www.zus.pl,41616
141159,LFR_SESSION_STATE_10159,www.zus.pl,LFR SESSION STATE,LFR SESSION STATE www.zus.pl,38409
141160,AWSALB,www.zyxel.com,AWSALB,AWSALB www.zyxel.com,34941


In [67]:
X_new = cookies_df.combined_feature_encoded.values.reshape(-1, 1)

y_pred_new = rf_classifier_trained.predict(X_new)

In [68]:
cookies_df['label'] = y_pred_new

In [69]:
cookies_df

Unnamed: 0,name,domain,tokens,combined_feature,combined_feature_encoded,label
0,csrfToken,www.1024tera.com,csrf Token,csrf Token www.1024tera.com,48447,3.0
1,browserid,.1024tera.com,browserid,browserid .1024tera.com,45881,3.0
2,__bid_n,.1024tera.com,bid n,bid n .1024tera.com,615,3.0
3,ndut_fmt,www.1024tera.com,ndut fmt,ndut fmt www.1024tera.com,55801,3.0
4,ab_jid,.ymg-api.terabox.com,ab jid,ab jid .ymg-api.terabox.com,42647,0.0
...,...,...,...,...,...,...
141157,_ga,.zus.pl,ga,ga .zus.pl,15055,2.0
141158,TS51554626027,www.zus.pl,TS,TS www.zus.pl,41616,3.0
141159,LFR_SESSION_STATE_10159,www.zus.pl,LFR SESSION STATE,LFR SESSION STATE www.zus.pl,38409,3.0
141160,AWSALB,www.zyxel.com,AWSALB,AWSALB www.zyxel.com,34941,2.0


In [70]:
cookies_df.to_csv('/content/classified_cookies.csv', index=False)