In [None]:
!pip install joblib
!pip install pandas
!pip install scikit-learn  # Install scikit-learn if you haven't already
!pip install xgboost
!pip install tensorflow



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.svm import SVC
import joblib
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input, LSTM, MaxPooling1D, Conv1D, Reshape

In [None]:
# Download the declared cookie-label CSV with gdown into the current working directory
!gdown https://raw.githubusercontent.com/shaoormunir/purl/main/Pipeline/labelling_scripts/declared_cookie_labels.csv

Downloading...
From: https://raw.githubusercontent.com/shaoormunir/purl/main/Pipeline/labelling_scripts/declared_cookie_labels.csv
To: /content/declared_cookie_labels.csv
33.4MB [00:00, 83.5MB/s]


In [None]:
# Load the labelled cookie dataset from disk
file_path = '/content/declared_cookie_labels.csv'  # Replace with your file path
declared_cookie_labels_df = pd.read_csv(file_path)
# Keep every column except the first one (positional slice from column 1 onward)
annotated_data = declared_cookie_labels_df.iloc[:, 1:]

Preprocessing


In [None]:
# Display the annotated cookie data as the cell output
annotated_data

Unnamed: 0,name,domain,declared_label
0,ADRUM_BT1,okta-emea.com,1.0
1,ADRUM_BTa,okta-emea.com,1.0
2,IDE,doubleclick.net,3.0
3,TPC,adform.net,3.0
4,__cfduid,instana.io,2.0
...,...,...,...
708883,~api/analytics,paper.li,3.0
708884,~u,mediaalpha.com,3.0
708885,~u,mediaalpha.com,3.0
708886,__cf_bm,marketo.com,0.0


In [None]:
# Report missing values per column before cleaning
print("Missing values before preprocessing:\n", annotated_data.isna().sum())

# Drop rows that have no label. `.copy()` makes the result an independent frame,
# so the column assignments below act on real data instead of a possible
# view/copy of the original frame (this is what raised SettingWithCopyWarning).
annotated_data = annotated_data.dropna(subset=['declared_label']).copy()

# Report missing values again to confirm the labels are now complete
print("Missing values after preprocessing:\n", annotated_data.isna().sum())

# One LabelEncoder per categorical column; they are kept as separate objects
# because each one is saved and reused on its own later in the notebook.
name_encoder = LabelEncoder()
domain_encoder = LabelEncoder()

# Fit each encoder and encode its column in a single step
name_encoded = name_encoder.fit_transform(annotated_data['name'])
domain_encoded = domain_encoder.fit_transform(annotated_data['domain'])

# Store the integer codes next to the original columns
annotated_data['name_encoded'] = name_encoded
annotated_data['domain_encoded'] = domain_encoded

# The model only sees the integer codes, so drop the raw string columns
encoded_data = annotated_data.drop(columns=['name', 'domain'])

# Save the processed DataFrame if needed
encoded_data.to_csv('/content/processed_data.csv', index=False)

# Check the processed data (last expression, so the notebook actually displays it)
encoded_data.head()

Missing values before preprocessing:
 name                  0
domain                0
declared_label    91659
dtype: int64
Missing values after preprocessing:
 name              0
domain            0
declared_label    0
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  annotated_data['name_encoded'] = name_encoded
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  annotated_data['domain_encoded'] = domain_encoded


In [None]:
# Separate the encoded feature columns (X) from the target column (y)
X, y = encoded_data.drop(columns=['declared_label']), encoded_data['declared_label']

# Print the dimensions of both parts as a sanity check
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

Shape of X: (617229, 2)
Shape of y: (617229,)


## Data Split

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# One-hot encode the train and test labels (4 classes) for the Keras model,
# which is compiled with a categorical cross-entropy loss.
y_train_onehot, y_test_onehot = (
    tf.keras.utils.to_categorical(labels, num_classes=4)
    for labels in (y_train, y_test)
)


## Training

Naive Bayes

In [None]:
# Create a Multinomial Naive Bayes classifier with default settings.
# It is only instantiated here; fitting happens in a later cell.
mnb = MultinomialNB()

Random Forest

In [None]:
# Create the Random Forest classifier (not fitted yet); the seed fixes its randomness
rf_classifier = RandomForestClassifier(random_state=42)  # You can adjust hyperparameters here

XGBoost

In [None]:
# Create the XGBoost classifier (not fitted yet) with a fixed seed
xgb_classifier = xgb.XGBClassifier(objective='multi:softmax', num_class=4, random_state=42)
# objective='multi:softmax' selects multiclass classification; num_class=4 is the number of label classes

In [None]:
# Create the SVM classifier (not fitted yet)
# NOTE(review): a kernel SVC is presumably very slow to fit on a training set of
# this size (hundreds of thousands of rows) — confirm it finishes in practice.
svm_classifier = SVC(kernel='linear', decision_function_shape='ovr', random_state=42)
# kernel='linear' gives a linear SVM; decision_function_shape='ovr' selects a one-vs-rest decision function


Custom Model

In [None]:
# Build the network from a single layer list. The flat feature vector is
# reshaped into a sequence of one-feature timesteps for the LSTM, followed by
# two dense layers with dropout and a 4-way softmax output.
model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Reshape((X_train.shape[1], 1)),
    LSTM(64),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(4, activation='softmax'),
])

# Categorical cross-entropy matches the one-hot encoded labels used for training
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

### Training Model

RF

In [None]:
# Fit the Random Forest on the training split
rf_classifier.fit(X_train, y_train)

NB

In [None]:
# Fit the Multinomial Naive Bayes model on the training split
mnb.fit(X_train, y_train)

XGBoost

In [None]:
# Fit the XGBoost classifier on the training split
xgb_classifier.fit(X_train, y_train)

SVM

In [None]:
# Fit the SVM classifier on the training split
svm_classifier.fit(X_train, y_train)

Custom Model

In [None]:
# Train the Keras model for 10 epochs on the one-hot labels.
# NOTE(review): the test split is passed as validation data, so the reported
# val_* metrics are computed on the same rows used for the final evaluation.
model.fit(X_train, y_train_onehot, epochs=10, batch_size=32, validation_data=(X_test, y_test_onehot))

Epoch 1/10
[1m15431/15431[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 6ms/step - accuracy: 0.4279 - loss: 1.2922 - val_accuracy: 0.4289 - val_loss: 1.2865
Epoch 2/10


## Accuracy

### Predict

In [None]:
# Predict test-set labels with the Naive Bayes model
y_pred_nb = mnb.predict(X_test)

In [None]:
# Predict test-set labels with the Random Forest model
y_pred_rf = rf_classifier.predict(X_test)


In [None]:
# Predict test-set labels with the XGBoost model
y_pred_xgb = xgb_classifier.predict(X_test)

In [None]:
# Predict test-set labels with the SVM model
y_pred_svm = svm_classifier.predict(X_test)

### Score

In [None]:
# Score the Random Forest predictions against the held-out labels
accuracy_rf = accuracy_score(y_test, y_pred_rf)
report_rf = classification_report(y_test, y_pred_rf)

# Overall accuracy first, then the per-class precision/recall/F1 table
print("Random Forest Accuracy:", accuracy_rf)
print("Random Forest Classification Report:", report_rf, sep="\n")

Random Forest Accuracy: 0.8985629344004666
Random Forest Classification Report:
              precision    recall  f1-score   support

         0.0       0.86      0.87      0.87     24029
         1.0       0.86      0.82      0.84     16419
         2.0       0.89      0.89      0.89     30195
         3.0       0.94      0.94      0.94     52803

    accuracy                           0.90    123446
   macro avg       0.88      0.88      0.88    123446
weighted avg       0.90      0.90      0.90    123446



In [None]:
# Score the Naive Bayes predictions against the held-out labels
accuracy = accuracy_score(y_test, y_pred_nb)
report = classification_report(y_test, y_pred_nb)

# Overall accuracy first, then the per-class precision/recall/F1 table
print("NB Accuracy:", accuracy)
print("Classification Report:", report, sep="\n")

Accuracy: 0.2793691168608136
Classification Report:
              precision    recall  f1-score   support

         0.0       0.23      0.05      0.08     24029
         1.0       0.13      0.49      0.21     16419
         2.0       0.38      0.03      0.06     30195
         3.0       0.44      0.46      0.45     52803

    accuracy                           0.28    123446
   macro avg       0.29      0.26      0.20    123446
weighted avg       0.34      0.28      0.25    123446



In [None]:
# Score the XGBoost predictions against the held-out labels
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
report_xgb = classification_report(y_test, y_pred_xgb)

# Overall accuracy first, then the per-class precision/recall/F1 table
print("XGBoost Accuracy:", accuracy_xgb)
print("XGBoost Classification Report:", report_xgb, sep="\n")

XGBoost Accuracy: 0.8283946016881876
XGBoost Classification Report:
              precision    recall  f1-score   support

         0.0       0.71      0.82      0.76     24029
         1.0       0.84      0.67      0.75     16419
         2.0       0.84      0.81      0.83     30195
         3.0       0.88      0.89      0.88     52803

    accuracy                           0.83    123446
   macro avg       0.82      0.80      0.80    123446
weighted avg       0.83      0.83      0.83    123446



In [None]:
# Evaluate the Keras model on the test split; evaluate() returns the loss
# followed by the compiled metric (accuracy).
loss, accuracy = model.evaluate(X_test, y_test_onehot)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

## Save Model

In [None]:
# Persist both fitted LabelEncoders so new data can be encoded with the same
# name/domain -> integer mapping that the models were trained on.
joblib.dump(name_encoder, 'name_encoder.pkl')
joblib.dump(domain_encoder, 'domain_encoder.pkl')

In [None]:
# Save the trained Multinomial Naive Bayes model to a file
joblib.dump(mnb, 'multinomial_nb_model.pkl')

In [None]:
# Save the trained Random Forest model to a file (joblib.dump returns the list of written paths)
joblib.dump(rf_classifier, 'random_forest_model.pkl')

['random_forest_model.pkl']

In [None]:
# Save the trained XGBoost model to a file via joblib
joblib.dump(xgb_classifier, 'xgboost_model.pkl')

In [None]:
# Save the trained SVM model to a file via joblib
joblib.dump(svm_classifier, 'svm_model.pkl')

In [None]:
# Save the trained TensorFlow model to a file
# NOTE(review): the '.h5' suffix presumably selects the legacy HDF5 format;
# newer Keras versions recommend the '.keras' format — confirm before changing.
model.save('tensorflow_model.h5')

Classifying New Data

In [None]:
# Load the trained Naive Bayes model saved earlier.
# joblib.load unpickles the file, so only load artifacts you produced yourself.
mnb = joblib.load('multinomial_nb_model.pkl')

# Load the LabelEncoders that were fitted on the training data
name_encoder = joblib.load('name_encoder.pkl')
domain_encoder = joblib.load('domain_encoder.pkl')

In [None]:
# Load the new, unlabelled cookie dataset to classify
file_path_new = '/content/cookies_details.csv'  # Replace with your file path
cookie_details_df = pd.read_csv(file_path_new)

In [None]:
# Report how many distinct cookie names and domains the new dataset contains
print("Unique values in 'cookie name':", cookie_details_df['cookie name'].nunique())
print("Unique values in 'cookie domain':", cookie_details_df['cookie domain'].nunique())


Unique values in 'cookie name': 23079
Unique values in 'cookie domain': 12603


In [None]:
import numpy as np

# Check the actual column names
print("Column names in the dataset:\n", cookie_details_df.columns)

# Check for missing values
print("Missing values before preprocessing:\n", cookie_details_df.isna().sum())

# Drop rows with missing values in 'cookie name' or 'cookie domain'
cookie_details_df = cookie_details_df.dropna(subset=['cookie name', 'cookie domain'])

# Check for missing values after handling
print("Missing values after preprocessing:\n", cookie_details_df.isna().sum())

# IMPORTANT: `name_encoder` and `domain_encoder` must be the encoders fitted on
# the TRAINING data. They are deliberately NOT re-created or re-fitted here:
# fitting fresh encoders on the new data would assign different integer codes
# to the same strings, so the model would receive features that do not match
# what it was trained on.


def encode_with_unknown(encoder, values):
    """Encode `values` with an already-fitted LabelEncoder.

    Parameters
    ----------
    encoder : fitted LabelEncoder
        Its `classes_` attribute defines the known labels; the code of a label
        is its position in `classes_`.
    values : iterable
        Raw labels to encode.

    Returns
    -------
    numpy.ndarray of int64
        The integer code for each value, or -1 where the value is not among
        the encoder's known classes.
    """
    # Build the label -> code mapping once; a dict lookup per value replaces
    # scanning the class list and calling `transform` for every single row.
    code_by_label = {label: code for code, label in enumerate(encoder.classes_)}
    return np.array([code_by_label.get(val, -1) for val in values], dtype=np.int64)


# Encode the 'cookie name' and 'cookie domain' columns using the previously fitted LabelEncoders
name_codes = encode_with_unknown(name_encoder, cookie_details_df['cookie name'])
domain_codes = encode_with_unknown(domain_encoder, cookie_details_df['cookie domain'])

# Rows containing a name or domain the encoders never saw get the code -1.
# Such rows cannot be scored meaningfully (the model has no feature value for
# them) and -1 cannot be decoded back by `inverse_transform`, so they are
# reported and removed before prediction.
seen_mask = (name_codes >= 0) & (domain_codes >= 0)
print(
    "Rows dropped because the cookie name or domain was not seen during training:",
    int((~seen_mask).sum()), "of", len(seen_mask),
)

cookie_details_df = cookie_details_df.loc[seen_mask].copy()
cookie_details_df['name_encoded'] = name_codes[seen_mask]
cookie_details_df['domain_encoded'] = domain_codes[seen_mask]

# Drop the original 'cookie name' and 'cookie domain' columns
cookie_details_df = cookie_details_df.drop(columns=['cookie name', 'cookie domain'])

# Drop the unnecessary index column if present
cookie_details_df = cookie_details_df.drop(columns=['Unnamed: 0'], errors='ignore')

# Check the processed data
print("Processed data preview:\n", cookie_details_df.head())

# Save the processed DataFrame if needed
cookie_details_df.to_csv('/content/processed_cookie_details.csv', index=False)


Column names in the dataset:
 Index(['cookie name', 'cookie domain'], dtype='object')
Missing values before preprocessing:
 cookie name      52
cookie domain     0
dtype: int64
Missing values after preprocessing:
 cookie name      0
cookie domain    0
dtype: int64
Processed data preview:
    name_encoded  domain_encoded
0         14987           10051
1         14039               9
2          4052               9
3         18632           10051
4         12817            7291


In [None]:
# Predict labels for the new data with the Naive Bayes model.
# 'declared_label' is dropped defensively; errors='ignore' makes this a no-op when the column is absent.
X_new = cookie_details_df.drop(columns=['declared_label'], errors='ignore')  # Ensure 'declared_label' is not present in this dataset
y_pred_new = mnb.predict(X_new)

In [None]:
# Decode the integer codes back into readable cookie names and domains,
# then attach the predicted label for each row.
cookie_details_df['cookie name'] = name_encoder.inverse_transform(cookie_details_df['name_encoded'])
cookie_details_df['cookie domain'] = domain_encoder.inverse_transform(cookie_details_df['domain_encoded'])
cookie_details_df['predicted_label'] = y_pred_new

In [None]:
# Save the decoded rows and their predicted labels to a new CSV file (row index omitted)
cookie_details_df.to_csv('/content/predicted_cookie_details.csv', index=False)