In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv("/kaggle/input/dark-patterns-on-ecommerce-platforms/dark_patterns.csv")
df

Unnamed: 0,Pattern String,Comment,Pattern Category,Pattern Type,Where in website?,Deceptive?,Website Page
0,Collin P. from Grandview Missouri just bought ...,Periodic popup,Social Proof,Activity Notification,Product Page,No,https://alaindupetit.com/collections/all-suits...
1,"Faith in Glendale, United States purchased a C...",Periodic popup,Social Proof,Activity Notification,Product Page,No,https://bonescoffee.com/products/strawberry-ch...
2,Sharmeen Atif From Karachi just bought Stylish...,Periodic popup,Social Proof,Activity Notification,Product Page,No,https://brandsego.com/collections/under-rs-99/...
3,9 people are viewing this.,Product detail,Social Proof,Activity Notification,Product Page,No,https://brightechshop.com/products/ambience-so...
4,5338 people viewed this in the last hour,Periodic popup,Social Proof,Activity Notification,Product Page,No,https://bumpboxes.com/
...,...,...,...,...,...,...,...
1813,$132.90 $99.00,Website adds free items to show discount,Misdirection,Visual Interference,Cart Page,No,https://www.planetofthevapes.com/products/plan...
1814,This offer is only VALID if you add to cart now!,Popup asking you to buy more,Misdirection,Visual Interference,Product Page,No,https://www.rockymountainoils.com/single-essen...
1815,,Deterministic draw. Always give you the prize ...,Misdirection,Visual Interference,Product Page,Yes,https://www.sammydress.com/
1816,,Shows you prices in the popup based on your cu...,Misdirection,Visual Interference,Product Page,No,https://www.shoedazzle.com/products/FEELIN-A-L...


In [3]:
features = df[['Comment', 'Where in website?']]
target_category = df['Pattern Category']
target_type = df['Pattern Type']

In [4]:
df['Pattern String'] = df['Pattern String'].replace(np.nan, 'Graphical')

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
text_features = tfidf_vectorizer.fit_transform(df['Pattern String'])

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
column_transformer = ColumnTransformer(
    transformers=[
        ('comment_and_website', OneHotEncoder(sparse =False), ['Comment', 'Where in website?'])
    ],
    remainder='passthrough'
)

categorical_features = column_transformer.fit_transform(features)



In [7]:
X = pd.concat([pd.DataFrame(text_features.toarray(), columns=tfidf_vectorizer.get_feature_names_out()),
               pd.DataFrame(categorical_features, columns=column_transformer.get_feature_names_out(['Comment', 'Where in website?'])).reset_index(drop=True)], axis=1)

In [8]:
#X = pd.concat([pd.DataFrame(text_features.toarray(), columns=tfidf_vectorizer.get_feature_names_out()),
              # features.reset_index(drop=True)], axis=1)

In [9]:
X

Unnamed: 0,00,0000,0011days,0088hours,00am,00days00hours14minutes21seconds,00days00hours47minutes36seconds,00days02hrs14mins1211sec,00days07hours09minutes39seconds,00days17hours12mins13secs,...,comment_and_website__Comment_You end up with a renewing ifit membership which is not disclosed. It only states that you receive a 1-year iFit membership,comment_and_website__Comment_You get free shipping over 75$ even without entering data in the popup,"comment_and_website__Comment_popup, below nav bar and product detail",comment_and_website__Comment_sample20off valid even after timer expires. Right side popup. Popup goes away upon expiration.,comment_and_website__Comment_nan,comment_and_website__Where in website?_Cart Page,comment_and_website__Where in website?_Checkout Process,comment_and_website__Where in website?_Home Page,comment_and_website__Where in website?_Other Page,comment_and_website__Where in website?_Product Page
0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1813,0.406475,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1814,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1815,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1816,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [10]:
from sklearn.preprocessing import LabelEncoder
label_category = LabelEncoder()
label_type = LabelEncoder()

y_category = label_category.fit_transform(target_category)
y_type = label_type.fit_transform(target_type)

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_category_train, y_category_test, y_type_train, y_type_test = train_test_split(
    X, y_category, y_type, test_size=0.3, random_state=42
)

In [12]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
#classifier = MultiOutputClassifier(LogisticRegression(max_iter=1000)) 
classifier = MultiOutputClassifier(RandomForestClassifier())
classifier.fit(X_train, pd.DataFrame({'category': y_category_train, 'type': y_type_train}))

In [13]:
y_category_pred, y_type_pred = zip(*classifier.predict(X_test))


In [14]:
#y_category_pred = label_category.inverse_transform(y_category_pred)
#y_type_pred = label_type.inverse_transform(y_type_pred)

In [15]:
from sklearn.metrics import accuracy_score
accuracy_category = accuracy_score(y_category_test, y_category_pred)
accuracy_type = accuracy_score(y_type_test, y_type_pred)



In [16]:
print(f'Accuracy - Category: {accuracy_category}')
print(f'Accuracy - Type: {accuracy_type}')

Accuracy - Category: 0.9267399267399268
Accuracy - Type: 0.9047619047619048


In [17]:
from sklearn.metrics import classification_report
print(classification_report(y_category_test, y_category_pred))
print(classification_report(y_type_test, y_type_pred))


              precision    recall  f1-score   support

           1       0.92      0.78      0.85        92
           2       1.00      0.91      0.95        11
           3       0.98      0.98      0.98       197
           4       0.50      0.10      0.17        10
           5       0.94      0.98      0.96        90
           6       0.85      0.97      0.91       146

    accuracy                           0.93       546
   macro avg       0.87      0.79      0.80       546
weighted avg       0.92      0.93      0.92       546

              precision    recall  f1-score   support

           0       0.94      0.99      0.97        86
           1       0.93      1.00      0.96        53
           2       0.80      0.94      0.87       122
           4       1.00      0.91      0.95        11
           5       1.00      0.50      0.67         2
           6       0.50      0.20      0.29         5
           7       0.92      0.86      0.89        14
           8       0.88 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
