In [9]:
import pandas as pd
from pathlib import Path

# Data directory: the "data" folder that sits next to the notebook's working
# directory (i.e. under the parent of the current working directory).
# DATA_DIR stays a string with its trailing slash so any other cell that
# formats or concatenates onto it keeps working unchanged.
DATA_DIR = f"{Path().resolve(strict=True).parent}/data/"

# Build the file path with pathlib instead of a second f-string: DATA_DIR
# already ends in "/", so f"{DATA_DIR}/interim/..." produced a doubled
# separator (".../data//interim/...").
df = pd.read_parquet(Path(DATA_DIR) / "interim" / "country_codes_as_int.parquet")

In [10]:
# Narrow the frame to the two columns used below: the raw user-agent string
# and the boolean bot flag.
df = df.loc[:, ["http_user_agent", "is_bot"]]
df.head()

Unnamed: 0,http_user_agent,is_bot
0,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,False
1,Mozilla/5.0 (iPhone; CPU iPhone OS 15_3_1 like...,False
2,Mozilla/5.0 (iPhone; CPU iPhone OS 15_3_1 like...,False
3,Mozilla/5.0 (iPhone; CPU iPhone OS 15_3_1 like...,False
4,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,False


In [11]:
# First user agent whose is_bot flag is True; kept as a known-positive sample.
bot = df.loc[df["is_bot"].eq(True), "http_user_agent"].iloc[0]

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
# 80/20 hold-out split of user-agent strings (X) and bot flags (Y), with a
# fixed seed so the split is repeatable.
# NOTE(review): the split is not stratified on is_bot — confirm that is intended.
X_train, X_test, Y_train, Y_test = train_test_split(
    df["http_user_agent"],
    df["is_bot"],
    test_size=0.2,
    random_state=3,
)

In [14]:
from sklearn.base import clone
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [15]:
# Text-feature steps used by the pipelines below:
#   vect  - token counts over unigrams and bigrams
#   tfidf - TF-IDF re-weighting of those counts
vect = CountVectorizer(ngram_range=(1, 2))
tfidf = TfidfTransformer(use_idf=True)

In [16]:
# SGD-trained linear classifier (hinge loss, L2 penalty) on TF-IDF weighted
# n-gram counts of the user-agent string.
#
# Pipeline does not copy its steps, and fit()/set_params() mutate them in
# place. Cloning gives this pipeline its own unfitted vectorizer/transformer
# with the same hyper-parameters as `vect`/`tfidf`, so fitting or
# re-parameterising another pipeline built from those objects cannot silently
# change this one.
text_clf_sdg = Pipeline([
    ('vect', clone(vect)),
    ('tfidf', clone(tfidf)),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                          alpha=1e-3, random_state=42)),
])
text_clf_sdg = text_clf_sdg.fit(X_train, Y_train)

In [17]:

# Multinomial naive Bayes on TF-IDF weighted n-gram counts of the user-agent
# string.
#
# Pipeline does not copy its steps, and fit()/set_params() mutate them in
# place. Cloning gives this pipeline its own unfitted vectorizer/transformer
# with the same hyper-parameters as `vect`/`tfidf`, so fitting or
# re-parameterising another pipeline built from those objects cannot silently
# change this one.
text_clf_mnb = Pipeline([
    ('vect', clone(vect)),
    ('tfidf', clone(tfidf)),
    ('clf', MultinomialNB(alpha=1e-3)),
])
text_clf_mnb = text_clf_mnb.fit(X_train, Y_train)

In [18]:
from sklearn.model_selection import GridSearchCV
# Grid search over n-gram range, IDF weighting and the classifier's alpha for
# each pipeline, using all available cores (n_jobs=-1).
# NOTE(review): in the cells visible here the scores are computed from
# text_clf_sdg / text_clf_mnb, not from these tuned searches — confirm intent.
gs_clf_sdg = GridSearchCV(
    text_clf_sdg,
    {
        'vect__ngram_range': [(1, 1), (1, 2)],
        'tfidf__use_idf': (True, False),
        'clf__alpha': (1e-2, 1e-3),
    },
    n_jobs=-1,
)
gs_clf_mnb = GridSearchCV(
    text_clf_mnb,
    {
        'vect__ngram_range': [(1, 1), (1, 2)],
        'tfidf__use_idf': (True, False),
        'clf__alpha': (1e-2, 1e-3),
    },
    n_jobs=-1,
)

# Fit order kept as in the original cell: naive Bayes first, then SGD.
gs_clf_mnb = gs_clf_mnb.fit(X_train, Y_train)
gs_clf_sdg = gs_clf_sdg.fit(X_train, Y_train)

In [19]:
import numpy as np

In [20]:
# Hold-out accuracy of the SGD pipeline: fraction of test predictions that
# match the true labels.
predicted_clf_sdg = text_clf_sdg.predict(X_test)
(predicted_clf_sdg == Y_test).mean()

0.9913504590475329

In [21]:
# Same hold-out accuracy, this time via the pipeline's own score() method.
text_clf_sdg.score(X=X_test, y=Y_test)

0.9913504590475329

In [22]:
# Hold-out accuracy of the naive Bayes pipeline: fraction of test predictions
# that match the true labels.
predicted_clf_mnb = text_clf_mnb.predict(X_test)
(predicted_clf_mnb == Y_test).mean()

0.9793202478404414

In [23]:
# Same hold-out accuracy, this time via the pipeline's own score() method.
text_clf_mnb.score(X=X_test, y=Y_test)

0.9793202478404414

In [24]:
# Two sample user agents for a manual spot check: the first two rows by
# position, then show the one labelled 1.
t = df["http_user_agent"].iloc[:2]
t.loc[1]

'Mozilla/5.0 (iPhone; CPU iPhone OS 15_3_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.3 Mobile/15E148 Safari/604.1'

In [25]:
# Run both fitted pipelines on the sample user agents in `t` and print the
# predictions side by side.
predicted_clf_mnb = text_clf_mnb.predict(t)
predicted_clf_sdg = text_clf_sdg.predict(t)
print("predicted_clf_mnb:", predicted_clf_mnb, "predicted_clf_sdg:", predicted_clf_sdg)

predicted_clf_mnb: [False False] predicted_clf_sdg: [ True False]


In [26]:
# Run both fitted pipelines on the known bot user agent (wrapped in a list,
# since predict() expects an iterable of documents) and print the predictions.
predicted_clf_mnb = text_clf_mnb.predict([bot])
predicted_clf_sdg = text_clf_sdg.predict([bot])
print("predicted_clf_mnb:", predicted_clf_mnb, "predicted_clf_sdg:", predicted_clf_sdg)

predicted_clf_mnb: [ True] predicted_clf_sdg: [ True]


Test and quantify the results with a confusion matrix