In [26]:
# !IMPORTANT - to run this notebook download first "corpus" dataset from 
# dataset https://gist.github.com/kunalj101/ad1d9c58d338e20d09ff26bcc06c4235 and put it in same folder as for this notebook
# then import wiki-news-300d-1M.vec from https://www.kaggle.com/datasets/facebook/fasttext-wikinews

In [27]:
#import libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn import ensemble
from keras import layers, models, optimizers
import numpy
import re

trainDF = pd.read_csv('data/indexing_bots.csv', sep='\t')
trainDF

Unnamed: 0,count,date,ip,user_agent,is_indexing_bot
0,1,2022-01-01,183.136.225.56,Baiduspider+(+http://www.baidu.com/search/spid...,1
1,322967,2022-01-01,176.10.98.242,CompanyHouse-PageFetcher/1.0,1
2,665,2022-01-01,66.249.64.4,Googlebot-Image/1.0,1
3,436,2022-01-01,66.249.64.5,Googlebot-Image/1.0,1
4,315,2022-01-01,66.249.64.6,Googlebot-Image/1.0,1
...,...,...,...,...,...
1841,2,2022-01-02,2a01:c23:7025:ec00:3ce7:e2c4:3cb2:464f,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,0
1842,2,2022-01-02,2a02:908:616:1b80:1009:ef9b:1073:a888,Mozilla/5.0 (iPhone; CPU iPhone OS 15_1 like M...,0
1843,1,2022-01-02,2a02:6d40:34c0:9401:54a8:698d:a68c:d9ab,Mozilla/5.0 (iPhone; CPU iPhone OS 15_1 like M...,0
1844,2,2022-01-02,2a02:908:e51:d1a0:5045:44d5:a77f:bf56,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7...,0


In [28]:
# analyze dataset
# check number of words in each review
train = trainDF.copy()
train['word_count'] = train['user_agent'].apply(lambda x: len(str(x).split(" ")))
print(train[['user_agent','word_count']].head())
print(train['word_count'].max())
print(train['word_count'].min())
print(train['word_count'].mean())

                                          user_agent  word_count
0  Baiduspider+(+http://www.baidu.com/search/spid...           1
1                       CompanyHouse-PageFetcher/1.0           1
2                                Googlebot-Image/1.0           1
3                                Googlebot-Image/1.0           1
4                                Googlebot-Image/1.0           1
20
1
10.394907908992415


In [29]:
# check character number
train['char_count'] = train['user_agent'].str.len() ## this also includes spaces
train[['user_agent','char_count']].head()

Unnamed: 0,user_agent,char_count
0,Baiduspider+(+http://www.baidu.com/search/spid...,161.0
1,CompanyHouse-PageFetcher/1.0,28.0
2,Googlebot-Image/1.0,19.0
3,Googlebot-Image/1.0,19.0
4,Googlebot-Image/1.0,19.0


In [35]:
train['user_agent'] = train['user_agent'].apply(lambda x: str(x))
# check average word length
def avg_word(sentence):
  words = sentence.split()
  return (sum(len(word) for word in words)/len(words))

train['avg_word'] = train['user_agent'].apply(lambda x: avg_word(x))
train[['user_agent','avg_word']].head()

Unnamed: 0,user_agent,avg_word
0,Baiduspider+(+http://www.baidu.com/search/spid...,161.0
1,CompanyHouse-PageFetcher/1.0,28.0
2,Googlebot-Image/1.0,19.0
3,Googlebot-Image/1.0,19.0
4,Googlebot-Image/1.0,19.0


In [36]:
# check numeric characters number
train['numerics'] = train['user_agent'].apply(lambda x: len([x for x in x.split(' ') if x.isnumeric()]))
print(train[['user_agent','numerics']].head())
print(train['numerics'].max())
print(train['numerics'].min())
print(train['numerics'].mean())

                                          user_agent  numerics
0  Baiduspider+(+http://www.baidu.com/search/spid...         0
1                       CompanyHouse-PageFetcher/1.0         0
2                                Googlebot-Image/1.0         0
3                                Googlebot-Image/1.0         0
4                                Googlebot-Image/1.0         0
1
0
0.0021668472372697724


In [37]:
# check number of upper case letters
train['upper'] = train['user_agent'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
print(train[['user_agent','upper']].head())
print(train['upper'].max())
print(train['upper'].min())
print(train['upper'].mean())

                                          user_agent  upper
0  Baiduspider+(+http://www.baidu.com/search/spid...      0
1                       CompanyHouse-PageFetcher/1.0      0
2                                Googlebot-Image/1.0      0
3                                Googlebot-Image/1.0      0
4                                Googlebot-Image/1.0      0
7
0
2.26056338028169


In [38]:
# check number of punctuation characters
train['punctuation'] = train['user_agent'].apply(lambda x: len(re.findall(r'[\.,?!;:]', x)))
print(train['punctuation'].max())
print(train['punctuation'].min())
print(train['punctuation'].mean())

27
0
7.399783315276273


In [41]:
# normalize text
# apply lowercase
trainDFRaw = trainDF.copy()
trainDF['user_agent'] = trainDF['user_agent'].apply(lambda x: " ".join(x.lower() for x in str(x).split()))
trainDF['user_agent'].head()

0    baiduspider http www baidu com search spider h...
1                         companyhouse pagefetcher 1 0
2                                  googlebot image 1 0
3                                  googlebot image 1 0
4                                  googlebot image 1 0
Name: user_agent, dtype: object

In [42]:
# remove special characters
trainDF['user_agent'] = trainDF['user_agent'].str.replace('[^\w\s]',' ')
trainDF['user_agent'].head()

  trainDF['user_agent'] = trainDF['user_agent'].str.replace('[^\w\s]',' ')


0    baiduspider http www baidu com search spider h...
1                         companyhouse pagefetcher 1 0
2                                  googlebot image 1 0
3                                  googlebot image 1 0
4                                  googlebot image 1 0
Name: user_agent, dtype: object

In [43]:
# prepare model
# split data to train and test set
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(trainDF['user_agent'], trainDF['is_indexing_bot'])

# encode categorical values
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

print(valid_y)

[1 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 1 0 0 1 0 1 1 0 0
 1 1 0 1 1 1 0 0 1 1 1 0 1 1 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0
 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 1 0 0 0
 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 1 1 0 0 0 1 0 1 0 0 0 0 0
 0 1 1 0 0 0 0 1 0 0 0 0 0 1 1 1 1 0 1 0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 1 0 0
 0 0 1 0 0 0 0 1 1 0 1 0 0 1 1 0 0 0 1 1 1 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 1
 0 0 0 1 0 1 0 0 1 0 0 1 0 0 1 1 0 1 0 1 1 1 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0
 0 1 0 0 0 1 0 0 1 0 0 0 0 1 0 0 1 1 1 1 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0 1 0
 1 0 1 0 0 0 1 1 1 1 0 0 0 1 0 1 1 0 1 0 0 1 1 0 1 1 1 1 0 0 1 0 1 0 0 0 0
 1 0 0 1 1 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
 0 0 1 0 0 0 1 0 1 0 1 0 1 0 0 1 0 0 0 1 1 0 0 1 1 0 1 1 1 0 0 0 1 0 0 1 1
 1 1 1 1 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 1 0 1 1 0 0 0 1
 0 1 0 0 0 1 1 0 0 0 0 1 0 1 1 0 0 0]


In [48]:
train_x[100]


'mozilla 5 0 compatible googlebot 2 1 http www google com bot html'

In [47]:
# encode text to TF-IDF numeric vectors
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(trainDF['user_agent'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)


In [61]:
print(xtrain_tfidf[100])
len(tfidf_vect.get_feature_names_out())

  (0, 561)	0.1591449041273468
  (0, 506)	0.11736410662411167
  (0, 477)	0.15902532626640176
  (0, 449)	0.09066281615737472
  (0, 437)	0.2877033519237585
  (0, 436)	0.1591449041273468
  (0, 415)	0.1171544026990769
  (0, 404)	0.11729414257710087
  (0, 386)	0.2877033519237585
  (0, 353)	0.10926106377202364
  (0, 291)	0.18159666490288312
  (0, 270)	0.11729414257710087
  (0, 217)	0.20087939165084964
  (0, 174)	0.35419881740457276
  (0, 167)	0.09066281615737472
  (0, 164)	0.20108413542470366
  (0, 131)	0.3635129400147
  (0, 31)	0.3159365475507318
  (0, 26)	0.4195823909034112
  (0, 0)	0.16700018691103335


576

In [62]:
# universal method for model training

def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # train model
    classifier.fit(feature_vector_train, label)
    
    # generate predictions for test set
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    # evaluate model
    scores = list(metrics.precision_recall_fscore_support(predictions, valid_y))
    score_vals = [
        scores[0][0],
        scores[1][0],
        scores[2][0]
    ]
    score_vals.append(metrics.accuracy_score(predictions, valid_y))
    return score_vals

In [69]:
# MODEL - Random Forest Tree 
import time

ut = time.time()
print(len(train_x))
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf, train_y, xvalid_tfidf)
accuracy_compare = accuracy
print ("RF, WordLevel TF-IDF: ", accuracy)
print(time.time() - ut)

1384
RF, WordLevel TF-IDF:  [0.9935275080906149, 1.0, 0.9967532467532468, 0.9956709956709957]
0.13626670837402344
