In [3]:
import pandas as pd
import numpy as np

# Read and explore data

In [4]:
# Load the raw click log; fields are separated by the \u0001 control character.
# NOTE(review): the preview shows an 'Unnamed: 0' first column, which looks like
# a saved index -- confirm whether it should be read with index_col=0.
CLICKDATA_PATH = 'data/clickdata.csv'
FIELD_SEPARATOR = '\u0001'
df = pd.read_csv(CLICKDATA_PATH, sep=FIELD_SEPARATOR)

In [5]:
# Preview the first five rows to get a feel for the columns
df.head(n=5)

Unnamed: 0,epoch_ms,session_id,country_by_ip_address,region_by_ip_address,url_without_parameters,referrer_without_parameters,visitor_recognition_type,useragent,ua_device_class,ua_device_name,...,ua_facebook_device_name,ua_facebook_device_version,ua_facebook_operating_system_name,ua_facebook_operating_system_version,ua_hacker_attackVector,ua_hacker_toolkit,ua_ie_compatibility_version_major,ua_ie_compatibility_name_version_major,ua_carrier,ua_agent_class
0,1520280001034,be73c8d1b836170a21529a1b23140f8e,US,CA,https://www.bol.com/nl/l/nederlandstalige-kuns...,,ANONYMOUS,Mozilla/5.0 (compatible; Googlebot/2.1; +http:...,Robot,Google,...,,,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Robot
1,1520280001590,c24c6637ed7dcbe19ad64056184212a7,US,CA,https://www.bol.com/nl/l/italiaans-natuur-wete...,,ANONYMOUS,Mozilla/5.0 (compatible; Googlebot/2.1; +http:...,Robot,Google,...,,,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Robot
2,1520280002397,ee391655f5680a7bfae0019450aed396,IT,LI,https://www.bol.com/nl/p/nespresso-magimix-ini...,https://www.bol.com/nl/p/nespresso-magimix-ini...,ANONYMOUS,Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47....,Desktop,Linux Desktop,...,,,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Browser
3,1520280002598,f8c8a696dd37ca88233b2df096afa97f,US,CA,https://www.bol.com/nl/l/nieuwe-engelstalige-o...,,ANONYMOUS,Mozilla/5.0 (compatible; Googlebot/2.1; +http:...,Robot,Google,...,,,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Robot
4,1520280004428,f8b0c06747b7dd1d53c0932306bd04d6,US,CA,https://www.bol.com/nl/l/nieuwe-actie-avontuur...,,ANONYMOUS,Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Bu...,Robot Mobile,Google,...,,,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Robot Mobile


Each row in 'df' contains a page request in a session.

CSV column definitions:
* epoch_ms: epoch in milliseconds
* session_id: session identifier
* country_by_ip_address: estimated country based on GeoIP lookup
* region_by_ip_address: estimated region based on GeoIP lookup
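* url_without_parameters: URL of the requested page, with the query parameters removed
* referrer_without_parameters: URL of the referring page (empty when there is none), with the query parameters removed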
* visitor_recognition_type: ANONYMOUS, RECOGNIZED (by cookie) or LOGGEDIN
* useragent: the useragent string sent by the webbrowser
* ua_...: fields parsed from the useragent string (with yauaa) and exposed with a ua_ prefix; see https://github.com/nielsbasjes/yauaa#values-explained
* ua_agent_class: the class determined by yauaa


In [6]:
# Normalise the different encodings of "missing" to the empty string.
# NaN cells: fillna is the documented way to substitute missing values
# (replace(..., regex=True) expects a string pattern, which NaN is not).
df = df.fillna('')
# 'Unknown' placeholders: regex=True makes this a pattern substitution, so the
# text 'Unknown' is also removed when it occurs inside a longer string value.
df = df.replace('Unknown', '', regex=True)

In [7]:
# Show the row index of the frame; for a default RangeIndex the stop value
# equals the number of rows.
df.index

RangeIndex(start=0, stop=59782, step=1)

In [83]:
# Let's look at some of the columns, starting with the distinct
# visitor recognition types
df.loc[:, 'visitor_recognition_type'].unique()

array(['ANONYMOUS', 'LOGGEDIN', 'RECOGNIZED'], dtype=object)

In [84]:
# Distinct GeoIP country codes, in order of first appearance
pd.unique(df['country_by_ip_address'])

array(['US', 'IT', 'NL', 'BE', '', 'UA', 'FR', 'DE', 'PL', 'CN', 'IE',
       'RU', 'GB', 'AT', 'HU', 'JP', 'CA', 'PT', 'ES', 'CH', 'LT', 'ID',
       'IN', 'TR', 'IR', 'MY', 'NZ', 'AU', 'TH', 'BD', 'QA', 'CZ', 'VN',
       'MN', 'IL', 'FI', 'AM', 'DK', 'SR', 'GR', 'SE', 'LV', 'PK', 'LU',
       'MA', 'MD', 'BG', 'BR', 'HR', 'AR', 'AL', 'MK', 'GH', 'PY', 'NO',
       'RO', 'BO', 'ZA', 'SO', 'MC', 'MX', 'KR', 'DO', 'CW', 'SK', 'KG'],
      dtype=object)

In [85]:
# The classes of most interest are 'Robot' and 'Browser' (i.e. not a robot)
agent_classes = df['ua_agent_class'].unique()
print(agent_classes)

['Robot' 'Browser' 'Robot Mobile' 'Browser Webview' 'Hacker' 'Special'
 'Mobile App' 'Cloud Application']


In [86]:
# Coarsen the agent classes by folding detailed variants into their parent class
df['ua_agent_class'] = (
    df['ua_agent_class']
    .str.replace('Browser Webview', 'Browser')  # human variants -> 'Browser'
    .str.replace('Robot Mobile', 'Robot')       # non-human variants -> 'Robot'
)
print(df['ua_agent_class'].unique())

['Robot' 'Browser' 'Hacker' 'Special' 'Mobile App' 'Cloud Application']


# Train a model

In [71]:
import pandas as pd
# Target: the (coarsened) agent class of each request
y = df['ua_agent_class']
# Features: one-hot encode a few categorical columns; drop_first=True leaves
# out the first level of every column
feature_columns = ['country_by_ip_address', 'region_by_ip_address', 'visitor_recognition_type']
X = pd.get_dummies(data=df[feature_columns], drop_first=True)

In [72]:
# Naive random hold-out split (default proportions); the seed is fixed so the
# same rows land in each part on every run
from sklearn.model_selection import train_test_split
split_seed = 42
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=split_seed)

In [73]:
# Use an off-the-shelf k-nearest-neighbours classifier with default settings;
# n_jobs=-1 asks scikit-learn to use all available processors
from sklearn.neighbors import KNeighborsClassifier
knn_options = {'n_jobs': -1}
my_classifier = KNeighborsClassifier(**knn_options)

In [74]:
# Fit on the training part only; fit() returns the estimator, so its repr is shown
my_classifier.fit(X=X_train, y=y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
           weights='uniform')

# Evaluate the model

In [75]:
# Mean accuracy of the classifier on the held-out test part
my_classifier.score(X=X_test, y=y_test)

0.9682858289843437

In [76]:
from sklearn.metrics import confusion_matrix
# Rows are the true classes and columns the predicted ones. No `labels` are
# given, so only classes occurring in y_test or y_pred appear, in sorted order.
y_pred = my_classifier.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[9282,    0,    0,   65,    0],
       [ 127,    0,    0,  201,    0],
       [   3,    0,    0,    0,    0],
       [  51,    0,    0, 5188,    0],
       [  20,    0,    0,    7,    2]])

# Predict a single element

In [77]:
# Predict one individual record and compare it with its true label.
record_pos = 42  # arbitrary position within the test split
# iloc[[pos]] (list indexer) keeps the row as a one-row DataFrame, so the
# classifier receives 2-D input with the same column names it was fitted on;
# wrapping a Series in a list would drop those names.
y_pred = my_classifier.predict(X_test.iloc[[record_pos]])[0]
y_real = y_test.iloc[record_pos]
print(y_pred)
print(y_real)

Robot
Robot


In [1]:
# NOTE(review): this repeats the earlier `df.index` check. It was executed as
# the first cell of a fresh kernel, before `df` was loaded, hence the NameError
# shown below; re-run the data-loading cell first or remove this cell.
df.index

NameError: name 'df' is not defined