The goal of this notebook is to train a random_forest classifier on our cleaned dataset.

Here is a link to a data dictionary (which explains what the column names mean): https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/user-object.

See notebooks 4, 5, and 6 from the Classification lectures.

- [x] Implement a basic train-test split
- [x] Sample the data for training
- [x] Train one model
- [x] Quantify the model's accuracy
- [x] Monkey with hyperparameters, etc.
- [x] Make a more sophisticated train-test split
- [ ] Implement K-fold cross validation
- [ ] Create a proper scikit-learn pipeline
- [ ] Make some decent visualizations
- [ ] Add the categorical variables to dataframe (starting by making binary categories into boolean variables)
- [ ] One-hot encode the categorical variables, if necessary
- [ ] Examine feature importance
- [ ] Examine AUC

## Front matter

In [90]:
from pathlib import Path # For navigating to the datafiles
import json # For processing json files
import csv # For processing csv files

from functools import reduce # So Chris can pretend he's a CS major

import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt # For simple graphing
import seaborn as sns # For sns.set_style("whitegrid"), I guess?

## This sets the plot style
## to have a grid on a white background
sns.set_style("whitegrid")

## Prepare data

In [3]:
# Read "well_formatted_json_files.txt"

data_files_to_read_path = Path('well_formatted_json_files.txt')
with open(data_files_to_read_path) as file:
    lines = file.read().splitlines()
    # Remove commented lines and blank lines:
    lines = [li for li in lines if ((li[0] != '#') and ( li.strip()))]
    print(len(lines))

10


In [4]:
# Read in json and tsv files

data_path = Path('data')

l = list()
for li in lines:
    # Read into dataframes
    
    json_path = data_path / Path(li + '_tweets.json')
    with open(json_path) as file:
        very_raw_json_contents = json.load(file)
    raw_json_contents = [x['user'] for x in very_raw_json_contents]
    json_contents = pd.DataFrame(raw_json_contents)
    tsv_path = data_path / Path(li + '.tsv')
    tsv_contents = pd.read_csv(tsv_path, sep='\t')
    tsv_contents.columns = ['user_id','species']
    
    # Set indexes to user ids and remove duplicate indices
        
    json_contents = json_contents.set_index('id')
    tsv_contents = tsv_contents.set_index('user_id')
    json_contents = json_contents.loc[~json_contents.index.duplicated(keep='last')]
    tsv_contents = tsv_contents.loc[~tsv_contents.index.duplicated(keep='last')]
    
    # Merge
    
    merged_data = pd.concat([json_contents, tsv_contents], join='inner', axis = 1)
    merged_data = merged_data.assign(source=li)
    
    l.append(merged_data)


cumulative_raw_data = reduce(lambda x, y : pd.concat([x,y]), l)

In [5]:
columns_to_keep = ['has_extended_profile','profile_use_background_image','profile_background_image_url_https','verified','profile_text_color','profile_sidebar_fill_color','followers_count','profile_sidebar_border_color','profile_background_color','listed_count','utc_offset','statuses_count','profile_link_color','geo_enabled','profile_background_image_url','lang','profile_background_tile','favourites_count','url','created_at','time_zone','default_profile','friends_count']

target = cumulative_raw_data['species'] == 'bot'
# old code: target = np.asarray([cumulative_raw_data['species'] == 'bot']).reshape(-1,1)
source = cumulative_raw_data['source']
cut_raw_data = cumulative_raw_data[columns_to_keep]

cut_raw_data.sample(3)

Unnamed: 0,has_extended_profile,profile_use_background_image,profile_background_image_url_https,verified,profile_text_color,profile_sidebar_fill_color,followers_count,profile_sidebar_border_color,profile_background_color,listed_count,...,geo_enabled,profile_background_image_url,lang,profile_background_tile,favourites_count,url,created_at,time_zone,default_profile,friends_count
852512004,False,True,https://abs.twimg.com/images/themes/theme1/bg.png,False,333333,DDEEF6,17,C0DEED,C0DEED,0,...,True,http://abs.twimg.com/images/themes/theme1/bg.png,pl,False,117,,Sat Sep 29 08:52:13 +0000 2012,Brussels,True,0
124735548,False,True,https://abs.twimg.com/images/themes/theme1/bg.png,False,333333,DDEEF6,118,C0DEED,C0DEED,5,...,False,http://abs.twimg.com/images/themes/theme1/bg.png,en,False,1984,,Sat Mar 20 11:32:59 +0000 2010,,True,352
120557765,False,True,https://abs.twimg.com/images/themes/theme9/bg.gif,True,666666,252429,2872283,181A1E,1A1B1F,8221,...,False,http://abs.twimg.com/images/themes/theme9/bg.gif,en,False,376,https://t.co/rJixLu4cJy,Sat Mar 06 20:33:05 +0000 2010,Alaska,False,1816


In [117]:
# Keep numerical and boolean attributes. Throw out the categorical, etc.

numerical_cols = ['favourites_count','statuses_count','friends_count','followers_count','listed_count']
boolean_cols = ['profile_background_tile','default_profile','geo_enabled','verified','has_extended_profile','profile_use_background_image']

assert set(numerical_cols) <= set(cut_raw_data.columns)
assert set(boolean_cols) <= set(cut_raw_data.columns)

cleaned_data = cut_raw_data[numerical_cols + boolean_cols]

display(cleaned_data.sample(3, random_state=855))
print(cleaned_data.shape)

Unnamed: 0,favourites_count,statuses_count,friends_count,followers_count,listed_count,profile_background_tile,default_profile,geo_enabled,verified,has_extended_profile,profile_use_background_image
1870916851,132,30,0,1,0,False,True,False,False,False,True
340052871,2497,7452,183,199,0,True,False,True,False,False,True
156031870,6635,19851,146,259,4,True,False,True,False,True,True


(44595, 11)


In [118]:
# Encode and add one or two more variables

has_url = ~cut_raw_data.url.isnull()
cleaned_data = cleaned_data.assign(has_url=has_url)

is_eng = cut_raw_data.lang == 'en'
cleaned_data = cleaned_data.assign(is_eng=is_eng)

cleaned_data = cleaned_data.assign(default_link_clr=(cumulative_raw_data.profile_link_color == '1DA1F2'))

cleaned_data = cleaned_data.assign(default_text_clr=(cumulative_raw_data.profile_text_color == '333333'))

theme1 = [(x is not None and 'theme1' in x) for x in cumulative_raw_data.profile_background_image_url]
themeNone = [x is None for x in cumulative_raw_data.profile_background_image_url]
cleaned_data = cleaned_data.assign(bkground_theme1=theme1)
cleaned_data = cleaned_data.assign(bkground_none=themeNone)

cleaned_data = cleaned_data.assign(bkground_C0DEED=cumulative_raw_data.profile_background_color=='C0DEED')
cleaned_data = cleaned_data.assign(bkground_F5F8FA=cumulative_raw_data.profile_background_color=='F5F8FA')
cleaned_data = cleaned_data.assign(bkground_000000=cumulative_raw_data.profile_background_color=='000000')

print('measuring description lengths')
cleaned_data = cleaned_data.assign(desc_len=[len(x) for x in cumulative_raw_data['description']])
print('done measuring description lengths')

print('measuring username lengths')
cleaned_data = cleaned_data.assign(name_len=[len(x) for x in cumulative_raw_data['name']])
print('done measuring username lengths')

measuring description lengths
done measuring description lengths
measuring username lengths
done measuring username lengths


In [121]:
cleaned_data.bkground_000000.value_counts().head()

False    41062
True      3533
Name: bkground_000000, dtype: int64

NUMERICAL:

* favourites_count
* statuses_count
* friends_count
* followers_count
* listed_count
* desc_len
* name_len

BOOLEAN:

* profile_background_tile
* default_profile [boolean]
* geo_enabled
* verified
* has_extended_profile
* profile_use_background_image

CATEGORICAL (ADDED):

* url (added as boolean "has_url")
* lang (added as boolean "is_eng")
* profile_link_color [categorical: 1DA1F2 or other]
* profile_text_color [consider categorical: either equal to 333333 or not]
* bkground_theme1 (profile_background_image_url [categorical: theme1, NaN, or other])
* bkground_none (profile_background_image_url [categorical: theme1, NaN, or other])
* profile_background_color [categorical: equal to C0DEED, F5F8FA, 000000, or other]
 * bkground_C0DEEED
 * bkground_F5F8FA
 * bkground_000000

CATEGORICAL (NOT YET ADDED):

* profile_background_image_url_https [consider categorical: equal to theme1, theme14, NaN, or other]
* profile_sidebar_border_color [consider categorical: either equal to C0DEED, FFFFFF, 000000, or other]
* profile_sidebar_fill_color [consider categorical: either equal to DDEEF6 or not]

WEIRD/OTHER (NOT GOING TO BE ADDED)

* utc_offset [categorical? UTC offset measured in seconds]
* created at [look at day or week, or at time of day]
* time_zone [categorical: NaN or other]

In [83]:
cleaned_data.has_url.value_counts()

False    31969
True     12626
Name: has_url, dtype: int64

## Train model

The train-test split stratifies by target right now, and not by source. We should stratify by both. (Note to Chris: Think of stratifying by a source-target ordered pair.)

In [71]:
# This is an array of ordered pairs. Each pair combines the bot/human value of
# an account with the source dataset from which the account came. We need this
# so that train_test_split can stratify by bot/human value AND data source.

# This codeblock takes ten seconds to run on my PC.

stratify_guide = np.asarray([(target[acct],source[acct]) for acct in cleaned_data.index]).reshape(-1,1)

In [72]:
# Train test split

X_train,X_test,y_train,y_test = train_test_split(cleaned_data[['favourites_count']],
                                                target,
                                                train_size=0.75,
                                                shuffle=True,
                                                stratify=target,
                                                random_state=855)

I tried hyperparameter tuning of `rf_model` but I couldn't really get the accuracy to budge. Adding more attributes to the training data would probably help.

In [75]:
rf_model = RandomForestClassifier(n_estimators = 100,
                                  max_features = 'auto',
                                  criterion='entropy')
rf_model.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

## Test model

y is True for bots, False for humans

In [76]:
pred = rf_model.predict(X_test)
print('overall accuracy:', np.sum(pred == y_test)/len(y_test))
print()

for src in set(source.values):
    print(src)
    print('accuracy:', np.sum( (pred == y_test) & (source == src) )/source.value_counts()[src])
    print()

overall accuracy: 0.7233832630729213

vendor-purchased-2019
accuracy: 0.16022099447513813

verified-2019
accuracy: 0.1691842900302115

botwiki-2019
accuracy: 0.18794835007173602

cresci-rtbust-2019
accuracy: 0.12572254335260116

celebrity-2019
accuracy: 0.11441608923440932

cresci-stock-2018
accuracy: 0.14847457627118643

pronbots-2019
accuracy: 0.23896873776634417

botometer-feedback-2019
accuracy: 0.1276595744680851

political-bots-2019
accuracy: 0.22950819672131148

gilani-2017
accuracy: 0.15787354007249296

