# Is it Trump?

President of the United States Donald Trump has a very particular style of tweeting. This machine learning model aims to assess whether a tweet was written by Trump or not. 

In [1]:
import pandas as pd
import numpy as np
import re

## 1. Import data

### A. Load datasets

In [2]:
# Tweets by trump

# Columns kept in every dataset: the tweet text and the binary label.
relevant_columns = ["content", "trump"]

# Label every row of the Trump dataset with 1 and keep only the shared columns.
trump = pd.read_csv("data/trump.csv").assign(trump=1)[relevant_columns]
trump.head(5)

Unnamed: 0,content,trump
0,Be sure to tune in and watch Donald Trump on L...,1
1,Donald Trump will be appearing on The View tom...,1
2,Donald Trump reads Top Ten Financial Tips on L...,1
3,New Blog Post: Celebrity Apprentice Finale and...,1
4,"""My persona will never be that of a wallflower...",1


In [3]:
# Other tweets: a collection of various generic tweet datasets.

def load_nontrump_data(filepath, txt_col, columns=None):
    """Load a CSV of non-Trump tweets and normalise it to the shared schema.

    Malformed CSV lines are skipped (with a warning) instead of aborting the load.

    Parameters
    ----------
    filepath : str
        Path of the CSV file to read.
    txt_col : str
        Name of the column holding the tweet text; it is renamed to 'content'.
    columns : list of str, optional
        Columns to keep, in order. Defaults to the notebook-level
        `relevant_columns`.

    Returns
    -------
    pandas.DataFrame
        The selected columns, with the 'trump' label set to 0 on every row.
    """
    try:
        # `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0;
        # 'warn' keeps the original behaviour of reporting and skipping bad lines.
        df = pd.read_csv(filepath, on_bad_lines='warn')
    except TypeError:
        # pandas < 1.3 does not know `on_bad_lines`.
        df = pd.read_csv(filepath, error_bad_lines=False)

    if columns is None:
        columns = relevant_columns

    df['trump'] = 0
    # Non in-place rename so the function stays a plain chain of new frames.
    df = df.rename(columns={txt_col: 'content'})
    return df[columns]


# Each source CSV is paired (by position) with the name of its tweet-text column.
filepaths = [
    'data/elonmusk.csv',
    "data/companytweets.csv",
    "data/financetweets.csv",
    "data/populartweets.csv",
]
# NOTE(review): "tweet " carries a trailing space — presumably it matches the CSV header; verify.
txt_columns = ["text", "tweet ", "text", "content"]

# One normalised, label-0 frame per source file.
other_df = [
    load_nontrump_data(path, column)
    for path, column in zip(filepaths, txt_columns)
]

b'Skipping line 3194: expected 8 fields, saw 17\nSkipping line 3205: expected 8 fields, saw 17\nSkipping line 3255: expected 8 fields, saw 17\nSkipping line 3520: expected 8 fields, saw 17\nSkipping line 4078: expected 8 fields, saw 17\nSkipping line 4087: expected 8 fields, saw 17\nSkipping line 4088: expected 8 fields, saw 17\nSkipping line 4499: expected 8 fields, saw 12\n'


### B. Build aggregate dataset

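The original plan was to balance classes with 4000 random tweets from Trump and 1000 tweets from each other dataset. Note that the code below does not subsample: it concatenates every loaded tweet, so the classes are imbalanced (Trump tweets are about 28% of the data, as printed below). 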

In [4]:
# Stack the Trump tweets and every non-Trump frame in a single concatenation.
# `DataFrame.append` in a loop copies the growing frame on each iteration and
# was removed in pandas 2.0; `pd.concat` keeps the same row order and the
# original (non-reset) index values.
dataset = pd.concat([trump] + list(other_df))

In [5]:
# The label is 0/1, so its sum is the number of Trump tweets.
n_trump_tweets = dataset['trump'].sum()
share_trump = n_trump_tweets / len(dataset)

print(f"{100*share_trump}% of the data is Trump tweets")

27.934969617691973% of the data is Trump tweets


### C. Clean dataset

In [6]:
# We are only interested in writing style, so we remove links. 

def remove_links(txt):
    """Return `txt` with every URL-like substring (scheme://host[/path...]) removed."""
    url_pattern = re.compile(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*')
    return url_pattern.sub('', txt)

# Strip links from every tweet, then write the cleaned text back to the same column.
content_without_links = dataset['content'].apply(remove_links)
dataset['content'] = content_without_links

In [7]:
# Only the last expression of a cell is displayed, so the sample below is
# computed but not shown; the cell output is the (rows, columns) shape.
dataset.sample(10)
dataset.shape

(155189, 2)

### D. Train/Test split

In [8]:
from sklearn.model_selection import train_test_split

# 80/20 split. A fixed random_state makes the split, and therefore the scores
# reported further down, reproducible across kernel restarts.
dataset_train, dataset_test = train_test_split(dataset, train_size=0.8, random_state=42)

In [9]:
# Persist the full cleaned dataset (not the train/test split) to disk.
dataset.to_csv('dataset_3.csv')

## 2. Baseline model

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

# n-gram range 2,10 + xgb => 0.9765
# n-gram range 2,6 + xgb => 0.9742
# n-gram range 3,6 + xgb => 
# n-gram range 2,5 + rf => 

def get_text_length(x):
    """Return the character length of each text in `x` as an (n, 1) column array."""
    lengths = list(map(len, x))
    return np.array(lengths).reshape(-1, 1)


# Feature extraction: text length, character n-gram TF-IDF and word TF-IDF,
# computed side by side and concatenated.
text_features = FeatureUnion(
    transformer_list=[
        ('len', FunctionTransformer(get_text_length, validate=False)),
        ('count_characters', TfidfVectorizer(
            analyzer='char_wb', ngram_range=(2, 3), min_df=3,
            lowercase=False, binary=False, dtype=np.float32)),
        ('tfidf_words', TfidfVectorizer(
            analyzer='word', ngram_range=(1, 1), min_df=3,
            lowercase=True, dtype=np.float32)),
    ],
    n_jobs=4,
)

# Gradient-boosted trees; the prior (base_score) is the share of Trump tweets.
classifier = XGBClassifier(max_depth=16, n_estimators=100, base_score=share_trump, n_jobs=4)

log = Pipeline(
    steps=[
        ('vectorize', text_features),
        # Dimensionality reduction step, currently disabled:
        #('pca', TruncatedSVD(n_components=128)),
        ('xgboost', classifier),
    ],
    verbose=True,
)

# `dump` was only imported in a later cell, so this cell raised NameError on a
# fresh kernel; import it where it is first used.
from joblib import dump

# Fit on the full dataset, then persist the fitted pipeline.
log.fit(dataset['content'].to_list(), dataset['trump'])

dump(log, "tfidf_len_vocab_xgbc_6")
print('Done.')
# 0.98427

[Pipeline] ......... (step 1 of 2) Processing vectorize, total=  21.3s


KeyboardInterrupt: 

In [70]:
# Fit on the training split only, then report accuracy on the held-out split.
train_texts = dataset_train['content'].to_list()
test_texts = dataset_test['content'].to_list()

log.fit(train_texts, dataset_train['trump'])
log.score(test_texts, dataset_test['trump'])

[Pipeline] ......... (step 1 of 2) Processing vectorize, total=  17.2s
[Pipeline] ........... (step 2 of 2) Processing xgboost, total= 8.0min


0.9821509117855531

In [56]:
# Accuracy on the training split, for comparison with the held-out score.
train_texts = dataset_train['content'].to_list()
log.score(train_texts, dataset_train['trump'])

0.9899235608251242

In [75]:
# Probability of the positive (Trump) class for a hand-written sample.
sample_tweet = """
Hello everyone. I am writing something. I hope this is not too trump-like.
"""
trump_probability = log.predict_proba([sample_tweet])[:, 1]
trump_probability
# close to 1 = probably trump

array([0.0743595], dtype=float32)

In [72]:
# Probability of the positive (Trump) class for a second hand-written sample.
sample_tweet = """
Yo who wants to play golf tonight
"""
trump_probability = log.predict_proba([sample_tweet])[:, 1]
trump_probability
# far from 1 = probably not trump

array([0.00809272], dtype=float32)

In [49]:
# Size of a fitted vectorizer vocabulary.
# NOTE(review): with the pipeline defined in this notebook, transformer_list[0] is the
# 'len' FunctionTransformer; the TfidfVectorizer entries are at indices 1 and 2 —
# confirm which one this cell was meant to inspect.
len(log.steps[0][1].transformer_list[0][1].vocabulary_)

726538

In [50]:
# Show the estimator stored in the second pipeline step.
step_name, fitted_classifier = log.steps[1]
fitted_classifier

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=4, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              validate_parameters=False, verbosity=None)

In [11]:
%%time
from joblib import dump 
# Refit the pipeline on the full dataset (train and test together).
full_texts = dataset['content'].to_list()
log.fit(full_texts, dataset['trump'])
print('Saving')


[Pipeline] ......... (step 1 of 2) Processing vectorize, total=  22.9s
[Pipeline] ........... (step 2 of 2) Processing xgboost, total=10.2min
Saving
CPU times: user 39min 49s, sys: 1.28 s, total: 39min 51s
Wall time: 10min 32s


PicklingError: Can't pickle <function get_text_length at 0x7fe080cb1b00>: it's not the same object as __main__.get_text_length

## 3. BERT model

### A. Precompute BERT \[CLS\] embeddings

In [None]:
from transformers import AutoModel, AutoTokenizer, BertModel

# Load DistilBERT
# "distilbert-base-cased" is a DistilBERT checkpoint. Loading it through BertModel
# instantiates a different architecture than the one the weights were saved for;
# AutoModel selects the model class from the checkpoint's own configuration.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
model = AutoModel.from_pretrained('distilbert-base-cased')

In [None]:
import dask.dataframe as dd

def get_bert_embedding(txt: str) -> np.ndarray:
    """Return the embedding of the first token of `txt` as a NumPy array.

    The text is tokenized with the notebook-level `tokenizer`, passed through the
    notebook-level `model`, and the first output of the model is sliced at
    sequence position 0 (the [CLS] position referred to in the section title).

    Parameters
    ----------
    txt : str
        The text to embed.

    Returns
    -------
    np.ndarray
        Array of shape (batch, hidden) — presumably (1, 768) for a single
        string, given the later reshape to 768 columns; TODO confirm.
    """
    # Local import: torch is not imported at the top of the notebook.
    import torch

    tokens = tokenizer(txt, return_tensors="pt", padding=True, truncation=True)
    # Inference only: skip building the autograd graph to save memory and time.
    with torch.no_grad():
        first_output = model(**tokens)[0]
    return first_output[:, 0, :].detach().numpy()

def multithread_apply(df):
    """Compute `get_bert_embedding` for the 'content' of every row of `df`.

    The frame is split into 8 dask partitions that are processed with the
    threaded scheduler; the per-row results are returned once computed.
    """
    def embed_row(row):
        return get_bert_embedding(row['content'])

    def embed_partition(partition):
        return partition.apply(embed_row, axis=1)

    partitioned = dd.from_pandas(df, npartitions=8)
    embeddings = partitioned.map_partitions(embed_partition)
    return embeddings.compute(scheduler='threads')

In [None]:
# Load training embeddings from disk; this is the same path the notebook writes
# them to after computing them.
data_train_embeddings = pd.read_csv("data/data_train.zip")

In [None]:
# Compute the embeddings for the training rows and save them to disk.
# NOTE(review): `data_train` is not defined in the visible cells (the split above
# produces `dataset_train`) — confirm where it comes from before re-running.
data_train_embeddings = multithread_apply(data_train)
data_train_embeddings.to_csv('data/data_train.zip')

In [None]:
# Disabled experiments for attaching the embeddings to `data_train`.
# NOTE(review): a later cell reads data_train['embedding'], which none of the
# active lines here create — confirm which of these steps is still required.
#data_train_embeddings.name = 'embedding'
#data_train = data_train[relevant_columns].join(data_train_embeddings)
#data_train_embeddings.columns = ['index', 'embedding']
#data_train_embeddings.set_index('index', inplace=True)
#data_train_embeddings
data_train

In [None]:
# Load test embeddings from disk; this is the same path the notebook writes
# them to after computing them.
data_test_embeddings = pd.read_csv("data/data_test.zip")

In [None]:
# Compute the embeddings for the test rows and save them to disk.
# NOTE(review): `data_test` is not defined in the visible cells (the split above
# produces `dataset_test`) — confirm where it comes from before re-running.
data_test_embeddings = multithread_apply(data_test)
data_test_embeddings.to_csv('data/data_test.zip')

In [None]:
# Name the embeddings (presumably a Series) so the joined column is called
# 'embedding', then attach it to the text/label columns of the test rows.
data_test_embeddings.name = 'embedding'
data_test = data_test[relevant_columns].join(data_test_embeddings)

In [None]:
# Convert all to np.array

# Stack the per-row embeddings into 2-D feature matrices with 768 columns.
# The test arrays are built as well because the scoring call in the next
# section needs X_test and y_test.
X_train = np.array(data_train['embedding'].to_list()).reshape(-1, 768)
X_test = np.array(data_test['embedding'].to_list()).reshape(-1, 768)

# Binary labels (the 'trump' column) aligned with the rows above.
y_train = np.array(data_train['trump'])
y_test = np.array(data_test['trump'])

### B. Regression

In [None]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.neural_network import MLPClassifier

# Alternative model kept for reference:
#log = LogisticRegressionCV(Cs=20, fit_intercept=False, max_iter=10000, scoring="f1")

# Small multi-layer perceptron trained on the precomputed embeddings.
# Note: this rebinds `log`, which earlier cells use for the TF-IDF pipeline.
log = MLPClassifier(max_iter=10000, hidden_layer_sizes=(16, 8, 4)).fit(X_train, y_train)
log.score(X_test, y_test)

### C. Test

In [None]:
def is_it_trump(txt):
    """Return the classifier's probability for class 1 (Trump) for the text `txt`."""
    class_probabilities = log.predict_proba(get_bert_embedding(txt))
    return class_probabilities[:, 1]

# Example input: score a sample tweet with the embedding-based classifier.
txt = """
IT’S HERE! IT’S REALLY HERE! 
🙃
 I finally got back my smile! Hope this record puts one on your face 
🙂
 #SMILE
 
🙂
 IS OUT EVERYWHERE NOW! LOVE YOU GUYS SO MUCH ENJOY 
🤡
♥️
 (sent from my hospital bed lol)
 """
is_it_trump(txt)