In [None]:
import pandas as pd
import csv
import datetime
import pickle
import os
import numpy as np
import sys
import time
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import re
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import string
from string import punctuation
from langdetect import detect
from scipy.sparse import hstack
from scipy import sparse
from xgboost import XGBClassifier

In [None]:
# Load the DataFrame stored in the local pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load a file you trust.
df = pd.read_pickle('modeling_df_final.pkl')

In [None]:
# Columns removed from the frame before it is used as a feature matrix.
drop_cols = [
    'user_id',
    'is_bot',
    'screen_name',
    'text',
    'hashtags',
    'links',
    'timestamp',
    'is_quote_tweet',
    'date_joined',
]

In [None]:
# Keep rows whose 'following' value is below 12000
# (an alternative that also caps 'followers' at 12000 was considered).
low_following_mask = df['following'] < 12000
comparison = df.loc[low_following_mask]
print(comparison.shape)
print(comparison['is_bot'].value_counts())

# Target is the bot flag; features are whatever remains after removing drop_cols.
y = comparison['is_bot']
X = comparison.drop(columns = drop_cols)
X_train, X_test, y_train, y_test = train_test_split(
    X.values, y, test_size = .33, random_state = 42
)

In [None]:
X.head()

# Modeling with No Text

In [None]:
df.columns

In [None]:
# Collapse the frame to one row per (user_id, is_bot) pair.
# 'timestamp' keeps its list-form spec, so the aggregated frame has two-level column labels.
user_agg_spec = {
    'timestamp': ['min'],
    'followers': 'mean',
    'following': 'mean',
    'is_retweet': 'sum',
    'is_quote_tweet': 'sum',
    'upper_count': 'sum',
    'char_count': 'sum',
    'word_count': 'sum',
    'hashtag_count': 'sum',
    'screen_name': 'count',
}
modeling_df = df.groupby(['user_id', 'is_bot']).agg(user_agg_spec).reset_index()


In [None]:
# The aggregation leaves two-level column labels; keep only the top level so columns
# can be addressed by plain names.
# NOTE(review): not re-runnable as-is — once flattened there is no level 1 left to drop.
modeling_df.columns = modeling_df.columns.droplevel(1)

In [None]:
# Derived per-user features.
#
# This cell is written to be safely re-runnable: `rename` silently ignores a missing
# 'screen_name' column, and the other columns are simply recomputed. The previous
# copy-then-drop(inplace=True) sequence raised KeyError on a second run.

# The per-user row count (aggregated from 'screen_name') is the number of posts in the period.
modeling_df = modeling_df.rename(columns={'screen_name': 'posts_during_period'})

# Followers-to-following ratio; the tiny epsilon avoids division by zero when
# 'following' is 0. Missing results become 0, and values are rounded to 2 decimals.
modeling_df['ratio'] = (
    (modeling_df['followers'] / (modeling_df['following'] + 0.0000001))
    .fillna(0)
    .round(2)
)

# Mean characters per post over the period.
modeling_df['average_char'] = modeling_df['char_count'] / modeling_df['posts_during_period']
#modeling_df['average_upper_char'] = modeling_df['upper_count']/modeling_df['posts_during_period']

In [None]:
modeling_df.shape

In [None]:
# Keep only rows whose 'followers' value is below 12000, then show the remaining shape.
under_follower_cap = modeling_df['followers'] < 12000
modeling_df = modeling_df.loc[under_follower_cap]
modeling_df.shape

In [None]:
modeling_df['is_bot'].value_counts()

In [None]:
# Target is the bot flag; the feature frame is everything except the columns listed here.
non_feature_cols = ['is_bot','user_id','timestamp', 'is_quote_tweet']
y = modeling_df['is_bot']
X = modeling_df.drop(columns = non_feature_cols)

In [None]:
features = X.columns

## No Text RF Model

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X.values, y, test_size = 0.2, random_state = 42)

# Small random-forest baseline on the aggregated (non-text) features.
rf_model = RandomForestClassifier(n_estimators=5,random_state = 42) #class_weight={0:1,1:100}
rf_model.fit(X_train, y_train)

# Predict once and reuse the result for every metric. Only the last expression of a
# cell is displayed, so the metrics are gathered into one Series instead of being
# computed on separate lines (where all but the final one were silently discarded).
rf_pred = rf_model.predict(X_test)
pd.Series({
    'accuracy': accuracy_score(y_test, rf_pred),
    'precision': precision_score(y_test, rf_pred),
    'f1': f1_score(y_test, rf_pred),
})

In [None]:
precision_score(y_test,rf_model.predict(X_test))

In [None]:
rf_model.estimators_

In [None]:
accuracy_score(y_test,rf_model.predict(X_test))

In [None]:
recall_score(y_test,rf_model.predict(X_test))

In [None]:
# TODO: account for class imbalance (e.g. class weights or resampling) before relying on these scores.

In [None]:
X.columns

In [None]:
rf_model.feature_importances_

In [None]:
y_test.sum()

In [None]:
rf_model.predict(X_test).sum()

## Logistic Regression Model (No Text)

In [None]:
# Default-parameter logistic regression fitted on the current train split; display its F1.
log_model = LogisticRegression().fit(X_train, y_train)
log_pred = log_model.predict(X_test)
f1_score(y_test, log_pred)

In [None]:
recall_score(y_test,log_model.predict(X_test))

In [None]:
accuracy_score(y_test,log_model.predict(X_test))

In [None]:
precision_score(y_test,log_model.predict(X_test))

## XGB Model No Text

In [None]:
# Gradient-boosted model: 1000 depth-3 trees with a 0.01 learning rate, fixed seed.
xgb_params = dict(max_depth = 3, learning_rate = 0.01, n_estimators = 1000, random_state = 38)
xgb_model = XGBClassifier(**xgb_params)
xgb_model.fit(X_train, y_train)

In [None]:
pred = xgb_model.predict(X_test)

In [None]:
# Print F1, recall and accuracy (in that order) for the stored XGB predictions.
for metric in (f1_score, recall_score, accuracy_score):
    print(metric(y_test, pred))

In [None]:
print(precision_score(y_test, xgb_model.predict(X_test)))

In [None]:
# Pair each feature name with the importance reported by the fitted XGB model.
feats = dict(zip(features, xgb_model.feature_importances_))

# One row per feature, largest importance first.
importances = (
    pd.DataFrame.from_dict(feats, orient='index')
    .rename(columns={0: 'Gini-importance'})
    .sort_values(by='Gini-importance', ascending = False)
)

In [None]:
importances.head(15)

In [None]:
sns.scatterplot(x = 'following', y = 'followers', hue = 'is_bot', data = modeling_df);

In [None]:
sns.scatterplot(x = 'upper_count', y = 'ratio', hue = 'is_bot', data = modeling_df);

In [None]:
# 'following' against the summed 'upper_count' per row, coloured by the bot flag.
ax = sns.scatterplot(data = modeling_df, x = 'following', y = 'upper_count', hue = 'is_bot')
ax.set(xlabel = 'Following', ylabel = 'Upper Character Used');

In [None]:
# 'following' against the number of posts in the period, coloured by the bot flag.
ax = sns.scatterplot(data = modeling_df, x = 'following', y = 'posts_during_period', hue = 'is_bot')
ax.set(ylabel = 'Posts During Period');

In [None]:
print(xgb_model.predict(X_test).sum())
print(y_test.sum())

In [None]:
# Put the held-out feature array back into a frame labelled with the feature names.
results = pd.DataFrame(X_test, columns = features)
results.shape

In [None]:
# Per-row difference between the true label and the XGB prediction, with y_test's
# original index moved into a column.
# Assuming 0/1 labels: 0 = correct, 1 = predicted 0 for a true 1, -1 = predicted 1 for a true 0.
preds = (y_test - xgb_model.predict(X_test)).to_frame().reset_index()

In [None]:
# 'index' holds y_test's original row labels.
# NOTE(review): despite its name, 'predicted' holds y_test minus the model's prediction
# (that is how preds is built), not the raw prediction.
preds.columns = ['index', 'predicted']

In [None]:
# Move results' positional index into an 'index' column, then place the preds columns
# alongside it. The combined frame therefore carries two columns named 'index':
# the first from results, the second from preds.
results = pd.concat([results.reset_index(), preds], axis = 1)

In [None]:
import matplotlib.transforms

# Confusion matrix for the stored XGB predictions. In scikit-learn's layout, rows are
# the true labels and columns are the predicted labels.
conf_matrix = confusion_matrix(y_test, pred)
ax = sns.heatmap(conf_matrix, cmap="OrRd", annot=True, fmt='g')

# Label the axes so the figure can be read on its own: heatmap rows run along the
# y-axis (true labels) and columns along the x-axis (predicted labels).
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
ax.set_title('XGB (no text) confusion matrix');