In [None]:
import pandas as pd
import csv
import datetime
import pickle
import os
import numpy as np
import sys
import time
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import re
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from datetime import timezone
import string
from string import punctuation
from langdetect import detect
from scipy.sparse import hstack
from scipy import sparse
from xgboost import XGBClassifier
from pytz import utc, timezone

In [None]:
# Load the input frame from a local pickle (presumably one row per tweet — TODO confirm).
# NOTE(review): unpickling can execute arbitrary code; only load this file from a trusted source.
df = pd.read_pickle('modeling_df_final.pkl')

In [None]:
# Show the dtype of every column in the loaded frame.
df.dtypes

In [None]:
# Strip the 'aaa' / 'aa' artifacts from the cleaned text.
# The longer pattern must be removed first: once every 'aa' has been deleted,
# no 'aaa' can remain, so the reverse order makes the 'aaa' pass a no-op.
# regex=False pins literal matching regardless of the pandas default.
df['text_cleaned'] = (
    df['text_cleaned']
    .str.replace('aaa', '', regex=False)
    .str.replace('aa', '', regex=False)
)

In [None]:
# Build one modeling row per (user_id, is_bot) pair:
#   - followers / following are averaged over the account's rows;
#   - the remaining numeric columns are summed;
#   - all cleaned texts are concatenated into one space-separated document
#     (missing texts are skipped so a single NaN cannot make str.join raise);
#   - the count of non-null screen_name values is used as the number of posts
#     in the period.
# Only scalar aggregations are used, so the result has flat column labels and
# no MultiIndex clean-up is needed.
modeling_df = (
    df.groupby(['user_id', 'is_bot'])
      .agg({'followers': 'mean',
            'following': 'mean',
            'is_retweet': 'sum',
            'is_quote_tweet': 'sum',
            'upper_count': 'sum',
            'char_count': 'sum',
            'word_count': 'sum',
            'hashtag_count': 'sum',
            'text_cleaned': lambda texts: ' '.join(texts.dropna()),
            'screen_name': 'count'})
      .reset_index()
      .rename(columns={'screen_name': 'posts_during_period'})
)

# Followers-to-following ratio, rounded to 2 decimals. The small epsilon keeps
# the division finite when an account follows nobody; any NaN left over
# (e.g. missing follower counts) is replaced with 0.
modeling_df['ratio'] = (
    (modeling_df['followers'] / (modeling_df['following'] + 0.0000001))
    .fillna(0)
    .round(2)
)

# Average number of characters per post for the account.
modeling_df['average_char'] = modeling_df['char_count'] / modeling_df['posts_during_period']

In [None]:
# TF-IDF over each account's concatenated text: unigrams and bigrams with
# English stop words removed, keeping terms that occur in at least 4 documents
# and in at most 40% of them, capped at 35,000 features.
# NOTE(review): the vectorizer is fitted on every account before the
# train/test split, so its vocabulary and IDF weights see the test rows —
# confirm this is acceptable.
tfidf_params = dict(
    min_df=4,
    max_df=.4,
    max_features=35000,
    ngram_range=(1, 2),
    stop_words='english',
)
count_vect = TfidfVectorizer(**tfidf_params)
df_text = count_vect.fit_transform(modeling_df['text_cleaned'])

In [None]:
# (rows, TF-IDF features) of the fitted text matrix.
df_text.shape

In [None]:
# Numeric feature block. `is_bot` is the prediction target (it is what `y` is
# built from), so it must be dropped here as well — leaving it in hands the
# label to every model as an input feature.
X_sparse = modeling_df.drop(['user_id', 'is_bot', 'text_cleaned', 'is_quote_tweet'], axis = 1)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back for older installations.
if hasattr(count_vect, 'get_feature_names_out'):
    text_features = list(count_vect.get_feature_names_out())
else:
    text_features = list(count_vect.get_feature_names())
number_features = list(X_sparse.columns)

# The design matrix is assembled as hstack((X_sparse, df_text)): numeric
# columns first, then the TF-IDF columns. The names must follow that same
# order or every feature importance is attributed to the wrong feature.
feature_names = number_features + text_features

In [None]:
# Convert the numeric feature frame to a CSR matrix so it can be stacked with
# the sparse TF-IDF block. np.float was only an alias for the builtin float and
# was removed in NumPy 1.24; np.float64 is the same dtype.
X_sparse = sparse.csr_matrix(X_sparse.values.astype(np.float64))

In [None]:
# (rows, numeric features) of the sparse numeric block.
X_sparse.shape

In [None]:
# Final design matrix: numeric columns first, TF-IDF columns after them.
X_final = hstack([X_sparse, df_text])
# The two source blocks are no longer needed once stacked; free them.
del df_text, X_sparse

# Target vector and an 80/20 train/test split with a fixed seed.
y = modeling_df['is_bot']
X_train, X_test, y_train, y_test = train_test_split(
    X_final,
    y,
    random_state=42,
    test_size=0.2,
)

# Text RF Model

In [None]:
# Small random forest (5 trees, fixed seed) fitted on the training split.
rf_model = RandomForestClassifier(n_estimators=5, random_state=44)
rf_model.fit(X_train, y_train)
pred = rf_model.predict(X_test)

# Print accuracy, precision and F1 on the test split, in that order.
for metric in (accuracy_score, precision_score, f1_score):
    print(metric(y_test, pred))

In [None]:
# Recall of the random forest predictions on the test split.
recall_score(y_test,pred)

In [None]:
# Sum of the predicted labels (the number of predicted positives, assuming 0/1 labels — TODO confirm).
pred.sum()

In [None]:
# Sum of the true test labels (the number of actual positives, assuming 0/1 labels — TODO confirm).
y_test.sum()

In [None]:
# Name -> importance lookup, kept for convenience. A dict holds one value per
# key, so it cannot represent two features that share a name (e.g. a TF-IDF
# token spelled like a numeric column); do not use it to build the table.
feats = dict(zip(feature_names, rf_model.feature_importances_))

# Build the table straight from the two aligned sequences: features with the
# same name keep separate rows, and a length mismatch between names and
# importances raises instead of being silently truncated by zip().
importances = pd.DataFrame({'Gini-importance': rf_model.feature_importances_},
                           index=feature_names)
importances = importances.sort_values(by='Gini-importance', ascending=False)

In [None]:
# Ten highest-ranked features for the random forest.
importances.head(10)

# Logistic Model

In [None]:
# Logistic regression with library defaults, fitted on the training split;
# the cell displays its F1 score on the test split.
log_model = LogisticRegression().fit(X_train, y_train)
f1_score(y_test, log_model.predict(X_test))

In [None]:
# F1 score of the logistic model on the test split.
f1_score(y_test,log_model.predict(X_test))

In [None]:
# Recall of the logistic model on the test split.
recall_score(y_test,log_model.predict(X_test))

In [None]:
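# NOTE(review): keep disabled unless the path is changed — this writes the
# aggregated modeling_df to 'modeling_df_final.pkl', the same file the
# notebook reads its input from, so running it would overwrite the input.
#modeling_df.to_pickle('modeling_df_final.pkl')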

# Naive Bayes

In [None]:
# Multinomial naive Bayes on the combined numeric + TF-IDF matrix; the cell
# displays its F1 score on the test split.
clf = MultinomialNB()
clf.fit(X_train, y_train)
f1_score(y_test, clf.predict(X_test))

# Gradient Boosting

In [None]:
# Gradient-boosted trees with library-default hyperparameters.
xgb_model = XGBClassifier()
# Fit on the training split; as the last expression, the fitted estimator's repr is the cell output.
xgb_model.fit(X_train, y_train)

In [None]:
# Predict once and reuse the result for both metrics: F1 first, then recall.
xgb_pred = xgb_model.predict(X_test)
print(f1_score(y_test, xgb_pred))
print(recall_score(y_test, xgb_pred))

In [None]:
# Name -> importance lookup, kept for convenience. A dict holds one value per
# key, so it cannot represent two features that share a name (e.g. a TF-IDF
# token spelled like a numeric column); do not use it to build the table.
feats = dict(zip(feature_names, xgb_model.feature_importances_))

# Build the table straight from the two aligned sequences: features with the
# same name keep separate rows, and a length mismatch between names and
# importances raises instead of being silently truncated by zip().
# NOTE(review): the 'Gini-importance' label is kept so this table matches the
# random forest one, but XGBoost importances depend on the model's
# importance_type and are presumably not Gini-based — confirm before reporting.
importances2 = pd.DataFrame({'Gini-importance': xgb_model.feature_importances_},
                            index=feature_names)
importances2 = importances2.sort_values(by='Gini-importance', ascending=False)

In [None]:
# Twenty highest-ranked features for the XGBoost model.
importances2.head(20)