In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.preprocessing import LabelEncoder 
from pandas.io.json import json_normalize
import seaborn as sns
from pandas.plotting import scatter_matrix
sns.set_style("whitegrid")

In [2]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import TransformedTargetRegressor

# Loading Data

In [8]:
accts = pd.read_csv('data/Cumulative_raw_data.csv')
pd.set_option('display.max_colwidth', None) # prevents descriptions from being cut off
accts['description'] = accts['description'].values.astype('U') #formating as unicode for Tfidf
#accts = accts[:10000] #shortening for exploratory data work to make easier on computer

In [9]:
#One hot encoding bot
accts['bot'] = 0
accts.loc[accts.species == "bot",'bot'] = 1

In [10]:
y = accts[['bot']].to_numpy()
X = accts[['description']]
X_train, X_test, y_train, y_test = train_test_split(
     X, y, test_size=0.25, random_state=1400)


# Voting Algo
### TfidfVectorization, Hashing, CountVectorization, Basic KNN

In [32]:
from mlxtend.classifier import EnsembleVoteClassifier
from mlxtend.feature_selection import ColumnSelector
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import GridSearchCV

In [17]:
# Creating models
#t_vectorizer = TfidfVectorizer()
#h_vectorizer = HashingVectorizer(n_features = 20)
#c_vectorizer = CountVectorizer()

In [18]:
#t_vectorizer.fit(X_train['description'].tolist())
#c_vectorizer.fit(X_train['description'].tolist())
#X_t_train = t_vectorizer.transform(X_train['description'])
#X_h_train = h_vectorizer.transform(X_train['description'])
#X_c_train = c_vectorizer.transform(X_train['description'])

In [19]:
#preprocessor = ColumnTransformer(
#    transformers=[
#        ('num', numeric_transformer, numeric_features),
#        ('cat', categorical_transformer, categorical_features)])

In [27]:
#transformer = ColumnTransformer(
#    transformers =[
#        ('tfidf', TfidfVectorizer())])

pipe_t = Pipeline(steps=[#('transform', TfidfVectorizer()), 
        ('tfidf', TfidfVectorizer()),
        ('knn', KNeighborsClassifier(n_neighbors = 10))]
                )

pipe_c = Pipeline(steps=[#('transform', TfidfVectorizer()), 
        ('count', CountVectorizer()),
        ('knn', KNeighborsClassifier(n_neighbors = 10))]
                )

pipe_h = Pipeline(steps=[#('transform', TfidfVectorizer()), 
        ('hashing', HashingVectorizer(n_features = 20)),
        ('knn', KNeighborsClassifier(n_neighbors = 10))])
    
desc_vote = VotingClassifier(estimators=[
    ('t', pipe_t), ('c', pipe_c), ('h', pipe_h)
], voting='soft')

desc_vote = desc_vote.fit(X_train['description'], y_train.ravel())

In [28]:
y_predict = desc_vote.predict(X_test['description'])

In [29]:
print("Our model has a ",
      np.round(sum(y_predict == y_test.ravel())/len(y_test)*100,2),
      "% accuracy on the testing set")

Our model has a  80.41 % accuracy on the testing set


In [36]:
grid_params = dict(
    voting__weights=[[0,0,1], [0,1,0], [1,0,0], [1, 1, 1], [1, 1, 0], [0, 1, 1], [1, 0, 1]]
)

In [39]:
grid_search = GridSearchCV(desc_vote, grid_params, n_jobs=-1, verbose=5, refit=True, scoring=None)
grid_search.fit(X_train['description'], y_train.ravel())

Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 out of  24 | elapsed:    1.7s remaining:    0.8s
[Parallel(n_jobs=-1)]: Done  21 out of  24 | elapsed:    1.7s remaining:    0.2s


AttributeError: 'str' object has no attribute 'set_params'

# Adaboost

In [78]:
from sklearn.tree import DecisionTreeClassifier

In [79]:
X = accts[['followers_count', 'friends_count', 'listed_count', 'favourites_count', ]].to_numpy()
y = accts[['bot']].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(
     X, y, test_size=0.25, random_state=1400)

In [80]:
h_vectorizer = HashingVectorizer(n_features = 20)
h_vectorizer.fit(X_train['description'].tolist())
X_c_train = c_vectorizer.transform(X_train['description'])

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [81]:
ada_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
            n_estimators = 20,
            algorithm="SAMME.R",
            learning_rate = 0.5
        )
ada_clf.fit(X_train, y_train.ravel())

AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                         class_weight=None,
                                                         criterion='gini',
                                                         max_depth=1,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         presort='deprecated',
                          

In [82]:
#X_test = c_vectorizer.transform(X_test['description'])

In [83]:
y_predict = ada_clf.predict(X_test)
print("Our model has a ",
      np.round(sum(y_predict == y_test.ravel())/len(y_test)*100,2),
      "% accuracy on the testing set")

Our model has a  85.3 % accuracy on the testing set
