In [26]:
import numpy as np
import pandas as pd
import json
import sklearn
from sklearn.preprocessing import LabelEncoder 
from pandas.io.json import json_normalize
import seaborn as sns
sns.set_style("whitegrid")
from pandas.plotting import scatter_matrix
from sklearn.model_selection import train_test_split

### Useful Sections:
Regression Notebook 3 "Why add Qualitative Data to Regression"

Regression Notebook 3 "How to Choose the Best Model" - k-fold cross validation

Unsupervised Learning Notebook 1 - k-Means clustering

# Cleaning

In [2]:
bot_labels = pd.read_csv('data/botwiki-2019.tsv', names = ['id', 'identity'], sep = '\t')
human_labels = pd.read_csv('data/verified-2019.tsv', names = ['id', 'identity'], sep = '\t')
bot_labels['bot'] = 1
human_labels['bot'] = 0
#IMPROVE?
# First set all beers as 0
#beers_train['Stout'] = 0
# Then locate all the stouts and set Stout to 1
#beers_train.loc[beers_train.Beer_Type == "Stout",'Stout'] = 1

In [3]:
#if IOUP data rate exceeded: jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10

#convert json to DataFrame
file = open('data/botwiki-2019_tweets.json')
file1 = open('data/verified-2019_tweets.json')
jsn = json.load(file)
jsn1 = json.load(file1)
bot_accts = pd.DataFrame(pd.json_normalize(jsn))
human_accts = pd.DataFrame(pd.json_normalize(jsn1))

In [4]:
#merge users and labels
bot_accts = bot_accts.merge(bot_labels, left_on=bot_accts['user.id'], right_on=bot_labels['id'])
human_accts = human_accts.merge(human_labels, left_on=human_accts['user.id'], right_on=human_labels['id'])
accts = bot_accts.append(human_accts)
#accts.to_csv('bot_wiki_verified_19.csv', index = False)

In [5]:
#filters accts to only numerical variables for simple knn

accts_knn = accts.filter(items = ['user.id', 'user.followers_count', 'user.friends_count', 'user.listed_count', 'user.favourites_count', 
                    'bot'])

accts_knn.tail()

Unnamed: 0,user.id,user.followers_count,user.friends_count,user.listed_count,user.favourites_count,bot
1982,76235237,3020,790,18,1707,0
1983,2159329092,4074,4239,163,4777,0
1984,25527618,17014,2610,608,7765,0
1985,43654274,367516,20245,5470,10282,0
1986,19084210,55512,11725,1366,2429,0


# Loading Data

In [18]:
accts = pd.read_csv('data/Cumulative_raw_data.csv')

In [19]:
pd.set_option('display.max_colwidth', None) # prevents descriptions from being cut off

In [20]:
accts['description'] = accts['description'].values.astype('U') #formating as unicode for Tfidf

In [21]:
accts = accts[:10000] #shortening for exploratory data work to make easier on computer
print(len(accts))

10000


In [22]:
#One hot encoding bot
accts['bot'] = 0
accts.loc[accts.species == "bot",'bot'] = 1
accts.head()

Unnamed: 0.1,Unnamed: 0,follow_request_sent,has_extended_profile,profile_use_background_image,default_profile_image,profile_background_image_url_https,verified,translator_type,profile_text_color,profile_image_url_https,...,created_at,contributors_enabled,time_zone,protected,default_profile,is_translator,species,source,withheld_in_countries,bot
0,602249341,False,False,False,False,https://abs.twimg.com/images/themes/theme4/bg.gif,False,none,0,https://pbs.twimg.com/profile_images/923924342974578688/k5RCrlSQ_normal.jpg,...,Thu Jun 07 22:16:27 +0000 2012,False,London,False,False,False,human,botometer-feedback-2019,,0
1,797617218511060992,False,True,False,False,https://abs.twimg.com/images/themes/theme1/bg.png,False,none,0,https://pbs.twimg.com/profile_images/855244571697061888/YDpSuAXy_normal.jpg,...,Sun Nov 13 01:48:58 +0000 2016,False,Pacific Time (US & Canada),False,False,False,bot,botometer-feedback-2019,,1
2,889925474,False,False,True,False,https://pbs.twimg.com/profile_background_images/723350044/881da62975b5e0dc69fc5903a094c9df.jpeg,False,none,333333,https://pbs.twimg.com/profile_images/964079832295288832/WSnP8xQ-_normal.jpg,...,Thu Oct 18 23:19:38 +0000 2012,False,,False,False,False,human,botometer-feedback-2019,,0
3,96435556,False,True,True,False,https://abs.twimg.com/images/themes/theme6/bg.gif,False,none,333333,https://pbs.twimg.com/profile_images/3114299697/f73cd7211520f13e8121d4ccd5db2481_normal.jpeg,...,Sat Dec 12 22:53:04 +0000 2009,False,Rome,False,False,False,bot,botometer-feedback-2019,,1
4,16905397,False,False,True,False,https://pbs.twimg.com/profile_background_images/3248110/DSCF4726.jpg,False,regular,666666,https://pbs.twimg.com/profile_images/969705141644410880/MJasCoOk_normal.jpg,...,Wed Oct 22 13:43:42 +0000 2008,False,Bern,False,False,False,human,botometer-feedback-2019,,0


# Transmuting Text to Numbers

In [37]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

In [12]:
text = open('sample.txt', 'w', encoding = 'utf-8')

In [234]:
vectorizer = CountVectorizer()
for desc in accts['description']:
       text.write(desc.strip().replace('\n+', ''))
text.close

<function TextIOWrapper.close()>

In [235]:
text = open('sample.txt', 'r', encoding = 'utf-8')

In [236]:
vectorizer.fit(text)
#print(vectorizer.vocabulary_)


CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [240]:
#for i in range(len(accts['description'])):
#    text = accts['description'][i]
#    accts['user.description'][i] = vectorizer.transform(text)
#accts['desc_vectorized'] = vectorizer.transform(accts['description'])
#accts['desc_vectorized'] = accts['desc_vectorized'].toarray()

### TfidfVectorizer

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [28]:
#accts_copy = accts.copy()
#accts_train = accts_copy.sample(frac = 0.75, random_state = 1400)
#accts_test = accts_copy.drop(accts_train.index)

In [29]:
y = accts[['bot']].to_numpy()
X = accts[['description']]
X_train, X_test, y_train, y_test = train_test_split(
     X, y, test_size=0.25, random_state=1400)
accts_tfidf_train = accts_knn.sample(frac = 0.75,  random_state = 876)

In [30]:
vectorizer = TfidfVectorizer()

In [31]:
#print(accts['user.description'])
vectorizer.fit(X_train['description'].tolist())

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [32]:
#print(vectorizer.vocabulary_)

In [33]:
print(vectorizer.idf_)

[8.31335371 7.84335008 9.22964444 ... 9.22964444 9.22964444 9.22964444]


In [34]:
X_train = vectorizer.transform(X_train['description'])
#vector = vector.toarray()
#print(vector.reshape(-1,1))

In [39]:
knn = KNeighborsClassifier(n_neighbors = 10)
knn.fit(X_train, y_train.ravel())

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=10, p=2,
                     weights='uniform')

In [40]:
X_test = vectorizer.transform(X_test['description'])
y_predict = knn.predict(X_test)
print("Our model has a ",
      np.round(sum(y_predict == y_test.ravel())/len(y_test)*100,2),
      "% accuracy on the testing set")

Our model has a  77.0 % accuracy on the testing set


### Hashing

In [488]:
from sklearn.feature_extraction.text import HashingVectorizer

In [489]:
y = accts[['bot']].to_numpy()
X = accts[['description']]
X_train, X_test, y_train, y_test = train_test_split(
     X, y, test_size=0.25, random_state=1400)
accts_tfidf_train = accts_knn.sample(frac = 0.75,  random_state = 876)

In [490]:
h_vectorizer = HashingVectorizer(n_features = 30)

In [491]:
X_train = h_vectorizer.transform(X_train['description'])

In [492]:
knn = KNeighborsClassifier(n_neighbors = 9)
knn.fit(X_train, y_train.ravel())

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=9, p=2,
                     weights='uniform')

In [493]:
X_test = h_vectorizer.transform(X_test['description'])
y_predict = knn.predict(X_test)
print("Our model has a ",
      np.round(sum(y_predict == y_test.ravel())/len(y_test)*100,2),
      "% accuracy on the testing set")

Our model has a  77.24 % accuracy on the testing set


### CountVectorizer

In [350]:
from sklearn.feature_extraction.text import CountVectorizer

In [351]:
y = accts[['bot']].to_numpy()
X = accts[['description']]
X_train, X_test, y_train, y_test = train_test_split(
     X, y, test_size=0.25, random_state=1400)
accts_tfidf_train = accts_knn.sample(frac = 0.75,  random_state = 876)

In [352]:
c_vectorizer = CountVectorizer()

In [353]:
c_vectorizer.fit(X_train['description'].tolist())

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [354]:
X_train = c_vectorizer.transform(X_train['description'])

In [355]:
knn = KNeighborsClassifier(n_neighbors = 10)
knn.fit(X_train, y_train.ravel())

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=10, p=2,
                     weights='uniform')

In [356]:
X_test = c_vectorizer.transform(X_test['description'])
y_predict = knn.predict(X_test)
print("Our model has a ",
      np.round(sum(y_predict == y_test.ravel())/len(y_test)*100,2),
      "% accuracy on the testing set")

Our model has a  80.44 % accuracy on the testing set


### Combining TfidfVectorization, Hashing, CountVectorization

In [441]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

In [442]:
y = accts[['bot']].to_numpy()
X = accts[['description']]
X_train, X_test, y_train, y_test = train_test_split(
     X, y, test_size=0.25, random_state=1400)
accts_tfidf_train = accts_knn.sample(frac = 0.75,  random_state = 876)

In [443]:
t_vectorizer = TfidfVectorizer()
h_vectorizer = HashingVectorizer(n_features = 20)
c_vectorizer = CountVectorizer()

In [444]:
t_vectorizer.fit(X_train['description'].tolist())
c_vectorizer.fit(X_train['description'].tolist())

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [445]:
X_t_train = t_vectorizer.transform(X_train['description'])
X_h_train = h_vectorizer.transform(X_train['description'])
X_c_train = c_vectorizer.transform(X_train['description'])
#X_train = pd.concat([X_t_train, X_h_train, X_c_train], axis=1)
#X_train.head()
#X_t_train = pd.DataFrame(X_t_train.todense(), columns=t_vectorizer.get_feature_names())
#X_train_full = zip(X_t_train, X_h_train, X_c_train)
#print(X_train)
#test = pd.DataFrame.sparse.from_spmatrix(X_t_train)

In [446]:
knn_t = KNeighborsClassifier(n_neighbors = 11)
knn_t.fit(X_t_train, y_train.ravel())

knn_h = KNeighborsClassifier(n_neighbors = 11)
knn_h.fit(X_h_train, y_train.ravel())

knn_c = KNeighborsClassifier(n_neighbors = 11)
knn_c.fit(X_c_train, y_train.ravel())

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=11, p=2,
                     weights='uniform')

In [447]:
X_t_test = t_vectorizer.transform(X_test['description'])
X_h_test = h_vectorizer.transform(X_test['description'])
X_c_test = c_vectorizer.transform(X_test['description'])

In [456]:
y_t_predict = knn_t.predict(X_t_test)
y_h_predict = knn_h.predict(X_h_test)
y_c_predict = knn_c.predict(X_c_test)

y_predict = np.add(y_t_predict, y_h_predict, y_c_predict)
for i in range(len(y_predict)):
    if y_predict[i] > 0:
        y_predict[i] = 1
    else:
        y_predict[i] = 0
print("Our model has a ",
      np.round(sum(y_predict == y_test.ravel())/len(y_test)*100,2),
      "% accuracy on the testing set")

Our model has a  75.85 % accuracy on the testing set


# K Nearest Neighbors

In [38]:
#K-fold cross validation
from sklearn.model_selection import KFold
from sklearn.base import clone

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [223]:
#Creating test train split for knn
X = accts[['followers_count', 'friends_count', 'listed_count', 'favourites_count', ]].to_numpy()
y = accts[['bot']].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(
     X, y, test_size=0.25, random_state=1400)
accts_knn_train = accts_knn.sample(frac = 0.75,  random_state = 876)

In [224]:
knn = KNeighborsClassifier(n_neighbors = 10)

In [225]:
knn.fit(X_train, y_train.ravel())

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=10, p=2,
                     weights='uniform')

In [226]:
y_predict = knn.predict(X_test)

In [227]:
print("Our model has a ",
      np.round(sum(y_predict == y_test.ravel())/len(y_test)*100,2),
      "% accuracy on the testing set")

Our model has a  88.01 % accuracy on the testing set


# K-means Clustering

In [27]:
from sklearn.cluster import KMeans

In [28]:
kmeans = KMeans(2)

In [29]:
kmeans.fit(accts_train)

ValueError: could not convert string to float: 'bot'

In [None]:
#Test Train Split
#accts_copy = accts.copy()
#accts_train = accts_copy.sample(frac = 0.75, random_state = 1400)
#accts_test = accts_copy.drop(accts_train.index)