In [32]:
import pandas as pd
import numpy as np
import pickle
from textblob import TextBlob
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression 
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 

## Twitter Data

In [3]:
# Read the raw tweet export into a DataFrame (path is relative to the notebook).
twitter = pd.read_csv(filepath_or_buffer='../Data/twitter.csv')

In [4]:
# Remove tweets that have no text before vectorizing.
# dropna(inplace=True) on the selection twitter['text'] acts on a temporary
# Series and cannot shorten the parent frame (with pandas Copy-on-Write it is a
# no-op), so NaN rows would still reach the vectorizer. Reassigning the frame
# makes the drop explicit and keeps this cell idempotent on re-run.
twitter = twitter.dropna(subset=['text'])

### Count Vectorizing Twitter

In [5]:
# Load the pickled vectorizer (presumably the CountVectorizer fitted during
# training — TODO confirm). The context manager closes the file handle, which
# the bare open() call left dangling.
# NOTE: pickle.load can execute arbitrary code; only load artifacts produced by
# this project.
with open('../Assets/cvec.pkl', 'rb') as cvec_file:
    cvec = pickle.load(cvec_file)

In [6]:
# Transform the tweet text with the loaded vectorizer (sparse matrix result).
twitter_cvec = cvec.transform(twitter['text'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() when available and fall back for older
# versions so the cell runs on either.
cvec_feature_names = (
    cvec.get_feature_names_out()
    if hasattr(cvec, 'get_feature_names_out')
    else cvec.get_feature_names()
)

# Densify into a DataFrame with one column per vocabulary term.
twitter_cvec_df = pd.DataFrame(twitter_cvec.toarray(), columns=cvec_feature_names)

In [7]:
# NOTE(review): the author expected 1690 columns here and observed 17 more than
# that; the cause is not visible in this notebook — TODO confirm against the
# saved column list.
twitter_cvec_df.head(n=5)

Unnamed: 0,aba,aba woman,abandoned,abc,abc news,ablaze,able,absolutely,accident,according,...,young,youth,youth saved,youtube,youtube playlist,youtube video,yr,yr old,yyc,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Selecting the Saved Feature Columns

In [8]:
# Load the saved column list and restrict the count-vectorized frame to those
# columns (presumably the columns the model was trained on — TODO confirm).
# The context manager closes the file handle that the bare open() call leaked.
# NOTE: pickle.load can execute arbitrary code; only load artifacts produced by
# this project.
with open('../Assets/words_df_columns.pkl', 'rb') as columns_file:
    pickle_columns = pickle.load(columns_file)

twitter_cvec_df = twitter_cvec_df[pickle_columns]

### TFIDF Vectorizing Twitter

In [9]:
# Load the pickled TF-IDF vectorizer (presumably fitted during training —
# TODO confirm). The context manager closes the file handle, which the bare
# open() call left dangling.
# NOTE: pickle.load can execute arbitrary code; only load artifacts produced by
# this project.
with open('../Assets/tfidf.pkl', 'rb') as tfidf_file:
    tfidf = pickle.load(tfidf_file)

In [10]:
# Transform the tweet text with the loaded TF-IDF vectorizer (sparse matrix result).
twitter_tfidf = tfidf.transform(twitter['text'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() when available and fall back for older
# versions so the cell runs on either.
tfidf_feature_names = (
    tfidf.get_feature_names_out()
    if hasattr(tfidf, 'get_feature_names_out')
    else tfidf.get_feature_names()
)

# Densify into a DataFrame with one column per vocabulary term.
twitter_tfidf_df = pd.DataFrame(twitter_tfidf.toarray(), columns=tfidf_feature_names)

### Modeling

In [11]:
# Load the pickled classifier (presumably a fitted MultinomialNB, judging by the
# file name — TODO confirm). The context manager closes the file handle, which
# the bare open() call left dangling.
# NOTE: pickle.load can execute arbitrary code; only load artifacts produced by
# this project.
with open('../Assets/multi_bayes.sav', 'rb') as model_file:
    mnb = pickle.load(model_file)

In [12]:
# Sanity check: (rows, columns) of the count-vectorized frame before predicting.
twitter_cvec_df.shape

(1655, 1707)

In [13]:
# Predict a class for every row of the count-vectorized tweets with the loaded model.
cvec_preds = mnb.predict(X=twitter_cvec_df)

In [18]:
# Pair every tweet that has text with its predicted label.
# Built as a single chain so no column is assigned onto a column-subset of
# another frame (the pattern that triggers pandas' SettingWithCopyWarning);
# each step returns a new frame, so no explicit .copy() is needed.
prediction_labels = {0: 'Not Reliable', 1: 'Reliable'}

twitter_preds = (
    twitter.dropna(subset=['text'])  # keep only tweets that have text
    .loc[:, ['text']]
    # assumes cvec_preds is an array with one entry per remaining row; it is
    # attached positionally, not by index
    .assign(prediction=cvec_preds)
    # translate the numeric class into a readable label
    .assign(prediction=lambda preds: preds['prediction'].map(prediction_labels))
)

In [24]:
# Display each tweet's text alongside its mapped prediction label.
twitter_preds

Unnamed: 0,text,prediction
0,Imagine being on a cruise ship that’s in the m...,Not Reliable
1,Este sitio web permite rastrear la propagación...,Reliable
2,-le tengo más miedo al tiro bajo que al corona...,Not Reliable
3,#CORONAVIRUS https://www.theguardian.com/world...,Reliable
4,#coronavirus #quarantined #ChinaCoronaVirus #C...,Not Reliable
...,...,...
1655,Maybe #coronavirus??? disease?,Not Reliable
1656,Cruise Giant Carnival Works to Manage Deepenin...,Reliable
1657,This corona virus must b like measles where it...,Not Reliable
1658,Cruise Giant Carnival Works to Manage Deepenin...,Reliable
