In [44]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from scipy import sparse
from os.path import expanduser
import re
from nltk.stem.porter import PorterStemmer
import nltk

In [45]:
stop_words = [word.strip() for word in open('stop_words.txt').readlines()]

In [46]:
def stemming_tokenizer(str_input):
    porter_stemmer = PorterStemmer()
    words = re.sub(r"[^A-Za-z0-9\-]", " ", str_input).lower().split()
    words = [porter_stemmer.stem(word) for word in words]
    return words

In [47]:
with open('dems.txt', 'r',encoding="utf-8") as file:
    dem_text = [line.strip('\n') for line in file]
with open('gop.txt', 'r',encoding="utf-8") as file:
    gop_text = [line.strip('\n') for line in file]
with open('NonPolitical.txt', 'r',encoding="utf-8") as file:
    nonp_text = [line.strip('\n') for line in file]

In [48]:
dem=np.array(dem_text)
gop=np.array(gop_text)
nonp=np.array(nonp_text)

In [49]:
dem_df = pd.DataFrame({'tweet': dem})
dem_df['label']=0
gop_df = pd.DataFrame({'tweet': gop})
gop_df['label']=1
nonp_df = pd.DataFrame({'tweet': nonp})
nonp_df['label']=2

In [50]:
tweets=[dem_df,gop_df,nonp_df]
tweets_df=pd.concat(tweets,ignore_index=True)

In [51]:
wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    # lower case and remove special characters\whitespaces
    doc=re.sub(r'-',' ',doc).strip()
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, re.I|re.A)
    doc=re.sub(r'#','',doc).strip() #removing #symbol
    doc=re.sub(r'RT[\s]+','',doc).strip()
    doc = re.sub(r'http[a-zA-Z]*', '', doc).strip()
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = wpt.tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

normalize_corpus = np.vectorize(normalize_document)

In [52]:
norm_dem=normalize_corpus(dem_df['tweet'])
norm_gop=normalize_corpus(gop_df['tweet'])
norm_nonp=normalize_corpus(nonp_df['tweet'])

In [53]:
norm_tweets=np.concatenate((norm_dem, norm_gop,norm_nonp), axis=None)

In [54]:
from gensim.models import word2vec

# tokenize sentences in corpus
wpt = nltk.WordPunctTokenizer()
tokenized_corpus = [wpt.tokenize(document) for document in norm_tweets]

# Set values for various parameters
feature_size = 100    # Word vector dimensionality  
window_context = 30          # Context window size                                                                                    
min_word_count = 1   # Minimum word count                        
sample = 1e-3   # Downsample setting for frequent words

w2v_model = word2vec.Word2Vec(tokenized_corpus, size=feature_size, 
                          window=window_context, min_count=min_word_count,
                          sample=sample, iter=50)

In [55]:
def average_word_vectors(words, model, vocabulary, num_features):
    
    feature_vector = np.zeros((num_features,),dtype="float64")
    nwords = 0.
    
    for word in words:
        if word in vocabulary: 
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, model[word])
    
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)
        
    return feature_vector
    
   
def averaged_word_vectorizer(corpus, model, num_features):
    vocabulary = set(model.wv.index2word)
    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                    for tokenized_sentence in corpus]
    return np.array(features)


# get document level embeddings
w2v_feature_array = averaged_word_vectorizer(corpus=tokenized_corpus, model=w2v_model,
                                             num_features=feature_size)
tweet_w2c=pd.DataFrame(w2v_feature_array)
tweet_w2c

  if __name__ == '__main__':


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.623068,0.146551,1.229251,1.251837,0.409443,0.894555,-1.199981,-0.364912,0.151711,0.145984,...,0.124447,-0.123198,-1.012831,0.100166,-0.052910,-0.252355,-0.955425,0.119040,-0.453537,-0.515122
1,-0.103010,-0.501528,0.322952,0.373686,0.524534,0.221246,-1.325071,-1.565196,-1.164852,0.436400,...,-0.289928,0.037972,-1.103589,-2.356120,1.459980,1.663123,0.033925,-0.056120,0.144457,-2.819482
2,-0.907772,0.068927,-0.601583,-0.504977,1.455380,-0.660430,-0.488723,0.062725,0.549753,-0.325872,...,0.724763,0.735749,-0.724880,-0.405693,0.746892,-0.523164,-1.124800,-0.153841,0.086592,-0.543952
3,-0.768057,-1.565644,0.586879,1.118854,1.048649,-1.426532,0.848696,-0.248470,-1.019311,0.766040,...,0.988525,-0.076406,0.911665,-1.336391,-0.289586,-0.077546,-0.890714,-1.463279,-1.065610,-1.757756
4,-0.063766,-1.428503,0.106107,1.333769,-0.197847,0.250676,-0.422370,-0.868437,0.181162,0.174691,...,0.615044,-0.407492,-0.706851,-1.284176,1.855251,-0.994677,-0.715710,0.045751,-1.571971,-1.571614
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51259,0.364222,-0.544441,-0.597272,-0.031771,0.033734,-0.620527,-0.113756,0.366899,-1.430731,0.014100,...,0.464129,-0.418852,-0.097145,1.201959,-0.656091,-0.108371,0.592180,1.446526,1.392030,1.149763
51260,-0.147590,-0.830328,-1.581300,0.064772,-0.442637,0.003115,-0.088177,-0.936537,-0.803143,1.187103,...,-1.070562,-0.300543,-1.019120,-0.142281,0.026141,0.244655,-0.484407,-0.483130,1.408323,-0.542081
51261,1.559632,0.061646,1.816210,0.433380,1.145361,-0.310361,0.206753,-0.628503,-1.634803,0.886144,...,1.483441,1.072332,-0.043770,-0.725508,0.974429,0.802671,-0.367423,0.034357,-0.199419,0.211125
51262,0.415252,-0.550736,-1.139472,0.052513,0.009047,-0.704693,-0.561117,-1.250706,-0.694180,0.245998,...,-0.258692,-0.488286,0.363064,0.077520,0.125586,-0.246778,0.088027,0.411652,1.572580,0.287683


In [56]:
# tweet_w2c.to_csv("word2vector.csv",index=False)

In [57]:
tweetw2c_df=pd.concat([tweet_w2c,tweets_df], axis=1)

In [58]:
tweetw2c_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,92,93,94,95,96,97,98,99,tweet,label
0,0.623068,0.146551,1.229251,1.251837,0.409443,0.894555,-1.199981,-0.364912,0.151711,0.145984,...,-1.012831,0.100166,-0.052910,-0.252355,-0.955425,0.119040,-0.453537,-0.515122,This week @senatemajldr said workers don’t nee...,0
1,-0.103010,-0.501528,0.322952,0.373686,0.524534,0.221246,-1.325071,-1.565196,-1.164852,0.436400,...,-1.103589,-2.356120,1.459980,1.663123,0.033925,-0.056120,0.144457,-2.819482,Health care professionals are on the front lin...,0
2,-0.907772,0.068927,-0.601583,-0.504977,1.455380,-0.660430,-0.488723,0.062725,0.549753,-0.325872,...,-0.724880,-0.405693,0.746892,-0.523164,-1.124800,-0.153841,0.086592,-0.543952,RT @SeemaNanda: Good to see @Google signal a c...,0
3,-0.768057,-1.565644,0.586879,1.118854,1.048649,-1.426532,0.848696,-0.248470,-1.019311,0.766040,...,0.911665,-1.336391,-0.289586,-0.077546,-0.890714,-1.463279,-1.065610,-1.757756,Republicans keep admitting that voter suppress...,0
4,-0.063766,-1.428503,0.106107,1.333769,-0.197847,0.250676,-0.422370,-0.868437,0.181162,0.174691,...,-0.706851,-1.284176,1.855251,-0.994677,-0.715710,0.045751,-1.571971,-1.571614,RT @SpeakerPelosi: The Congress has so far pas...,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51259,0.364222,-0.544441,-0.597272,-0.031771,0.033734,-0.620527,-0.113756,0.366899,-1.430731,0.014100,...,-0.097145,1.201959,-0.656091,-0.108371,0.592180,1.446526,1.392030,1.149763,RT @RecordingAcad: Who is nominated in the Gen...,2
51260,-0.147590,-0.830328,-1.581300,0.064772,-0.442637,0.003115,-0.088177,-0.936537,-0.803143,1.187103,...,-1.019120,-0.142281,0.026141,0.244655,-0.484407,-0.483130,1.408323,-0.542081,RT @WSJ: Instagram users can now turn off comm...,2
51261,1.559632,0.061646,1.816210,0.433380,1.145361,-0.310361,0.206753,-0.628503,-1.634803,0.886144,...,-0.043770,-0.725508,0.974429,0.802671,-0.367423,0.034357,-0.199419,0.211125,.@valiswiser is on a mission to help people ov...,2
51262,0.415252,-0.550736,-1.139472,0.052513,0.009047,-0.704693,-0.561117,-1.250706,-0.694180,0.245998,...,0.363064,0.077520,0.125586,-0.246778,0.088027,0.411652,1.572580,0.287683,RT @TechCrunch: Instagram fights abuse with co...,2


In [59]:
x=tweetw2c_df.drop(['tweet','label'],axis=1)

In [60]:
y=tweetw2c_df['label']

In [61]:
from sklearn.naive_bayes import BernoulliNB

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

In [62]:
naive_bayes = BernoulliNB()
model = naive_bayes.fit(x_train, y_train)

In [63]:
y_predictions = model.predict(x_test)

In [64]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_predictions)

0.732521847690387

In [65]:
from sklearn.linear_model import LogisticRegression
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25,random_state = 42) 
log_classifier = LogisticRegression(multi_class='multinomial',solver ='newton-cg')
log_classifier.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=None, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False)

In [66]:
y_pred = log_classifier.predict(X_test)

In [67]:
log_classifier.score(X_test, y_test) 

0.8238920099875156

In [68]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[3901,  750,  188],
       [ 771, 3696,  233],
       [ 132,  183, 2962]])

In [69]:
from sklearn.svm import LinearSVC
model = LinearSVC()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
from sklearn.metrics import confusion_matrix
conf_mat = confusion_matrix(y_test, y_pred)
conf_mat



array([[3898,  754,  187],
       [ 771, 3679,  250],
       [ 144,  171, 2962]])

In [70]:
model.score(X_test, y_test) 

0.8223314606741573