In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from scipy import sparse
from os.path import expanduser
import re
from nltk.stem.porter import PorterStemmer
import nltk

In [2]:
stop_words = [word.strip() for word in open('stop_words.txt').readlines()]

In [3]:
def stemming_tokenizer(str_input):
    porter_stemmer = PorterStemmer()
    words = re.sub(r"[^A-Za-z0-9\-]", " ", str_input).lower().split()
    words = [porter_stemmer.stem(word) for word in words]
    return words

In [4]:
with open('dems.txt', 'r',encoding="utf-8") as file:
    dem_text = [line.strip('\n') for line in file]
with open('gop.txt', 'r',encoding="utf-8") as file:
    gop_text = [line.strip('\n') for line in file]
with open('NonPolitical.txt', 'r',encoding="utf-8") as file:
    nonp_text = [line.strip('\n') for line in file]

In [5]:
dem=np.array(dem_text)
gop=np.array(gop_text)
nonp=np.array(nonp_text)

In [6]:
dem_df = pd.DataFrame({'tweet': dem})
dem_df['label']=0
gop_df = pd.DataFrame({'tweet': gop})
gop_df['label']=1
nonp_df = pd.DataFrame({'tweet': nonp})
nonp_df['label']=2

In [7]:
tweets=[dem_df,gop_df,nonp_df]
tweets_df=pd.concat(tweets,ignore_index=True)

In [9]:
wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    # lower case and remove special characters\whitespaces
    doc=re.sub(r'-',' ',doc).strip()
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, re.I|re.A)
    doc=re.sub(r'#','',doc).strip() #removing #symbol
    doc=re.sub(r'RT[\s]+','',doc).strip()
    doc = re.sub(r'http[a-zA-Z]*', '', doc).strip()
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = wpt.tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

normalize_corpus = np.vectorize(normalize_document)

In [10]:
norm_dem=normalize_corpus(dem_df['tweet'])
norm_gop=normalize_corpus(gop_df['tweet'])
norm_nonp=normalize_corpus(nonp_df['tweet'])

In [19]:
norm_tweets=np.concatenate((norm_dem, norm_gop,norm_nonp), axis=None)

In [21]:
from gensim.models.fasttext import FastText

wpt = nltk.WordPunctTokenizer()
tokenized_corpus = [wpt.tokenize(document) for document in norm_tweets]

# Set values for various parameters
feature_size = 100    # Word vector dimensionality  
window_context = 50          # Context window size                                                                                    
min_word_count = 5   # Minimum word count                        
sample = 1e-3   # Downsample setting for frequent words


ft_model = FastText(tokenized_corpus, size=feature_size, window=window_context, 
                    min_count=min_word_count,sample=sample, sg=1, iter=50)

unable to import 'smart_open.gcs', disabling that module


In [22]:
def average_word_vectors(words, model, vocabulary, num_features):
    
    feature_vector = np.zeros((num_features,),dtype="float64")
    nwords = 0.
    
    for word in words:
        if word in vocabulary: 
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, model[word])
    
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)
        
    return feature_vector
    
   
def averaged_word_vectorizer(corpus, model, num_features):
    vocabulary = set(model.wv.index2word)
    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                    for tokenized_sentence in corpus]
    return np.array(features)


# get document level embeddings
ft_feature_array = averaged_word_vectorizer(corpus=tokenized_corpus, model=ft_model,
                                             num_features=feature_size)
tweet_ft=pd.DataFrame(ft_feature_array)
tweet_ft

  if __name__ == '__main__':


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.131927,0.072587,0.148973,0.179304,-0.034604,0.055214,-0.215612,0.134485,-0.046035,0.016965,...,0.156378,-0.175986,-0.124601,0.011181,0.096156,0.004070,-0.161479,-0.011840,-0.032572,0.280729
1,-0.099689,-0.038240,0.264251,0.122051,-0.050683,-0.116872,-0.250062,-0.112182,-0.007713,0.017609,...,0.060585,0.126893,0.101561,0.145015,0.332763,0.162034,-0.099668,0.137067,0.040090,0.155525
2,-0.206245,-0.008839,0.147725,0.090257,0.105787,0.209830,-0.130037,0.031220,-0.268257,0.124062,...,-0.078372,-0.027156,-0.065762,0.068860,0.110813,0.082470,-0.117253,0.063413,-0.026084,-0.093323
3,-0.014056,-0.129698,0.064914,0.201380,0.015062,0.274785,-0.277500,-0.069255,-0.048947,-0.017775,...,-0.266125,-0.216330,-0.198291,0.158487,0.282376,-0.120763,0.097646,0.085710,0.125607,0.136412
4,-0.075753,0.345024,0.068511,0.397493,-0.078619,0.259986,-0.085486,0.007347,-0.084314,0.069333,...,0.108897,-0.221188,-0.127309,0.008877,0.190784,0.130552,-0.085027,0.038165,0.173829,0.252243
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51259,-0.037639,-0.261345,0.010619,0.086037,0.124329,0.186299,0.104132,0.218581,0.099928,0.060806,...,0.115560,0.040928,-0.168453,-0.158409,-0.192831,0.194649,-0.205201,-0.135646,-0.120890,-0.098033
51260,-0.280390,-0.037974,-0.016714,0.058434,0.267959,0.157820,-0.156949,-0.007220,-0.128691,-0.135120,...,0.056162,-0.128397,-0.120808,0.104969,-0.281571,0.325738,0.090204,-0.356875,-0.151682,0.059654
51261,-0.085223,-0.027967,0.329221,0.177641,0.340441,0.147450,-0.174177,-0.022423,-0.208540,0.067974,...,0.061531,-0.068941,-0.094095,0.293323,0.352616,0.102702,-0.199347,0.018171,0.030810,-0.225234
51262,-0.303152,-0.042140,0.152902,-0.005012,0.231356,-0.016039,0.051919,0.029954,-0.015057,0.041116,...,0.131339,0.110444,0.028329,0.071822,-0.187839,-0.027097,-0.036833,-0.097223,-0.192827,0.071055


In [24]:
# tweet_ft.to_csv("FastText.csv",index=False)

In [25]:
tweetft_df=pd.concat([tweet_ft,tweets_df], axis=1)

In [28]:
from sklearn.model_selection import train_test_split
x=tweetft_df.drop(['tweet','label'],axis=1)
y=tweetft_df['label']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

In [29]:
from sklearn.naive_bayes import BernoulliNB
naive_bayes = BernoulliNB()
model = naive_bayes.fit(x_train, y_train)
y_predictions = model.predict(x_test)
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_predictions)

0.7056803995006242

In [30]:
from sklearn.linear_model import LogisticRegression
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25,random_state = 42) 
log_classifier = LogisticRegression(multi_class='multinomial',solver ='newton-cg')
log_classifier.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=None, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False)

In [31]:
y_pred = log_classifier.predict(X_test)

In [32]:
log_classifier.score(X_test, y_test) 

0.8205368289637952

In [33]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[3920,  806,  113],
       [ 816, 3693,  191],
       [ 186,  188, 2903]])

In [34]:
from sklearn.svm import LinearSVC
model = LinearSVC()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
from sklearn.metrics import confusion_matrix
conf_mat = confusion_matrix(y_test, y_pred)
conf_mat

array([[3931,  783,  125],
       [ 825, 3669,  206],
       [ 183,  184, 2910]])

In [35]:
model.score(X_test, y_test) 

0.8200686641697877