In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from scipy import sparse
from os.path import expanduser
import re
from nltk.stem.porter import PorterStemmer
import nltk

In [2]:
stop_words = [word.strip() for word in open('stop_words.txt').readlines()]

In [3]:
def stemming_tokenizer(str_input):
    porter_stemmer = PorterStemmer()
    words = re.sub(r"[^A-Za-z0-9\-]", " ", str_input).lower().split()
    words = [porter_stemmer.stem(word) for word in words]
    return words

In [4]:
with open('dems.txt', 'r',encoding="utf-8") as file:
    dem_text = [line.strip('\n') for line in file]
with open('gop.txt', 'r',encoding="utf-8") as file:
    gop_text = [line.strip('\n') for line in file]
with open('NonPolitical.txt', 'r',encoding="utf-8") as file:
    nonp_text = [line.strip('\n') for line in file]

In [5]:
dem=np.array(dem_text)
gop=np.array(gop_text)
nonp=np.array(nonp_text)

In [6]:
dem_df = pd.DataFrame({'tweet': dem})
dem_df['label']=0
gop_df = pd.DataFrame({'tweet': gop})
gop_df['label']=1
nonp_df = pd.DataFrame({'tweet': nonp})
nonp_df['label']=2

In [7]:
tweets=[dem_df,gop_df,nonp_df]
tweets_df=pd.concat(tweets,ignore_index=True)

In [8]:
wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    # lower case and remove special characters\whitespaces
    doc=re.sub(r'-',' ',doc).strip()
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, re.I|re.A)
    doc=re.sub(r'#','',doc).strip() #removing #symbol
    doc=re.sub(r'RT[\s]+','',doc).strip()
    doc = re.sub(r'http[a-zA-Z]*', '', doc).strip()
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = wpt.tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

normalize_corpus = np.vectorize(normalize_document)

In [9]:
norm_dem=normalize_corpus(dem_df['tweet'])
norm_gop=normalize_corpus(gop_df['tweet'])
norm_nonp=normalize_corpus(nonp_df['tweet'])

In [10]:
norm_tweets=np.concatenate((norm_dem, norm_gop,norm_nonp), axis=None)

In [13]:
from gensim.models.fasttext import FastText

wpt = nltk.WordPunctTokenizer()
tokenized_corpus = [wpt.tokenize(document) for document in norm_tweets]

# Set values for various parameters
feature_size = 100    # Word vector dimensionality  
window_context = 30          # Context window size                                                                                    
min_word_count = 5   # Minimum word count                        
sample = 1e-3   # Downsample setting for frequent words


ft_model = FastText(tokenized_corpus, size=feature_size, window=window_context, 
                    min_count=min_word_count,sample=sample, sg=1, iter=50)

KeyboardInterrupt: 

In [23]:
def average_word_vectors(words, model, vocabulary, num_features):
    
    feature_vector = np.zeros((num_features,),dtype="float64")
    nwords = 0.
    
    for word in words:
        if word in vocabulary: 
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, model[word])
    
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)
        
    return feature_vector
    

def averaged_word_vectorizer(corpus, model, num_features):
    vocabulary = set(model.wv.index2word)
    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                    for tokenized_sentence in corpus]
    return np.array(features)


# get document level embeddings
ft_feature_array = averaged_word_vectorizer(corpus=tokenized_corpus, model=ft_model,
                                             num_features=feature_size)
tweet_ft=pd.DataFrame(ft_feature_array)
tweet_ft

  if __name__ == '__main__':


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.054465,0.017652,0.057126,0.328237,-0.085430,0.128090,-0.086746,0.079687,0.019102,-0.012940,...,0.035275,-0.162782,-0.121593,-0.067105,0.088611,0.066156,-0.135727,0.007506,0.036671,0.186657
1,-0.108472,-0.185452,0.107285,0.114326,-0.065444,-0.065317,-0.228195,-0.001598,-0.077891,-0.039719,...,0.026372,0.040735,0.007004,0.139331,0.387488,0.096257,-0.070339,-0.179284,0.274889,0.060320
2,-0.017021,-0.194911,-0.071974,0.174593,0.058325,0.328171,-0.209054,0.219576,-0.153309,-0.097044,...,-0.157658,-0.128863,-0.023152,0.032851,0.091259,0.093140,-0.142034,-0.089778,-0.012982,0.055193
3,0.011983,-0.331382,-0.343084,0.271919,0.081006,0.382438,-0.037547,0.123154,-0.181144,0.021704,...,-0.251193,-0.187517,0.031942,0.032144,0.177956,-0.154777,-0.117057,-0.097416,0.136062,-0.119283
4,-0.002666,0.123784,0.194492,0.523104,-0.138276,0.271805,-0.134554,0.010373,0.024799,0.136936,...,-0.045197,-0.227423,-0.236232,-0.178541,0.120102,-0.193745,-0.050619,0.034531,0.211434,0.149790
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51259,-0.135934,-0.147382,-0.028985,0.166438,-0.080925,0.183791,-0.087949,0.391691,0.198972,-0.177645,...,-0.123869,-0.031222,-0.068465,0.014227,-0.032453,0.005247,-0.126470,0.051706,-0.011188,0.050060
51260,0.087971,-0.177188,0.314846,0.219543,0.402228,0.164714,-0.369832,0.347020,-0.193511,0.002096,...,-0.133168,-0.128057,-0.018000,-0.002734,-0.001974,0.280644,0.032183,-0.095871,-0.100626,0.368169
51261,0.150508,-0.241392,-0.081709,0.140822,0.508776,0.590009,-0.199491,-0.006691,-0.281000,-0.033489,...,0.079114,0.114753,0.156202,0.163266,0.277102,-0.017011,-0.196052,-0.064119,-0.009067,0.028108
51262,-0.068539,-0.162230,0.220101,0.049081,0.099600,-0.107287,-0.380536,0.260807,-0.045248,-0.059895,...,-0.130126,-0.040220,-0.242087,0.235974,-0.000643,0.059560,-0.175171,-0.130933,-0.028605,0.019565


In [24]:
# tweet_ft.to_csv("FastText.csv",index=False)

In [25]:
tweetft_df=pd.concat([tweet_ft,tweets_df], axis=1)

In [26]:
from sklearn.model_selection import train_test_split
x=tweetft_df.drop(['tweet','label'],axis=1)
y=tweetft_df['label']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

In [27]:
from sklearn.naive_bayes import BernoulliNB
naive_bayes = BernoulliNB()
model = naive_bayes.fit(x_train, y_train)
y_predictions = model.predict(x_test)
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_predictions)

0.6980337078651685

In [28]:
from sklearn.linear_model import LogisticRegression
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25,random_state = 42) 
log_classifier = LogisticRegression(multi_class='multinomial',solver ='newton-cg')
log_classifier.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=None, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False)

In [29]:
y_pred = log_classifier.predict(X_test)

In [30]:
log_classifier.score(X_test, y_test) 

0.8224875156054932

In [31]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[3887,  809,  143],
       [ 791, 3689,  220],
       [ 129,  183, 2965]])

In [32]:
from sklearn.svm import LinearSVC
model = LinearSVC()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
from sklearn.metrics import confusion_matrix
conf_mat = confusion_matrix(y_test, y_pred)
conf_mat

array([[3874,  795,  170],
       [ 794, 3664,  242],
       [ 123,  176, 2978]])

In [33]:
model.score(X_test, y_test) 

0.8205368289637952

In [35]:
similar_words = {search_term: [item[0] for item in ft_model.wv.most_similar([search_term], topn=5)]
                  for search_term in ['gop', 'dem', 'vote', 'attack', 'administration', 'voters', 'clinton','win','donald trump']}
similar_words

{'gop': ['republican',
  'republicans',
  'senators',
  'goptaxscam',
  'jecrepublicans'],
 'dem': ['dems', 'democrat', 'democratic', 'democrats', 'chair'],
 'vote': ['voting',
  'electionsmatter',
  'ballot',
  'washingtonprimary',
  'registered'],
 'attack': ['attacks', 'disgraced', 'frightening', 'antisemitism', 'waged'],
 'administration': ['administrations',
  'admin',
  'trump',
  'cruelly',
  'president'],
 'voters': ['ballot', 'polls', 'republicans', 'election', 'toxic'],
 'clinton': ['hillary',
  'clintons',
  'hillaryclinton',
  'mishandling',
  'professors'],
 'win': ['chance', 'compete', 'huge', 'geaux', 'winner'],
 'donald trump': ['trump',
  'donald',
  'realdonaldtrump',
  'president',
  'enablers']}