In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from scipy import sparse
from os.path import expanduser
import re
from nltk.stem.porter import PorterStemmer
import nltk

In [2]:
stop_words = [word.strip() for word in open('stop_words.txt').readlines()]

In [3]:
def stemming_tokenizer(str_input):
    porter_stemmer = PorterStemmer()
    words = re.sub(r"[^A-Za-z0-9\-]", " ", str_input).lower().split()
    words = [porter_stemmer.stem(word) for word in words]
    return words

In [4]:
with open('dems.txt', 'r',encoding="utf-8") as file:
    dem_text = [line.strip('\n') for line in file]
with open('gop.txt', 'r',encoding="utf-8") as file:
    gop_text = [line.strip('\n') for line in file]
with open('NonPolitical.txt', 'r',encoding="utf-8") as file:
    nonp_text = [line.strip('\n') for line in file]

In [5]:
dem=np.array(dem_text)
gop=np.array(gop_text)
nonp=np.array(nonp_text)

In [6]:
dem_df = pd.DataFrame({'tweet': dem})
dem_df['label']=0
gop_df = pd.DataFrame({'tweet': gop})
gop_df['label']=1
nonp_df = pd.DataFrame({'tweet': nonp})
nonp_df['label']=2

In [7]:
tweets=[dem_df,gop_df,nonp_df]
tweets_df=pd.concat(tweets,ignore_index=True)

In [8]:
wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    # lower case and remove special characters\whitespaces
    doc=re.sub(r'-',' ',doc).strip()
    doc=re.sub(r'#','',doc).strip() #removing #symbol
    #doc=re.sub(r'#\S+','',doc).strip() #removing #symbol
    doc=re.sub(r'RT[\s]+','',doc).strip()
    doc = re.sub(r'http[a-zA-Z]*\S+', '', doc).strip()
    doc=re.sub(r'@[A-Za-z0-9]+','',doc).strip() #remove mentions
    doc=re.sub(r'[?|$|.|!|;|:|&|"|,|""|*|-|(|)_]','',doc).strip()
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = wpt.tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

normalize_corpus = np.vectorize(normalize_document)

In [9]:
doc="Are you registered to vote in the August 7th primary? Check to make sure you're registered and your address is up to date! Click here: https://t.co/gvmxl7JOqd. #RegisterToVote #ElectionsMatter https://t.co/yZVOSYEsCj"
doc=re.sub(r'-',' ',doc).strip()
doc=re.sub(r'#\S+','',doc).strip() #removing #symbol
doc=re.sub(r'RT[\s]+','',doc).strip()
doc = re.sub(r'http[a-zA-Z]*\S+', '', doc).strip()
doc=re.sub(r'@[A-Za-z0-9]+','',doc).strip() #remove mentions
doc=re.sub(r'RT[\s]+','',doc).strip()
doc = re.sub(r'[^a-zA-Z\s]', '', doc, re.I|re.A)
doc=re.sub(r'[?|$|.|!|;|:|&|"|,|""|*|-|(|)_]','',doc).strip()
doc = doc.lower()
doc = doc.strip()
doc
    

'are you registered to vote in the august th primary check to make sure youre registered and your address is up to date click here'

In [10]:
norm_dem=normalize_corpus(dem_df['tweet'])
norm_gop=normalize_corpus(gop_df['tweet'])
norm_nonp=normalize_corpus(nonp_df['tweet'])

In [11]:
norm_tweets=np.concatenate((norm_dem, norm_gop,norm_nonp), axis=None)

In [12]:
from gensim.models.fasttext import FastText

wpt = nltk.WordPunctTokenizer()
tokenized_corpus = [wpt.tokenize(document) for document in norm_tweets]

# Set values for various parameters
feature_size = 100    # Word vector dimensionality  
window_context = 30          # Context window size                                                                                    
min_word_count = 5   # Minimum word count                        
sample = 1e-3   # Downsample setting for frequent words


ft_model = FastText(tokenized_corpus, size=feature_size, window=window_context, 
                    min_count=min_word_count,sample=sample, sg=1, iter=50)

In [13]:
def average_word_vectors(words, model, vocabulary, num_features):
    
    feature_vector = np.zeros((num_features,),dtype="float64")
    nwords = 0.
    
    for word in words:
        if word in vocabulary: 
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, model[word])
    
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)
        
    return feature_vector
    

def averaged_word_vectorizer(corpus, model, num_features):
    vocabulary = set(model.wv.index2word)
    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                    for tokenized_sentence in corpus]
    return np.array(features)


# get document level embeddings
ft_feature_array = averaged_word_vectorizer(corpus=tokenized_corpus, model=ft_model,
                                             num_features=feature_size)
tweet_ft=pd.DataFrame(ft_feature_array)
tweet_ft

  if __name__ == '__main__':


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.111033,-0.040851,0.043113,-0.111244,-0.128277,0.033710,0.336156,-0.197071,0.038905,-0.101501,...,0.153220,-0.277742,-0.070176,0.090182,0.087145,-0.011979,-0.118347,-0.049777,-0.045038,0.070733
1,-0.180061,0.014814,0.036705,-0.359167,-0.230676,0.030014,0.247105,-0.146752,0.216178,0.053780,...,0.249677,-0.100991,0.004420,0.119193,-0.097565,-0.108999,0.057919,0.092104,0.159954,0.146054
2,0.035550,0.051027,0.042041,-0.019618,-0.090525,0.010587,0.264320,-0.044597,0.035951,0.105611,...,0.033048,-0.173824,-0.177278,0.089884,0.142849,0.003414,-0.006583,-0.141437,-0.008733,0.159602
3,0.170778,0.154098,0.003786,-0.100290,0.053653,0.130752,0.264378,-0.423741,-0.006912,-0.117160,...,0.177013,0.004099,-0.232066,-0.018772,0.143109,-0.088286,0.182734,0.268858,0.098365,0.176847
4,0.072956,-0.130327,0.087728,0.005564,-0.196272,0.014259,0.303873,-0.228379,0.270803,-0.090300,...,0.114209,-0.019538,-0.127343,0.057023,0.001884,-0.006214,0.115848,0.045873,-0.046138,0.165398
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51259,0.124993,-0.297604,-0.035642,-0.340041,-0.072041,0.008739,0.349559,0.032762,0.141073,0.003069,...,0.211428,-0.137917,-0.009618,0.134221,0.231884,0.126744,-0.042697,0.107099,-0.038124,0.204685
51260,0.118476,-0.144198,0.177618,-0.362290,0.060397,-0.059293,0.129206,-0.050758,0.272182,0.108726,...,0.206806,-0.046747,0.169915,-0.015274,0.051808,0.244087,0.245285,-0.158244,-0.117737,0.122326
51261,0.100345,0.378787,0.094219,-0.434943,0.033277,0.315459,0.210491,-0.313004,-0.046896,0.269839,...,0.125464,-0.276510,-0.110322,0.036250,-0.068362,-0.297122,0.012625,-0.059896,-0.034177,0.022843
51262,0.079366,0.032468,0.056638,-0.402173,0.097175,0.250982,0.302567,-0.029213,0.144977,-0.032277,...,0.169373,-0.188934,0.011305,0.143926,0.131420,0.249654,-0.030264,-0.007378,0.075654,-0.083187


In [14]:
# tweet_ft.to_csv("FastText.csv",index=False)

In [15]:
tweetft_df=pd.concat([tweet_ft,tweets_df], axis=1)

In [16]:
tweetft_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,92,93,94,95,96,97,98,99,tweet,label
0,-0.111033,-0.040851,0.043113,-0.111244,-0.128277,0.033710,0.336156,-0.197071,0.038905,-0.101501,...,-0.070176,0.090182,0.087145,-0.011979,-0.118347,-0.049777,-0.045038,0.070733,This week @senatemajldr said workers don’t nee...,0
1,-0.180061,0.014814,0.036705,-0.359167,-0.230676,0.030014,0.247105,-0.146752,0.216178,0.053780,...,0.004420,0.119193,-0.097565,-0.108999,0.057919,0.092104,0.159954,0.146054,Health care professionals are on the front lin...,0
2,0.035550,0.051027,0.042041,-0.019618,-0.090525,0.010587,0.264320,-0.044597,0.035951,0.105611,...,-0.177278,0.089884,0.142849,0.003414,-0.006583,-0.141437,-0.008733,0.159602,RT @SeemaNanda: Good to see @Google signal a c...,0
3,0.170778,0.154098,0.003786,-0.100290,0.053653,0.130752,0.264378,-0.423741,-0.006912,-0.117160,...,-0.232066,-0.018772,0.143109,-0.088286,0.182734,0.268858,0.098365,0.176847,Republicans keep admitting that voter suppress...,0
4,0.072956,-0.130327,0.087728,0.005564,-0.196272,0.014259,0.303873,-0.228379,0.270803,-0.090300,...,-0.127343,0.057023,0.001884,-0.006214,0.115848,0.045873,-0.046138,0.165398,RT @SpeakerPelosi: The Congress has so far pas...,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51259,0.124993,-0.297604,-0.035642,-0.340041,-0.072041,0.008739,0.349559,0.032762,0.141073,0.003069,...,-0.009618,0.134221,0.231884,0.126744,-0.042697,0.107099,-0.038124,0.204685,RT @RecordingAcad: Who is nominated in the Gen...,2
51260,0.118476,-0.144198,0.177618,-0.362290,0.060397,-0.059293,0.129206,-0.050758,0.272182,0.108726,...,0.169915,-0.015274,0.051808,0.244087,0.245285,-0.158244,-0.117737,0.122326,RT @WSJ: Instagram users can now turn off comm...,2
51261,0.100345,0.378787,0.094219,-0.434943,0.033277,0.315459,0.210491,-0.313004,-0.046896,0.269839,...,-0.110322,0.036250,-0.068362,-0.297122,0.012625,-0.059896,-0.034177,0.022843,.@valiswiser is on a mission to help people ov...,2
51262,0.079366,0.032468,0.056638,-0.402173,0.097175,0.250982,0.302567,-0.029213,0.144977,-0.032277,...,0.011305,0.143926,0.131420,0.249654,-0.030264,-0.007378,0.075654,-0.083187,RT @TechCrunch: Instagram fights abuse with co...,2


In [17]:
from sklearn.model_selection import train_test_split
x=tweetft_df.drop(['tweet','label'],axis=1)
y=tweetft_df['label']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

In [18]:
from sklearn.naive_bayes import BernoulliNB
naive_bayes = BernoulliNB()
model = naive_bayes.fit(x_train, y_train)
y_predictions = model.predict(x_test)
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_predictions)

0.681023720349563

In [19]:
from sklearn.linear_model import LogisticRegression
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25,random_state = 42) 
log_classifier = LogisticRegression(multi_class='multinomial',solver ='newton-cg')
log_classifier.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=None, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False)

In [20]:
y_pred = log_classifier.predict(X_test)

In [21]:
log_classifier.score(X_test, y_test) 

0.7936173533083646

In [22]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[3693, 1024,  122],
       [ 936, 3575,  189],
       [ 163,  211, 2903]], dtype=int64)

In [23]:
from sklearn.svm import LinearSVC
model = LinearSVC()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
from sklearn.metrics import confusion_matrix
conf_mat = confusion_matrix(y_test, y_pred)
conf_mat

array([[3678, 1019,  142],
       [ 944, 3533,  223],
       [ 162,  192, 2923]], dtype=int64)

In [24]:
model.score(X_test, y_test) 

0.7907303370786517

In [25]:
similar_words = {search_term: [item[0] for item in ft_model.wv.most_similar([search_term], topn=5)]
                  for search_term in ['gop', 'dem', 'vote', 'attack', 'administration', 'voters','win','trump']}
similar_words

{'gop': ['republicans', 'republican', 'senators', 'gut', 'toxic'],
 'dem': ['dems', 'meddling', 'democrat', 'socialists', 'democratic'],
 'vote': ['ballot', 'voting', 'nhprimary', 'polls', 'box'],
 'attack': ['attacks', 'attempt', 'tragically', 'horrific', 'waged'],
 'administration': ['trump', 'cruel', 'president', 'admin', 'cruelly'],
 'voters': ['nhprimary', 'republicans', 'electi', 'voter', 'primary'],
 'win': ['chance', 'compete', 'huge', 'victory', 'enter'],
 'trump': ['president', 'donald', 'administration', 'presidency', '—']}

In [26]:
import fasttext

In [109]:
tweets=[dem_df,gop_df,nonp_df]
tweets_df=pd.concat(tweets,ignore_index=True)

In [110]:
wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    # lower case and remove special characters\whitespaces
    doc=re.sub(r'-',' ',doc).strip()
    #doc=re.sub(r'#\S+','',doc).strip() #removing #symbol
    doc=re.sub(r'RT[\s]+','',doc).strip()
    doc = re.sub(r'http[a-zA-Z]*\S+', '', doc).strip()
    #doc=re.sub(r'@[A-Za-z0-9]+','',doc).strip() #remove mentions
    doc=re.sub(r'[?|$|.|!|;|:|&|"|,|""|*|-|(|)|#|_|@]','',doc).strip()
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = wpt.tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

normalize_corpus = np.vectorize(normalize_document)

In [111]:
tweets_df["tweet"]=tweets_df["tweet"].apply(normalize_document)

In [112]:
def add_prefix(colum):
    return "__label__"+str(colum)

In [113]:
def print_results(N, p, r):
    print("N\t" + str(N))
    print("P@{}\t{:.3f}".format(1, p)) #Precision
    print("R@{}\t{:.3f}".format(1, r)) #Recall

In [114]:
x=tweets_df["tweet"]
y=tweets_df["label"]

In [115]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [116]:
train_df=pd.DataFrame()
train_df["value"]=x_train
train_df["label"]=y_train
train_df

Unnamed: 0,value,label
14706,repcolinallred affordable care act helped mill...,0
10473,pattymurray ’ hearing — idea american presiden...,0
16801,donald trump ' muslim ban direct betrayal amer...,0
5871,last night — hours kavanaugh ’ hearings began ...,0
49117,connect even friends 6 people video chat insta...,2
...,...,...
28575,speaking poland first ever ministerial promote...,1
22032,president trump putting americafirst comes tra...,1
41444,best action caught 📸 »,2
8559,bradbainum 3rd time senmcsallyaz votes allow 3...,0


In [117]:
test_df=pd.DataFrame()
test_df["value"]=x_test
test_df["label"]=y_test
test_df

Unnamed: 0,value,label
6883,thedemocrats 🚨 problems polls today call voter...,0
41903,listen mavens marvel ' mighty menace,2
21878,gopchairwoman bipartisan opposition pelosi sch...,1
30928,hamas rocket attacks threatening lives innocen...,1
33034,get free fake news filter ⁦ scottadamssays ⁩ s...,1
...,...,...
28551,realdonaldtrump hope enjoying president ’ day ...,1
50273,“ afghanistan immensely fascinating place repo...,2
25602,dem harassment kellyannepolls ridiculous multi...,1
17149,folks coming together across country host watc...,0


In [118]:
train_df["value"]=train_df["label"].apply(add_prefix)+" "+train_df["value"]
test_df["value"]=test_df["label"].apply(add_prefix)+" "+test_df["value"]

In [119]:
train_df["value"]

14706    __label__0 repcolinallred affordable care act ...
10473    __label__0 pattymurray ’ hearing — idea americ...
16801    __label__0 donald trump ' muslim ban direct be...
5871     __label__0 last night — hours kavanaugh ’ hear...
49117    __label__2 connect even friends 6 people video...
                               ...                        
28575    __label__1 speaking poland first ever minister...
22032    __label__1 president trump putting americafirs...
41444                    __label__2 best action caught 📸 »
8559     __label__0 bradbainum 3rd time senmcsallyaz vo...
48402    __label__2 “ hair crown wear pride ” thisweeko...
Name: value, Length: 41011, dtype: object

In [134]:
train_df['value'].to_csv("tweets_train_corpus.csv",index=False)
test_df['value'].to_csv("tweets_test_corpus.csv",index=False)

In [135]:
model = fasttext.train_supervised('tweets_train_corpus.csv')

In [136]:
#model = fasttext.train_supervised('tweets_train_corpus.csv',lr=0.7,epoch=50, wordNgrams=2,bucket=200000)

In [137]:
model.test('tweets_test_corpus.csv')

(10253, 0.929191456159173, 0.929191456159173)

In [138]:
print_results(*model.test('tweets_test_corpus.csv'))

N	10253
P@1	0.929
R@1	0.929


In [139]:
score=model.test('tweets_test_corpus.csv')

In [140]:
precision=score[1]
recall=score[2]

In [132]:
F1 = 2 * (precision * recall) / (precision + recall)

In [133]:
F1

0.9359211937969375

In [538]:
# Skipgram model :
#model = fasttext.train_unsupervised('tweets_corpus.csv', model='skipgram')

# or, cbow model :
#model = fasttext.train_unsupervised('tweets_corpus.csv', model='cbow')

In [539]:
#model.get_word_vector("the")

array([ 0.4146808 , -0.3514698 ,  0.99756813, -0.34905446, -0.5383298 ,
        0.26337886, -1.3346989 ,  0.2696739 ,  0.01227002, -0.11182483,
       -0.56319416,  0.5641128 ,  0.44509768, -0.6356219 ,  0.38990122,
       -0.09339877,  0.1606257 ,  0.72620386,  1.0777328 ,  0.49337825,
       -0.27959353, -0.1525427 ,  0.97561735,  0.55205953, -0.02131497,
       -0.21529667,  0.13206959,  0.16016956,  0.06136292,  0.13445513,
       -0.4657135 ,  0.6944296 ,  0.12012708, -0.78917384, -0.39463037,
        0.11260908,  0.7887055 ,  0.28799686,  0.08513959, -0.09384476,
       -0.08972653,  0.71311516,  0.16640273, -0.61693054, -0.4716776 ,
        0.3921702 , -0.48775336,  0.4053595 ,  0.18597561, -0.14477292,
        0.00245656, -0.2616625 ,  0.3431946 , -0.05408857,  0.08822475,
       -0.0115694 ,  0.2421625 ,  0.04905983,  0.44104955, -0.55255514,
        0.36913985,  0.61588585,  0.70548475,  0.4143827 ,  0.879292  ,
       -0.5183598 ,  0.5834403 , -0.11093776,  1.1930081 , -0.15

In [540]:
#model = fasttext.train_unsupervised('tweets_corpus.csv', minn=2, maxn=5, dim=300)

In [541]:
#model.get_nearest_neighbors('asparagus')

[(0.7268996238708496, 'nicaragua'),
 (0.7136082649230957, 'aoc'),
 (0.7117641568183899, 'harass'),
 (0.704505443572998, 'disarray'),
 (0.7032791972160339, 'horrendous'),
 (0.7001626491546631, 'eidmubarak'),
 (0.6917797327041626, 'agendas'),
 (0.690413773059845, 'ツ'),
 (0.6860843300819397, '⇒'),
 (0.6822501420974731, '…”')]