In [None]:
import pandas as pd
import pickle
import re
import numpy as np
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression,SGDClassifier
from scipy.sparse import csr_matrix

# Load processed single-hashtag tweets

In [None]:
# Load the processed tweets DataFrame from its pickle.
# The context manager closes the file even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code -- only load files produced by this project.
with open('df_processed_single_hashtag.pickle', 'rb') as f:
    df = pickle.load(f)

# Load trained classifiers and vectorizers

In [None]:
# Load the pickled Clinton classifier.
# The context manager closes the file even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code -- only load files produced by this project.
with open('Clinton_logistic_classifier.pickle', 'rb') as f:
    C_logistic_classifier = pickle.load(f)

In [None]:
# Load the pickled vectorizer that is paired with the Clinton classifier.
# The context manager closes the file even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code -- only load files produced by this project.
with open('Clinton_vectorizer.pickle', 'rb') as f:
    C_vectorizer = pickle.load(f)

In [None]:
# Load the pickled Trump classifier.
# The context manager closes the file even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code -- only load files produced by this project.
with open('Trump_logistic_classifier.pickle', 'rb') as f:
    T_logistic_classifier = pickle.load(f)

In [None]:
# Load the pickled vectorizer that is paired with the Trump classifier.
# The context manager closes the file even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code -- only load files produced by this project.
with open('Trump_vectorizer.pickle', 'rb') as f:
    T_vectorizer = pickle.load(f)

# Get list of hashtags and counts from single hashtag tweets

In [None]:
# Number of tweets carrying each distinct value of the 'hashtags' column
hashtag_multiplicites = df.loc[:, 'hashtags'].value_counts()
# Displayed as the cell output: how many distinct hashtags there are
hashtag_multiplicites.shape

In [None]:
# Keep only the hashtags that occur on at least 50 tweets (the comparison is >=, so 50 itself qualifies)
hashtag_multiplicites = hashtag_multiplicites.loc[hashtag_multiplicites >= 50]
# Plain Python list of the surviving hashtags, in the order of the filtered counts
Usable_hashtag_list = hashtag_multiplicites.index.tolist()

# Isolate the single subject tweets

In [None]:
def word_in_text(word, text):
    """Return True if `word` is one of the whitespace-separated tokens of `text`.

    The comparison is case-insensitive: both arguments are lowercased first.
    Only whole tokens match, so 'he' is not found in 'the'.

    Parameters:
        word: the word to look for (a string).
        text: the text to search (a string).

    Returns:
        bool: True when a token of `text` equals `word`, False otherwise.
    """
    return word.lower() in text.lower().split()

In [None]:
#returns 1 if at least one of the words in the list is in the text and 0 otherwise
def list_words(words,text):
    """Flag whether any of `words` occurs as a whole token of `text`.

    Uses the same matching rule as `word_in_text`: case-insensitive comparison
    against the whitespace-separated tokens of `text`. The text is lowercased
    and split only once, instead of once per candidate word.

    Parameters:
        words: iterable of strings to look for.
        text: the text to search (a string).

    Returns:
        int: 1 if at least one word matches a token, 0 otherwise
        (including when `words` is empty).
    """
    # Tokenise once; a set gives constant-time membership checks
    tokens = set(text.lower().split())
    return int(any(item.lower() in tokens for item in words))

In [None]:
# Tokens that mark a tweet as being about Clinton.
# 'hillari' is how all versions of 'hillary' appear after text processing.
Hillary_names = [
    'clinton',
    'hillari',
    'she',
    'her',
]
# Tokens that mark a tweet as being about Trump.
Trump_names = [
    'trump',
    'donald',
    'he',
    'his',
]

In [None]:
# Flag each tweet with 1/0 for "mentions a Clinton token" and "mentions a Trump token",
# then keep only the tweets where exactly one of the two flags is set.
df['Clinton_names'] = df['processed_text'].apply(lambda text: list_words(Hillary_names, text))
df['Trump_names'] = df['processed_text'].apply(lambda text: list_words(Trump_names, text))
df['#_of_names'] = df['Clinton_names'].add(df['Trump_names'])
df = df.loc[df['#_of_names'] == 1]

In [None]:
# Split the remaining tweets by which candidate's flag is set,
# keeping only the processed text and the hashtag columns.
Clinton_focused_tweets = df.loc[df['Clinton_names'] == 1, ['processed_text', 'hashtags']]
Trump_focused_tweets = df.loc[df['Trump_names'] == 1, ['processed_text', 'hashtags']]

# Make dictionaries for Clinton_focused_tweets and Trump_focused_tweets
# Keys are hashtags; values are lists of the tweet texts for that hashtag

In [None]:
# Clinton-focused tweets: map every usable hashtag to the list of processed
# tweet texts carrying it (an empty list when no such tweet exists).
C = {}
for hashtag in Usable_hashtag_list:
    C[hashtag] = Clinton_focused_tweets.loc[
        Clinton_focused_tweets['hashtags'] == hashtag, 'processed_text'
    ].tolist()


In [None]:
# Trump-focused tweets: map every usable hashtag to the list of processed
# tweet texts carrying it (an empty list when no such tweet exists).
T = {}
for hashtag in Usable_hashtag_list:
    T[hashtag] = Trump_focused_tweets.loc[
        Trump_focused_tweets['hashtags'] == hashtag, 'processed_text'
    ].tolist()

# Classify hashtags probabilistically

In [None]:
# Two separate lookups keyed by hashtag: the probability that the hashtag
# is for Clinton, and the probability that it is for Trump.
Clinton_prob, Trump_prob = {}, {}

In [None]:
def _count_above_half(classifier, vectorizer, tweets, class_index):
    """Count the tweets whose predicted probability in column `class_index` exceeds 0.5.

    All tweets are vectorized and classified in a single batch rather than
    one `predict_proba` call per tweet.

    Parameters:
        classifier: fitted model exposing `predict_proba`.
        vectorizer: fitted vectorizer exposing `transform`.
        tweets: list of tweet texts.
        class_index: column of the `predict_proba` output to threshold.

    Returns:
        int: number of tweets with probability > 0.5 in that column.
    """
    if not tweets:
        # Nothing to classify; also avoids handing a zero-row matrix to the model
        return 0
    probabilities = classifier.predict_proba(vectorizer.transform(tweets))
    return int((probabilities[:, class_index] > .5).sum())


# For every usable hashtag, estimate the share of its single-subject tweets that
# count toward Clinton, and store it (and its complement) in the two dictionaries.
for hashtag in Usable_hashtag_list:
    try:
        total_tweets = len(C[hashtag]) + len(T[hashtag])
        if total_tweets == 0:
            # No single-subject tweets for this hashtag, so the ratio is undefined.
            # Report the hashtag and skip it (this used to surface as a
            # ZeroDivisionError swallowed by a bare except).
            print(hashtag)
            continue

        # Clinton-focused tweets count when column 1 of the Clinton classifier is > 0.5
        # NOTE(review): presumably column 1 is the pro-Clinton class -- confirm against training
        C_count_A = _count_above_half(C_logistic_classifier, C_vectorizer, C[hashtag], 1)

        # Trump-focused tweets count when column 0 of the Trump classifier is > 0.5
        # NOTE(review): presumably column 0 is the anti-Trump class -- confirm against training
        C_count_B = _count_above_half(T_logistic_classifier, T_vectorizer, T[hashtag], 0)

        C_prob = float(C_count_A + C_count_B) / total_tweets
        T_prob = 1 - C_prob
        Clinton_prob[hashtag] = C_prob
        Trump_prob[hashtag] = T_prob
    except Exception as err:
        # Stay best-effort per hashtag, but show why it was skipped and let
        # KeyboardInterrupt/SystemExit propagate (a bare except would trap them).
        print('%s: %r' % (hashtag, err))

# Export hashtag probability dictionaries

In [None]:
# Save the Clinton_prob dictionary; the context manager flushes and closes the
# file even if pickling raises.
with open('Clinton_hashtag_prob.pickle', 'wb') as f:
    pickle.dump(Clinton_prob, f)

In [None]:
# Save the Trump_prob dictionary; the context manager flushes and closes the
# file even if pickling raises.
with open('Trump_hashtag_prob.pickle', 'wb') as f:
    pickle.dump(Trump_prob, f)