In [None]:
import pandas as pd
import pickle
import re
import numpy as np
import nltk
from nltk.stem import PorterStemmer
# Single Porter stemmer instance, used by stemming() below.
stemmer = PorterStemmer()

In [None]:
import psycopg2
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sqlalchemy import create_engine

# Load trained classifiers/vectorizers

In [None]:
# Load the pickled Clinton logistic classifier.
# pickle can execute arbitrary code while loading: only load files you produced or trust.
# The context manager closes the file even if unpickling fails.
with open('Clinton_logistic_classifier.pickle', 'rb') as f:
    C_logistic_classifier = pickle.load(f)

In [None]:
# Load the pickled Clinton vectorizer; the context manager closes the file
# even if unpickling fails.
with open('Clinton_vectorizer.pickle', 'rb') as f:
    C_vectorizer = pickle.load(f)

In [None]:
# Load the pickled Trump logistic classifier; the context manager closes the
# file even if unpickling fails.
with open('Trump_logistic_classifier.pickle', 'rb') as f:
    T_logistic_classifier = pickle.load(f)

In [None]:
# Load the pickled Trump vectorizer; the context manager closes the file even
# if unpickling fails.
with open('Trump_vectorizer.pickle', 'rb') as f:
    T_vectorizer = pickle.load(f)

# Load hashtag classifiers

In [None]:
# Load the pickled Clinton hashtag lookup (indexed by hashtag in classify());
# the context manager closes the file even if unpickling fails.
with open('Clinton_hashtag_prob.pickle', 'rb') as f:
    Clinton_prob_dict = pickle.load(f)

In [None]:
# Load the pickled Trump hashtag lookup (indexed by hashtag in classify());
# the context manager closes the file even if unpickling fails.
with open('Trump_hashtag_prob.pickle', 'rb') as f:
    Trump_prob_dict = pickle.load(f)

In [None]:
# Keys of the Clinton hashtag lookup, i.e. the hashtags it has an entry for.
# NOTE(review): hashtag_list is not referenced by any other cell visible here.
hashtag_list = Clinton_prob_dict.keys()

# Pull all raw tweet data from database

In [None]:
# Database name and user for the local PostgreSQL connection used below.
# NOTE(review): the '***' values are presumably redacted placeholders that must
# be replaced before running — confirm.
dbname = '***'
username = '***'

In [None]:
# SQLAlchemy engine for the local database. The dialect must be spelled
# 'postgresql': the old 'postgres' alias is rejected by SQLAlchemy 1.4+.
# NOTE(review): `engine` is not referenced by any other cell visible here.
engine = create_engine('postgresql://%s@localhost/%s' % (username, dbname))

In [None]:
# Read every row of raw_tweet_table into a DataFrame.
sql_query = """
SELECT * FROM raw_tweet_table;
"""
con = psycopg2.connect(database = dbname, user = username)
try:
    df = pd.read_sql_query(sql_query,con)
finally:
    # Release the database connection even if the query fails.
    con.close()
#df is a dataframe with columns 'created_at','text' and 'hashtags'

# Classifier

### Text pre-processing functions

In [None]:
def negation_process(tweet):
    """Tag the words that follow a negation word with a ``neg_`` prefix.

    Starting at a negation word (e.g. "not", "never", "dont") and running up
    to the next punctuation character, every word that is preceded by
    whitespace gets ``neg_`` prepended. The negation word itself is left
    unchanged. Matching is case-insensitive.

    A period is appended before matching so that a negation running to the
    end of the tweet is still closed by punctuation; that period is removed
    again before returning.
    """
    def _prefix_words(match):
        # Only whitespace-preceded words are rewritten, which is why the
        # negation word at the start of the match keeps its original form.
        return re.sub(r'(\s+)(\w+)', r'\1neg_\2', match.group(0))

    negation_scope = r'\b(?:never|no|nothing|nowhere|noone|none|not|havent|hasnt|hadnt|cant|couldnt|shouldnt|wont|wouldnt|dont|doesnt|didnt|isnt|arent|aint)\b[\w\s]+[^\w\s]'
    terminated = tweet + '.'
    marked = re.sub(negation_scope, _prefix_words, terminated, flags=re.IGNORECASE)
    # Drop the period that was added above.
    return marked[:-1]

In [None]:
def stemming(tweet):
    """Lower-case and stem each whitespace-separated word of ``tweet``.

    Each word is passed through the module-level ``stemmer``; the results are
    returned joined by single spaces.
    """
    return ' '.join(stemmer.stem(token.lower()) for token in tweet.split())

In [None]:
def process_text(tweet_list):
    """Normalise a list of raw tweets; returns a list of processed strings.

    ``tweet_list`` must be an iterable of tweets (a list), not a single
    string: a bare string would be iterated character by character.

    Steps, in order, for each tweet:
      1. drop @mentions, http(s) URLs and #hashtags;
      2. drop apostrophes, then apply negation_process();
      3. replace runs of non-ASCII characters with a space;
      4. remove 'RT', ':', ',', '.', '"' and turn '+' into a space;
      5. collapse runs of a repeated non-space character to one character;
      6. put a space before '!' and '?' (useful for bigrams);
      7. strip surrounding whitespace and apply stemming().

    Note that 'RT' is removed wherever the substring occurs, including inside
    longer upper-case words. NOTE(review): this is kept unchanged because the
    pickled vectorizers were presumably fitted on text processed this way —
    confirm before altering any step.
    """
    # (old, new) pairs applied in this exact order after negation handling.
    removals = (('RT', ''), (':', ''), ('+', ' '), (',', ''), ('.', ''), ('\"', ''))
    # Emotive punctuation gets a leading space after de-duplication.
    spaced_marks = (('!', ' !'), ('?', ' ?'))

    def _clean(raw):
        text = re.sub(r"(?:\@|https?\://|#)\S+", "", raw)
        text = negation_process(text.replace('\'', ''))
        text = re.sub(r'[^\x00-\x7F]+', ' ', text)
        for old, new in removals:
            text = text.replace(old, new)
        # Standardise elongated spellings by collapsing repeated characters.
        text = re.sub(r'(\S)\1+', r'\1', text)
        for old, new in spaced_marks:
            text = text.replace(old, new)
        return stemming(text.strip())

    return [_clean(tweet) for tweet in tweet_list]

### Classifier functions

In [None]:
def classify(data):
    """Label one tweet record as 'C', 'T' or 'skip'.

    Parameters
    ----------
    data : mapping-like (e.g. a DataFrame row)
        Must support ``data['hashtags']`` (whitespace-separated hashtags) and
        ``data['text']``.

    Returns
    -------
    str
        'C' or 'T' when the hashtag evidence favours one side; otherwise the
        result of classify_text() on the tweet text, or 'skip' if the text
        classifier raises.

    The hashtag score of each side is the product of the per-tag values in
    Clinton_prob_dict / Trump_prob_dict. A tag is only used when it is present
    in both dictionaries, so the two products always cover the same tags.
    """
    # A missing or non-string hashtags value just means "no hashtag evidence".
    try:
        tags = data['hashtags'].split()
    except (KeyError, AttributeError, TypeError):
        tags = []

    C_hashtag_prob = 1
    T_hashtag_prob = 1
    for tag in tags:
        if tag in Clinton_prob_dict and tag in Trump_prob_dict:
            C_hashtag_prob = C_hashtag_prob * Clinton_prob_dict[tag]
            T_hashtag_prob = T_hashtag_prob * Trump_prob_dict[tag]

    if C_hashtag_prob > T_hashtag_prob:
        return 'C'
    if C_hashtag_prob < T_hashtag_prob:
        return 'T'

    # Tie (including "no known hashtags"): fall back to the text classifier.
    # Catch Exception rather than using a bare except so that Ctrl-C and
    # SystemExit still interrupt a long classification run.
    try:
        return classify_text(data['text'])
    except Exception:
        return 'skip'

In [None]:
# Words that, when found in processed tweet text, mark the tweet (or phrase)
# as mentioning the respective candidate.
Hillary_names = ['clinton','hilari','she','her']#'hilari' is how all versions of 'hillary' appear after text processing
# NOTE(review): Porter stemming may reduce 'his' to 'hi', in which case the
# 'his' entry would never match processed text — confirm against the stemmer.
Trump_names = ['trump','donald','he','his']

In [None]:
def multi_subject_split(original):
    """Classify a tweet that mentions both candidates, clause by clause.

    The raw tweet is split on the punctuation marks ``? . ! , : ;``. Each
    resulting phrase is run through process_text() and then routed:

    * mentions only a Clinton name -> Clinton classifier input;
    * mentions only a Trump name   -> Trump classifier input;
    * mentions neither             -> both inputs;
    * mentions both                -> dropped.

    The Clinton probabilities are added to the complement of the Trump
    probabilities, and 'C' is returned when column 1 of that sum is at least
    column 0; otherwise 'T'.

    NOTE(review): the split happens before process_text() removes URLs, so a
    URL is cut at its ':' and '.' characters and its fragments are not
    removed — consider stripping URLs first.
    """
    clinton_parts = []
    trump_parts = []
    for raw_phrase in re.split('[?.!,:;]', original):
        # process_text() takes a list of tweets and returns a list.
        phrase = process_text([raw_phrase])[0]
        if not phrase:
            continue
        words = phrase.split()
        mentions_clinton = any(word in Hillary_names for word in words)
        mentions_trump = any(word in Trump_names for word in words)
        if mentions_clinton and mentions_trump:
            continue
        if not mentions_trump:
            clinton_parts.append(phrase)
        if not mentions_clinton:
            trump_parts.append(phrase)

    # Join with a space so the last word of one phrase is not glued to the
    # first word of the next.
    Clinton_phrases = ' '.join(clinton_parts)
    Trump_phrases = ' '.join(trump_parts)

    Clinton_prob = C_logistic_classifier.predict_proba(C_vectorizer.transform([Clinton_phrases]))
    Trump_prob = T_logistic_classifier.predict_proba(T_vectorizer.transform([Trump_phrases]))
    total_prob = Clinton_prob + (np.ones(2)-Trump_prob)
    if total_prob[0][1] >= total_prob[0][0]:
        return 'C'
    return 'T'

In [None]:
def classify_text(text):
    """Classify a single tweet's text as 'C', 'T' or 'skip'.

    The text is normalised with process_text() and checked for words from
    Hillary_names and Trump_names:

    * both present -> delegated to multi_subject_split() on the raw text;
    * only Clinton -> Clinton classifier; 'C' when column 1 of its
      predict_proba output is at least column 0, else 'T';
    * only Trump   -> Trump classifier; 'C' when column 0 of its
      predict_proba output is at least column 1, else 'T';
    * neither      -> 'skip'.
    """
    # process_text() takes a list of tweets and returns a list, so wrap the
    # single tweet and unwrap the single result.
    processed = process_text([text])[0]
    words = processed.split()
    mentions_clinton = any(word in Hillary_names for word in words)
    mentions_trump = any(word in Trump_names for word in words)

    if mentions_clinton and mentions_trump:
        return multi_subject_split(text)
    if mentions_clinton:
        prob = C_logistic_classifier.predict_proba(C_vectorizer.transform([processed]))
        return 'C' if prob[0][1] >= prob[0][0] else 'T'
    if mentions_trump:
        prob = T_logistic_classifier.predict_proba(T_vectorizer.transform([processed]))
        return 'C' if prob[0][0] >= prob[0][1] else 'T'
    return 'skip'

# Run classifier and make classified column in df

In [None]:
# Collects one label per row of df, in row order.
Classified = []

In [None]:
# Label every row of df. The list is rebuilt from scratch so re-running this
# cell cannot append a second set of labels, and iterrows() replaces the
# Python-2-only xrange index loop.
Classified = [classify(row) for _, row in df.iterrows()]

In [None]:
# Attach the labels as a new column; Classified must hold exactly one entry per row of df.
df['classified'] = Classified

In [None]:
# Drop the rows labelled 'skip'. This rebinds df, so the unfiltered frame is
# no longer available under this name afterwards.
df = df[df['classified'] != 'skip']

# Export classified tweets: timestamp, hashtag list and classification

In [None]:
# Save the timestamp, hashtags and classification columns of df.
# The context manager flushes and closes the file even if pickling fails.
with open('Classified_df.pickle', 'wb') as f:
    pickle.dump(df[['created_at','hashtags','classified']], f)