In [None]:
import pandas as pd
import random
import numpy as np
import re
import pickle
import nltk
from nltk.classify.scikitlearn import SklearnClassifier

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn import cross_validation
from scipy.sparse import csr_matrix

# Load processed single hashtag tweets

In [1]:
# Load the pre-processed single-hashtag tweets.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# pickles that were produced by a trusted run of the preprocessing step.
# The `with` block closes the file even if unpickling raises.
with open('df_processed_single_hashtag.pickle', 'rb') as f:
    df = pickle.load(f)

# Develop two training/testing sets for text classification

### Name search functions

In [None]:
def word_in_text(word, text):
    """Tell whether `word` is one of the whitespace-separated tokens of `text`.

    Both arguments are lower-cased before comparing, so the match is
    case-insensitive. Only whole tokens match: 'he' is not found in 'the',
    and punctuation attached to a token is part of that token.

    Returns True or False.
    """
    needle = word.lower()
    tokens = text.lower().split()
    return needle in tokens

In [None]:
def list_words(words, text):
    """Return 1 if any entry of `words` is a token of `text`, otherwise 0.

    Each candidate is checked with word_in_text, so matching is
    case-insensitive and on whole whitespace-separated tokens. Checking
    stops at the first candidate that matches.
    """
    return int(any(word_in_text(candidate, text) for candidate in words))

In [None]:
# Tokens used to decide which candidate a tweet is talking about.
# 'hilari' is how all versions of 'hillary' appear after text processing.
Hillary_names = [
    'clinton',
    'hilari',
    'she',
    'her',
]
Trump_names = [
    'trump',
    'donald',
    'he',
    'his',
]

### Utilize the main campaign-driven hashtags

##### #ImWithHer tweets separated into pro-Clinton and anti-Trump

In [None]:
# Processed text of every tweet whose hashtag is 'imwithher', kept under a
# single 'text' column (the original row index is preserved).
For_Hillary_df = pd.DataFrame(
    {'text': df.loc[df['hashtags'] == 'imwithher', 'processed_text']}
)

In [None]:
# Flag each #ImWithHer tweet with 1/0 indicators:
#   pos_hillary - the text contains one of Hillary_names
#   neg_trump   - the text contains one of Trump_names
For_Hillary_df['pos_hillary'] = For_Hillary_df['text'].apply(
    lambda text: list_words(words=Hillary_names, text=text)
)
For_Hillary_df['neg_trump'] = For_Hillary_df['text'].apply(
    lambda text: list_words(words=Trump_names, text=text)
)

#### Isolate single subject tweets

In [None]:
# Keep only single-subject tweets from the #ImWithHer set.
# Clinton's names present, Trump's names absent:
Pos_Hillary_text = For_Hillary_df.loc[
    (For_Hillary_df['pos_hillary'] == 1) & (For_Hillary_df['neg_trump'] == 0),
    'text'
]
# Trump's names present, Clinton's names absent:
Neg_Trump_text = For_Hillary_df.loc[
    (For_Hillary_df['pos_hillary'] == 0) & (For_Hillary_df['neg_trump'] == 1),
    'text'
]

##### #MakeAmericaGreatAgain tweets separated into pro-Trump and anti-Clinton

In [None]:
# Processed text of every tweet whose hashtag is 'makeamericagreatagain' or
# 'maga', kept under a single 'text' column (original row index preserved).
For_Donald_df = pd.DataFrame(
    {
        'text': df.loc[
            (df['hashtags'] == 'makeamericagreatagain') | (df['hashtags'] == 'maga'),
            'processed_text'
        ]
    }
)

In [None]:
# Flag each #MakeAmericaGreatAgain / #MAGA tweet with 1/0 indicators:
#   pos_trump   - the text contains one of Trump_names
#   neg_hillary - the text contains one of Hillary_names
For_Donald_df['pos_trump'] = For_Donald_df['text'].apply(
    lambda text: list_words(words=Trump_names, text=text)
)
For_Donald_df['neg_hillary'] = For_Donald_df['text'].apply(
    lambda text: list_words(words=Hillary_names, text=text)
)

#### Isolate single subject tweets

In [None]:
# Keep only single-subject tweets from the #MakeAmericaGreatAgain / #MAGA set.
# Trump's names present, Clinton's names absent:
Pos_Trump_text = For_Donald_df.loc[
    (For_Donald_df['pos_trump'] == 1) & (For_Donald_df['neg_hillary'] == 0),
    'text'
]
# Clinton's names present, Trump's names absent
# (the original author noted there are 1112 of them):
Neg_Hillary_text = For_Donald_df.loc[
    (For_Donald_df['pos_trump'] == 0) & (For_Donald_df['neg_hillary'] == 1),
    'text'
]

### Make training and testing sets

#### Clinton

In [None]:
# Fixed seed so the shuffles, and therefore the train/test split, are
# reproducible across kernel restarts.
random.seed(42)

# Positive Clinton examples come from #ImWithHer tweets, negative ones from
# #MakeAmericaGreatAgain / #MAGA tweets; both are lower-cased and shuffled.
Rand_Pos_Hillary = [X.lower() for X in Pos_Hillary_text.tolist()]
random.shuffle(Rand_Pos_Hillary)

Rand_Neg_Hillary = [X.lower() for X in Neg_Hillary_text.tolist()]
random.shuffle(Rand_Neg_Hillary)

# NOTE(review): Hillary_length is read by the split cells below but was never
# assigned anywhere in this notebook, so Restart & Run All stopped with a
# NameError. Defining it as the size of the smaller class and trimming both
# lists to that size is an assumption: it gives balanced classes and makes the
# "80% training, 20% testing" split exact. Confirm against the original intent.
Hillary_length = min(len(Rand_Pos_Hillary), len(Rand_Neg_Hillary))
Rand_Pos_Hillary = Rand_Pos_Hillary[:Hillary_length]
Rand_Neg_Hillary = Rand_Neg_Hillary[:Hillary_length]

In [None]:
# 80% training, 20% testing.
# From each shuffled list, the first int(.8*Hillary_length) tweets go to the
# training set and the rest go to the test set; positives are placed before
# negatives in both.
Clinton_X_train = list(Rand_Pos_Hillary[:int(0.8 * Hillary_length)])
Clinton_X_train.extend(Rand_Neg_Hillary[:int(0.8 * Hillary_length)])

Clinton_X_test = list(Rand_Pos_Hillary[int(0.8 * Hillary_length):])
Clinton_X_test.extend(Rand_Neg_Hillary[int(0.8 * Hillary_length):])

In [None]:
# Target labels lined up with Clinton_X_train / Clinton_X_test:
# 1 for every tweet taken from Rand_Pos_Hillary, 0 for every tweet taken from
# Rand_Neg_Hillary, using the same slices as the feature lists.
Clinton_y_train = (
    [1] * len(Rand_Pos_Hillary[:int(0.8 * Hillary_length)])
    + [0] * len(Rand_Neg_Hillary[:int(0.8 * Hillary_length)])
)
Clinton_y_test = (
    [1] * len(Rand_Pos_Hillary[int(0.8 * Hillary_length):])
    + [0] * len(Rand_Neg_Hillary[int(0.8 * Hillary_length):])
)

#### Trump

In [None]:
# Fixed seed so the shuffles, and therefore the train/test split, are
# reproducible across kernel restarts.
random.seed(42)

# Positive Trump examples come from #MakeAmericaGreatAgain / #MAGA tweets,
# negative ones from #ImWithHer tweets; both are lower-cased and shuffled.
Rand_Pos_Trump = [X.lower() for X in Pos_Trump_text.tolist()]
random.shuffle(Rand_Pos_Trump)

Rand_Neg_Trump = [X.lower() for X in Neg_Trump_text.tolist()]
random.shuffle(Rand_Neg_Trump)

# NOTE(review): Trump_length is read by the split cells below but was never
# assigned anywhere in this notebook, so Restart & Run All stopped with a
# NameError. Defining it as the size of the smaller class and trimming both
# lists to that size is an assumption: it gives balanced classes and makes the
# "80% training, 20% testing" split exact. Confirm against the original intent.
Trump_length = min(len(Rand_Pos_Trump), len(Rand_Neg_Trump))
Rand_Pos_Trump = Rand_Pos_Trump[:Trump_length]
Rand_Neg_Trump = Rand_Neg_Trump[:Trump_length]

In [None]:
# 80% training, 20% testing.
# From each shuffled list, the first int(.8*Trump_length) tweets go to the
# training set and the rest go to the test set; positives are placed before
# negatives in both.
Trump_X_train = list(Rand_Pos_Trump[:int(0.8 * Trump_length)])
Trump_X_train.extend(Rand_Neg_Trump[:int(0.8 * Trump_length)])

Trump_X_test = list(Rand_Pos_Trump[int(0.8 * Trump_length):])
Trump_X_test.extend(Rand_Neg_Trump[int(0.8 * Trump_length):])

In [None]:
# Target labels lined up with Trump_X_train / Trump_X_test:
# 1 for every tweet taken from Rand_Pos_Trump, 0 for every tweet taken from
# Rand_Neg_Trump, using the same slices as the feature lists.
Trump_y_train = (
    [1] * len(Rand_Pos_Trump[:int(0.8 * Trump_length)])
    + [0] * len(Rand_Neg_Trump[:int(0.8 * Trump_length)])
)
Trump_y_test = (
    [1] * len(Rand_Pos_Trump[int(0.8 * Trump_length):])
    + [0] * len(Rand_Neg_Trump[int(0.8 * Trump_length):])
)

# Training two text classifiers

# For single-hashtag, single-subject tweets

#### Clinton

In [None]:
# Tweets are short, so record whether a feature appears (binary=True) rather
# than how many times it appears. Features are unigrams and bigrams.
C_vectorizer = CountVectorizer(
    binary=True,
    ngram_range=(1, 2)
)

In [None]:
# Fit the vectorizer's vocabulary on the training tweets only, then encode the
# test tweets with that same fitted vectorizer.
Clinton_training_features = C_vectorizer.fit_transform(Clinton_X_train)
Clinton_testing_features = C_vectorizer.transform(Clinton_X_test)

#### Trump

In [None]:
# Tweets are short, so record whether a feature appears (binary=True) rather
# than how many times it appears. Features are unigrams and bigrams.
T_vectorizer = CountVectorizer(
    binary=True,
    ngram_range=(1, 2)
)

In [None]:
# Fit the vectorizer's vocabulary on the training tweets only, then encode the
# test tweets with that same fitted vectorizer.
Trump_training_features = T_vectorizer.fit_transform(Trump_X_train)
Trump_testing_features = T_vectorizer.transform(Trump_X_test)

### Cross validated grid search

In [None]:
# Grid for the logistic regression search: 400 values of C spaced evenly on a
# log scale from 0.01 to 50000.
logistic_param_grid = {
    'C': np.logspace(np.log10(0.01), np.log10(50000), num=400),
}

#### Clinton

In [None]:
# 15-fold cross-validated search over logistic_param_grid for the Clinton
# classifier.
Clinton_logistic_grid_search = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=logistic_param_grid,
    cv=15
)

In [None]:
# Run the cross-validated grid search on the Clinton training data.
# With 400 candidate values of C and cv=15 this cell can take a while.
Clinton_logistic_grid_search.fit(Clinton_training_features, Clinton_y_train)

In [None]:
# Keep the best estimator found by the grid search and show its settings.
# print(...) with a single argument behaves the same on Python 2 and 3,
# whereas the bare `print x` statement is a syntax error on Python 3.
Clinton_logistic_classifier = Clinton_logistic_grid_search.best_estimator_
print(Clinton_logistic_classifier)

In [None]:
# Predict labels for the held-out Clinton tweets and report overall accuracy.
Clinton_logistic_classifier_predict_class = Clinton_logistic_classifier.predict(Clinton_testing_features)
print('Logistic Regression:')
# accuracy_score takes (y_true, y_pred); the value is the same either way,
# but the documented order avoids confusion with asymmetric metrics.
print(accuracy_score(Clinton_y_test, Clinton_logistic_classifier_predict_class))

#### Trump

In [None]:
# 15-fold cross-validated search over logistic_param_grid for the Trump
# classifier.
Trump_logistic_grid_search = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=logistic_param_grid,
    cv=15
)

In [None]:
# Run the cross-validated grid search on the Trump training data.
# With 400 candidate values of C and cv=15 this cell can take a while.
Trump_logistic_grid_search.fit(Trump_training_features, Trump_y_train)

In [None]:
# Keep the best estimator found by the grid search and show its settings.
# print(...) with a single argument behaves the same on Python 2 and 3,
# whereas the bare `print x` statement is a syntax error on Python 3.
Trump_logistic_classifier = Trump_logistic_grid_search.best_estimator_
print(Trump_logistic_classifier)

In [None]:
# Predict labels for the held-out Trump tweets and report overall accuracy.
Trump_logistic_classifier_predict_class = Trump_logistic_classifier.predict(Trump_testing_features)
print('Logistic Regression:')
# accuracy_score takes (y_true, y_pred); the value is the same either way,
# but the documented order avoids confusion with asymmetric metrics.
print(accuracy_score(Trump_y_test, Trump_logistic_classifier_predict_class))

### Measure accuracy on pos/neg components of each classifier

In [None]:
def pos_neg_acc(predict_class,y_test):
    """Return the accuracy on the positive and on the negative examples.

    Parameters
    ----------
    predict_class : sequence
        Predicted labels, aligned position-by-position with `y_test`.
    y_test : sequence
        True labels. A label equal to 1 is treated as positive; any other
        label is treated as negative.

    Returns
    -------
    tuple of (float, float)
        (fraction of positive examples predicted correctly,
         fraction of negative examples predicted correctly).
        A class with no examples in `y_test` yields float('nan') instead of
        dividing by zero.

    Raises
    ------
    ValueError
        If the two sequences differ in length.
    """
    if len(predict_class) != len(y_test):
        raise ValueError('predict_class and y_test must have the same length')

    pos_total = 0
    pos_correct_count = 0
    neg_total = 0
    neg_correct_count = 0
    # zip pairs items by position, so this works for lists, arrays and Series
    # regardless of index labels, and runs on both Python 2 and Python 3.
    for predicted, actual in zip(predict_class, y_test):
        is_correct = int(predicted == actual)
        if actual == 1:
            pos_total += 1
            pos_correct_count += is_correct
        else:
            neg_total += 1
            neg_correct_count += is_correct

    nan = float('nan')
    pos_accuracy = float(pos_correct_count) / pos_total if pos_total else nan
    neg_accuracy = float(neg_correct_count) / neg_total if neg_total else nan
    return pos_accuracy, neg_accuracy

### Clinton

In [None]:
# (positive-class accuracy, negative-class accuracy) for the Clinton classifier.
# print(...) with one argument works on both Python 2 and Python 3.
print(pos_neg_acc(Clinton_logistic_classifier_predict_class,Clinton_y_test))

### Trump

In [None]:
# (positive-class accuracy, negative-class accuracy) for the Trump classifier.
# print(...) with one argument works on both Python 2 and Python 3.
print(pos_neg_acc(Trump_logistic_classifier_predict_class,Trump_y_test))

# Export classifiers

In [None]:
# Persist the fitted Clinton classifier; the `with` block closes the file
# even if pickling raises.
with open('Clinton_logistic_classifier.pickle', 'wb') as f:
    pickle.dump(Clinton_logistic_classifier, f)

In [None]:
# Persist the fitted Clinton vectorizer alongside its classifier; the `with`
# block closes the file even if pickling raises.
with open('Clinton_vectorizer.pickle', 'wb') as f:
    pickle.dump(C_vectorizer, f)

In [None]:
# Persist the fitted Trump classifier; the `with` block closes the file
# even if pickling raises.
with open('Trump_logistic_classifier.pickle', 'wb') as f:
    pickle.dump(Trump_logistic_classifier, f)

In [None]:
# Persist the fitted Trump vectorizer alongside its classifier; the `with`
# block closes the file even if pickling raises.
with open('Trump_vectorizer.pickle', 'wb') as f:
    pickle.dump(T_vectorizer, f)