In [186]:
import re
import bs4
import nltk
import numpy
import pandas
import string
import sklearn

from bs4 import BeautifulSoup

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import RegexpTokenizer

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

In [187]:
contactionList = {
  "ain't": "am not",
  "aren't": "are not",
  "can't": "cannot",
  "'cause": "because",
  "could've": "could have",
  "couldn't": "could not",
  "didn't": "did not",
  "doesn't": "does not",
  "don't": "do not",
  "hadn't": "had not",
  "hasn't": "has not",
  "haven't": "have not",
  "he'd": "he would",
  "he'll": "he will",
  "he's": "he is",
  "how'll": "how will",
  "how's": "how is",
  "I'd": "I would",
  "I'll": "I will",
  "I'm": "I am",
  "I've": "I have",
  "isn't": "is not",
  "it'll": "it will",
  "it's": "it is",
  "let's": "let us",
  "might've": "might have",
  "must've": "must have",
  "mustn't": "must not",
  "needn't": "need not",
  "never'll":"never will",
  "she'd": "she would",
  "she'll": "she will",
  "she's": "she is",
  "should've": "should have",
  "shouldn't": "should not",
  "that'd": "that would",
  "that's": "that is",
  "there's": "there is",
  "they'd": "they would",
  "they'll": "they will",
  "they're": "they are",
  "they've": "they have",
  "wasn't": "was not",
  "we'll": "we will",
  "we're": "we are",
  "we've": "we have",
  "weren't": "were not",
  "what'll": "what will",
  "what're": "what are",
  "what's": "what is",
  "what've": "what have",
  "when's": "when is",
  "where'd": "where did",
  "where's": "where is",
  "who'll": "who will",
  "who's": "who is",
  "will've": "will have",
  "won't": "will not",
  "would've": "would have",
  "wouldn't": "would not",
  "you'd": "you had",
  "you'll": "you will",
  "you're": "you are",
  "you've": "you have"
}

def expandContractions(text):
    c_re = re.compile('(%s)' % '|'.join(contactionList.keys()))
    def replace(match):
        return contactionList[match.group(0)]
    return c_re.sub(replace, text)

In [188]:
def setstopwords():
    stop_words = stopwords.words('english')
    stop_words.append('atuser')
    stop_words.append('retweet')
    stop_words.append('\'s')
    stop_words.append('URL')
    stop_words.append('obama')
    stop_words.append('romney')
    stop_words.remove('no')
    stop_words.remove('nor')
    stop_words.remove('not')
    return stop_words

In [189]:
def preprocessing(tweet):
    stop_words = setstopwords()
    stemmer = SnowballStemmer("english")
    try:
        tweet = re.sub('[0-9]+', ' ', tweet)
        tweet = BeautifulSoup(tweet, "lxml")
        tweet = tweet.get_text()
        tweet = re.sub('@[^\s]+','ATUSER',tweet)
        tweet = re.sub('RT','RETWEET', tweet)
        tweet = tweet.lower()
        tweet = expandContractions(tweet)
        tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))','URL',tweet)
        tweet = re.sub('[\s]+', ' ', tweet)
        tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
        tweet = re.sub(r'(.)\1+', r'\1\1', tweet) 
        tweet = re.sub(r'[^\x00-\x7F]+',' ', tweet)
        tweet = re.sub('['+string.punctuation+']', ' ', tweet)
        tweet = tweet.strip('\'"')
        tweet = " ".join(tweet.split())
        tweet = " ".join([word for word in tweet.split() if word not in stop_words])
        tweet = " ".join([ stemmer.stem(word) for word in tweet.split()])
        return tweet
    except:
        return []

In [190]:
def getTweets(dataset):
    tweets = []
    tweet_class = []
    for index, row in dataset.iterrows():
        sentiment_class = row['Class']
        if sentiment_class == 1 or sentiment_class == -1 or sentiment_class == 0:
            tokenized_tweet = preprocessing(row['Anootated tweet'])
            if len(tokenized_tweet) > 0:
                tweets.append(tokenized_tweet)
                tweet_class.append(sentiment_class)
    return [tweets,tweet_class]

In [191]:
def buildclassifier(tweets,classifier):
    kf = KFold(10)
    for training_data, testing_data in kf.split(tweets):
        testing_dataset = tweets[testing_data[0]:testing_data[len(testing_data)-1]]
        training_dataset = None
        if testing_data[0] == 0:
            training_dataset = tweets[training_data[0]:training_data[len(training_data)-1]]
        else:
            training_dataset1 = tweets[training_data[0]:training_data[testing_data[0]-1]]
            training_dataset2 = tweets[testing_data[len(testing_data)-1]+1:]
            training_dataset = training_dataset1.append(training_dataset2)
        [train_tweets,train_tweet_class] = getTweets(training_dataset)
        classifier.fit(train_tweets,train_tweet_class)

In [192]:
def classify(classifier,test):
    [test_tweets,test_tweet_class] = getTweets(test)
    prediction = classifier.predict(test_tweets)
    report = classification_report(test_tweet_class, prediction)
    accuracy = accuracy_score(test_tweet_class, prediction)
    print(report)
    print('Accuracy : ', accuracy)
    print('======================================================')

In [193]:
def LinearSVCClassifier(train,test):
    classifier = Pipeline([
                ('vectorizer', TfidfVectorizer(min_df=10,max_df=0.8,sublinear_tf=True,use_idf=True)),
                ('clf', OneVsRestClassifier(LinearSVC()))])
    print('Linear SVM')
    buildclassifier(train,classifier)
    classify(classifier,test)

In [169]:
def LogisticRegressionClassifier(train,test):
    classifier = Pipeline([
                ('vectorizer', TfidfVectorizer(min_df=10,max_df=0.8,sublinear_tf=True,use_idf=True)),
                ('clf', LogisticRegression())])
    print('Logistic Regression')
    buildclassifier(train,classifier)
    classify(classifier,test)

In [170]:
def BernoulliNBClassifier(train,test):
    classifier = Pipeline([
                ('vectorizer', TfidfVectorizer(min_df=10,max_df=0.8,sublinear_tf=True,use_idf=True)),
                ('clf', BernoulliNB())])
    print('Bernoulli NB')
    buildclassifier(train,classifier)
    classify(classifier,test)

In [171]:
def MultinomialNBClassifier(train,test):
    classifier = Pipeline([
                ('vectorizer', TfidfVectorizer(min_df=10,max_df=0.8,sublinear_tf=True,use_idf=True)),
                ('clf', MultinomialNB())])
    print('Multinomial NB')
    buildclassifier(train,classifier)
    classify(classifier,test)

In [None]:
def RandomForest(train,test):
    classifier = Pipeline([
                ('vectorizer', TfidfVectorizer(min_df=10,max_df=0.8,sublinear_tf=True,use_idf=True)),
                ('clf', RandomForestClassifier(n_estimators=10))])
    print('RandomForestClassifier')
    buildclassifier(train,classifier)
    classify(classifier,test)

In [197]:
def AdaBoost(train,test):
    classifier = Pipeline([
                ('vectorizer', TfidfVectorizer(min_df=10,max_df=0.8,sublinear_tf=True,use_idf=True)),
                ('clf', AdaBoostClassifier(n_estimators=100))])
    print('AdaBoostClassifier')
    buildclassifier(train,classifier)
    classify(classifier,test)

In [None]:
def NaiveBayesClassifierNLTK(train,test):
    

In [198]:
def ObamaClassifiers(train,test):
    print('======================================================')
    print('               Classify Obama Tweets                  ')
    print('======================================================')
    NaiveBayesClassifierNLTK(train,test)
    LinearSVCClassifier(train,test)
    LogisticRegressionClassifier(train,test)
    BernoulliNBClassifier(train,test)
    MultinomialNBClassifier(train,test)
    RandomForest(train,test)
    AdaBoost(train,test)
    print('======================================================')
    print('======================================================')

In [199]:
def RomneyClassifiers(train,test):
    print('======================================================')
    print('               Classify Romney Tweets                 ')
    print('======================================================')
    LinearSVCClassifier(train,test)
    LogisticRegressionClassifier(train,test)
    BernoulliNBClassifier(train,test)
    MultinomialNBClassifier(train,test)
    RandomForest(train,test)
    AdaBoost(train,test)
    print('======================================================')
    print('======================================================')

In [200]:
xlsx = pandas.ExcelFile("../Data/training-Obama-Romney-tweets.xlsx")
ObamaTrainSet = pandas.read_excel(xlsx, "ObamaTrain", parse_cols=[3,4])
ObamaTestSet = pandas.read_excel(xlsx, "ObamaTest", parse_cols=[3,4])
RomneyTrainSet = pandas.read_excel(xlsx, "RomneyTrain", parse_cols=[3,4])
RomneyTestSet = pandas.read_excel(xlsx, "RomneyTest", parse_cols=[3,4])
ObamaClassifiers(ObamaTrainSet,ObamaTestSet)
RomneyClassifiers(RomneyTrainSet,RomneyTestSet)

               Classify Obama Tweets                  
Linear SVM
             precision    recall  f1-score   support

       -1.0       0.54      0.56      0.55       143
        0.0       0.60      0.54      0.57       182
        1.0       0.50      0.54      0.52       130

avg / total       0.55      0.55      0.55       455

Accuracy :  0.547252747253
Logistic Regression
             precision    recall  f1-score   support

       -1.0       0.52      0.55      0.53       143
        0.0       0.59      0.56      0.58       182
        1.0       0.52      0.53      0.52       130

avg / total       0.55      0.55      0.55       455

Accuracy :  0.547252747253
Bernoulli NB
             precision    recall  f1-score   support

       -1.0       0.57      0.53      0.55       143
        0.0       0.59      0.46      0.52       182
        1.0       0.46      0.64      0.54       130

avg / total       0.55      0.53      0.53       455

Accuracy :  0.534065934066
Multinomial NB
 