In [None]:
#Data Analysis
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#Data Preprocessing and Feature Engineering
from textblob import TextBlob
import re

# !pip install tweet-preprocessor
# !pip install textblob

import nltk
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('punkt')

from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer



import preprocessor as p

#Model Selection and Validation
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score

In [None]:
# Input workbooks, one per candidate (the file names indicate unlabelled test data).
OBAMA = 'final-testData-no-label-Obama-tweets(1).xlsx'
ROMNEY = 'final-testData-no-label-Romney-tweets(1).xlsx'

# Both workbooks are read with the same options: first sheet, no header row,
# rows 0 and 1 skipped, and the column name 'tweet' supplied explicitly.
_READ_OPTIONS = dict(
    sheet_name=0,
    header=None,
    index_col=None,
    skiprows=[0, 1],
    names=['tweet'],
)

df_obama = pd.read_excel(OBAMA, **_READ_OPTIONS)
df_romney = pd.read_excel(ROMNEY, **_READ_OPTIONS)

In [None]:
# # #Cast the results column to string since it contains both 2 & '2'
# df_obama['class'] = df_obama['class'].astype(str)
# sns.countplot(x = 'class', data = df_obama)

# df_romney

In [None]:
# df_romney['class'] = df_romney['class'].astype(str)
# sns.countplot(x = 'class', data = df_romney)

In [None]:
# # Drop rows not having 0, 1, -1
# df_obama = df_obama[df_obama['class'].isin(['0', '1', '-1'])] 
# df_romney = df_romney[df_romney['class'].isin(['0', '1', '-1'])] 

# # Print the shape of the dataframe 
# print(df_obama.shape) 
# print(df_romney.shape) 

In [None]:
# sns.countplot(x = 'class', data = df_obama)

In [None]:
# Inspect the Obama frame as loaded (bare last expression, so the notebook renders it).
df_obama

In [None]:
# Coerce every tweet to str, then record its character count in 'tweet_length'.
# The same two steps are applied to each candidate's frame in turn.
for frame in (df_obama, df_romney):
    frame['tweet'] = frame['tweet'].astype(str)
    # Values are already str after the cast above, so len() can be mapped directly.
    frame['tweet_length'] = frame['tweet'].map(len)

In [None]:
def clean_sentence(tweet):
    """Remove angle-bracket tags from a tweet and re-join its word tokens.

    Args:
        tweet: Raw tweet text (str).

    Returns:
        A single string made of the tokens in ``TextBlob(...).words``,
        separated by one space each.
    """
    # Drop anything that looks like <...> (e.g. leftover markup tags).
    without_tags = re.sub(r'<[^>]+>', '', tweet)
    words = TextBlob(without_tags).words
    return ' '.join(words)

def clean_stopwords(tweet, stop_words=None):
    """Tokenize a tweet and drop placeholder, non-alphabetic and stop-word tokens.

    A whitespace-separated token is kept only if it
      * is not the literal placeholder ``'user'``,
      * consists solely of word characters that are not digits
        (letters and underscores), and
      * is not a stop word when compared in lower case.

    Args:
        tweet: Text to tokenize (str).
        stop_words: Optional container of lower-case stop words. When omitted,
            the NLTK English stop-word list is used. Callers that process many
            tweets can pass a pre-built set to avoid rebuilding it on each call.

    Returns:
        list[str]: The surviving tokens, in order, with their original casing.
    """
    if stop_words is None:
        # Build the lookup once per call. Previously the stop-word list was
        # re-fetched and scanned linearly for every single token.
        stop_words = set(stopwords.words('english'))

    return [
        token
        for token in tweet.split()
        if token != 'user'
        and re.match(r'[^\W\d]*$', token)
        and token.lower() not in stop_words
    ]

# lexical normalization
def normalization(tweet_list):
    """Lemmatize each token with WordNet, treating every token as a verb.

    Args:
        tweet_list: Iterable of word tokens.

    Returns:
        list: One lemmatized token per input token, in the same order.
    """
    lemmatizer = WordNetLemmatizer()
    # 'v' is the part-of-speech tag passed to the lemmatizer for every word.
    return [lemmatizer.lemmatize(token, 'v') for token in tweet_list]

In [None]:
def _clean_tweet(raw_tweet):
    """Run the full cleaning chain: tag removal, token filtering, lemmatization."""
    return normalization(clean_stopwords(clean_sentence(raw_tweet)))


# Store the cleaned token list for every tweet in a new 'tweet_cleaned' column.
df_obama['tweet_cleaned'] = df_obama['tweet'].apply(_clean_tweet)
df_romney['tweet_cleaned'] = df_romney['tweet'].apply(_clean_tweet)

# A bare expression is only rendered when it is the last statement of a cell,
# so the Obama preview must be displayed explicitly; otherwise it is evaluated
# and silently discarded.
display(df_obama.head(10))
df_romney.head(10)

In [None]:
# Persist both processed frames to CSV in the working directory (Romney first, then Obama).
for frame, out_path in (
    (df_romney, 'cleaned_romney.csv'),
    (df_obama, 'cleaned_obama.csv'),
):
    frame.to_csv(out_path)

In [None]:
def text_processing(x):
    """Turn one raw tweet into a list of normalized tokens.

    Chains the three preprocessing helpers: ``clean_sentence`` (tag removal and
    word re-joining), ``clean_stopwords`` (token filtering) and
    ``normalization`` (lemmatization). Passed to ``CountVectorizer`` as its
    ``analyzer`` callable.

    Args:
        x: Raw tweet text.

    Returns:
        The token list produced by ``normalization``.
    """
    return normalization(clean_stopwords(clean_sentence(x)))

# Three-stage text classifier:
#   bow        - token counts, using text_processing as the analyzer
#   tfidf      - re-weights the counts as TF-IDF scores
#   classifier - Multinomial Naive Bayes fitted on the TF-IDF features
pipeline = Pipeline(
    steps=[
        ('bow', CountVectorizer(analyzer=text_processing)),
        ('tfidf', TfidfTransformer()),
        ('classifier', MultinomialNB()),
    ]
)

In [None]:
# NOTE(review): df_obama is loaded earlier with names=['tweet'] from a workbook
# whose file name says "no-label", so the 'class' column read below does not
# appear to exist at this point - confirm that a labelled frame is loaded
# before running this cell.

# Hold out 10% of the tweets for evaluation; the fixed seed makes the split
# (and therefore the reported metrics) reproducible across runs.
msg_train, msg_test, label_train, label_test = train_test_split(
    df_obama['tweet'], df_obama['class'], test_size=0.1, random_state=42
)
pipeline.fit(msg_train, label_train)

predictions = pipeline.predict(msg_test)

# scikit-learn metrics take (y_true, y_pred). Passing them reversed swaps
# precision with recall in the report and transposes the confusion matrix.
print(classification_report(label_test, predictions))
print('\n')
print(confusion_matrix(label_test, predictions))
print(accuracy_score(label_test, predictions))

In [None]:
# Same feature extraction as before, but with a linear classifier trained by
# stochastic gradient descent instead of Naive Bayes. The classifier is seeded
# so repeated runs give the same model.
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=text_processing)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', SGDClassifier(random_state=42)),  # train on TF-IDF vectors w/ SGD linear classifier
])

# NOTE(review): df_obama is loaded earlier with names=['tweet'] from a workbook
# whose file name says "no-label", so the 'class' column read below does not
# appear to exist at this point - confirm that a labelled frame is loaded
# before running this cell.

# Hold out 20% of the tweets for evaluation, with a fixed seed for reproducibility.
msg_train, msg_test, label_train, label_test = train_test_split(
    df_obama['tweet'], df_obama['class'], test_size=0.2, random_state=42
)
pipeline.fit(msg_train, label_train)

predictions = pipeline.predict(msg_test)

# scikit-learn metrics take (y_true, y_pred). Passing them reversed swaps
# precision with recall in the report and transposes the confusion matrix.
print(classification_report(label_test, predictions))
print('\n')
print(confusion_matrix(label_test, predictions))
print(accuracy_score(label_test, predictions))

In [None]:
# Show the Obama frame in its current state (bare last expression, rendered by the notebook).
df_obama

In [None]:
# Show the Romney frame in its current state (bare last expression, rendered by the notebook).
df_romney