##### Babaahmadi-narges
### Importing needed libraries

In [79]:
import pandas as pd
import numpy as np
import string
import re
import nltk
from nltk import word_tokenize
from spellchecker import SpellChecker
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import ShuffleSplit, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LinearRegression

### Reading the dataset
Here I read the dataset. A tweet is not always contained in a single cell and can be distributed across several columns, which is why I combined all the pieces and converted the dataset so that each tweet is in one cell.

In [41]:
# Read the tab-separated Sanders file (it has no header row) and name the
# two positional columns in the same expression.
df = (
    pd.read_csv("SandersPosNeg.csv", sep="\t", header=None)
    .rename(columns={0: "label", 1: "tweet"})
)
df.head()

Unnamed: 0,label,tweet
0,0,RT @cjwallace03: So apparently @apple put MB c...
1,0,RT @Jewelz2611 @mashable @apple iphones r 2 ex...
2,0,@mashable @apple iphones r 2 expensive. Most w...
3,0,THiS IS WHAT WiLL KiLL APPLE http://t.co/72Jw4...
4,4,Now all @Apple has to do is get swype on the i...


### preprocessing
Now it's time for cleaning the data.\
The first step is to **remove the punctuation and numbers**.

In [42]:
def punctuation(tweet):
    """Return ``tweet`` without ASCII punctuation and without the digits 0-9.

    Matching characters are dropped outright (they are not replaced by a
    space); every other character, whitespace included, is kept in order.
    """
    kept = [
        ch for ch in tweet
        if not (ch in string.punctuation or ch in "0123456789")
    ]
    return "".join(kept)

In [43]:
# Strip punctuation and digits from every tweet; df is rebound to a frame
# whose 'tweet' column holds the cleaned text.
df = df.assign(tweet=df['tweet'].apply(punctuation))
df.head()

Unnamed: 0,label,tweet
0,0,RT cjwallace So apparently apple put MB cap on...
1,0,RT Jewelz mashable apple iphones r expensive ...
2,0,mashable apple iphones r expensive Most went ...
3,0,THiS IS WHAT WiLL KiLL APPLE httptcoJwzc RiP A...
4,4,Now all Apple has to do is get swype on the ip...


The next step is to **convert all letters into lowercase**.\
I do this to avoid counting one word multiple times as different words.

In [44]:
# Lower-case each tweet so differently-cased spellings collapse to one token.
df['tweet'] = df['tweet'].apply(str.lower)
df.head()

Unnamed: 0,label,tweet
0,0,rt cjwallace so apparently apple put mb cap on...
1,0,rt jewelz mashable apple iphones r expensive ...
2,0,mashable apple iphones r expensive most went ...
3,0,this is what will kill apple httptcojwzc rip a...
4,4,now all apple has to do is get swype on the ip...


**Removing extra whitespace** step:

In [45]:
def whitespace(tweet):
    """Collapse every run of whitespace in ``tweet`` to a single space.

    Leading and trailing whitespace is removed as a consequence of
    splitting on whitespace and re-joining the pieces.
    """
    pieces = tweet.split()
    return " ".join(pieces)

In [46]:
# Normalise spacing in every tweet; df is rebound to the updated frame.
df = df.assign(tweet=df['tweet'].apply(whitespace))
df.head()

Unnamed: 0,label,tweet
0,0,rt cjwallace so apparently apple put mb cap on...
1,0,rt jewelz mashable apple iphones r expensive m...
2,0,mashable apple iphones r expensive most went w...
3,0,this is what will kill apple httptcojwzc rip a...
4,4,now all apple has to do is get swype on the ip...


The next step is to **tokenize** all tweets:\
Tokenization is the process of splitting text into pieces called tokens.\
I did tokenization using nltk library.

In [47]:
# Replace each tweet string with the token list produced by NLTK's
# word_tokenize (passed directly; no wrapper lambda is needed).
df['tweet'] = df['tweet'].apply(word_tokenize)
df.head()

Unnamed: 0,label,tweet
0,0,"[rt, cjwallace, so, apparently, apple, put, mb..."
1,0,"[rt, jewelz, mashable, apple, iphones, r, expe..."
2,0,"[mashable, apple, iphones, r, expensive, most,..."
3,0,"[this, is, what, will, kill, apple, httptcojwz..."
4,4,"[now, all, apple, has, to, do, is, get, swype,..."


Tweets might have **spelling errors**, so in order to maximize our accuracy I corrected the errors using the Python package pyspellchecker.

In [48]:
def spelling(tweet, checker=None):
    """Spell-correct every token of a tokenized tweet.

    Parameters
    ----------
    tweet : iterable of str
        Tokens to correct.
    checker : object, optional
        Any object exposing ``correction(word)``. When omitted, one
        ``SpellChecker`` is created on the first call and reused afterwards,
        instead of being rebuilt for every tweet.

    Returns
    -------
    list of str
        One entry per input token, in the same order. A token is kept
        unchanged when the checker returns ``None`` for it, so the result
        never contains ``None`` and can be lemmatized and joined later.
    """
    if checker is None:
        # Cache the default checker on the function object so the
        # dictionary is only loaded once per kernel session.
        checker = getattr(spelling, "_default_checker", None)
        if checker is None:
            checker = SpellChecker()
            spelling._default_checker = checker

    corrected = []
    for token in tweet:
        suggestion = checker.correction(token)
        # Fall back to the original token when no correction is available.
        corrected.append(token if suggestion is None else suggestion)
    return corrected

In [49]:
# Run the spell-corrector over the token list of every tweet.
df['tweet'] = df['tweet'].apply(spelling)
df.head()

Unnamed: 0,label,tweet
0,0,"[it, wallace, so, apparently, apple, put, my, ..."
1,0,"[it, jewels, washable, apple, iphones, i, expe..."
2,0,"[washable, apple, iphones, i, expensive, most,..."
3,0,"[this, is, what, will, kill, apple, httptcojwz..."
4,4,"[now, all, apple, has, to, do, is, get, swipe,..."


One important step in preprocessing is to **remove the stop words**.\
Stop words are a set of commonly used words in a language that don't really affect our answers.\
Some examples of stop words are:
* i
* me
* we
* you

In [50]:
# NLTK's English stop-word list; stopwords_f looks this name up at call time.
stopwords_a = stopwords.words('english')


def stopwords_f(tweet):
    """Return the tokens of ``tweet`` that are not in ``stopwords_a``.

    The relative order of the surviving tokens is preserved.
    """
    return [token for token in tweet if token not in stopwords_a]

In [51]:
# Drop stop words from the token list of every tweet.
df['tweet'] = df['tweet'].apply(stopwords_f)
df.head()

Unnamed: 0,label,tweet
0,0,"[wallace, apparently, apple, put, cap, sums, n..."
1,0,"[jewels, washable, apple, iphones, expensive, ..."
2,0,"[washable, apple, iphones, expensive, went, ht..."
3,0,"[kill, apple, httptcojwzc, rip, apple]"
4,4,"[apple, get, swipe, iphone, crack, iphone]"


Now it's time for **Stemming** and **Lemmatization**.\
Stemming means to cut off the end or the beginning of the word, taking into account a list of common prefixes and suffixes that can be found in an inflected word.\
Lemmatization, unlike stemming, reduces inflected words properly, ensuring that the root word belongs to the language.\
I used lemmatization here.

In [52]:
def lemmatization(tweet, lemmatizer=None):
    """Lemmatize every token of a tokenized tweet.

    Parameters
    ----------
    tweet : sequence of str
        Tokens to lemmatize. May be empty, e.g. when every word of the
        tweet was removed as a stop word.
    lemmatizer : object, optional
        Any object exposing ``lemmatize(word)``. Defaults to a fresh
        ``WordNetLemmatizer``.

    Returns
    -------
    numpy.ndarray
        String array with one lemma per input token, in order. An empty
        input yields an empty array.
    """
    if lemmatizer is None:
        lemmatizer = WordNetLemmatizer()

    # A plain comprehension is used instead of np.vectorize: vectorize raises
    # ValueError on size-0 input (unless otypes is set) and calls the function
    # an extra time on the first element to infer the output type.
    lemmas = [lemmatizer.lemmatize(token) for token in tweet]
    return np.array(lemmas, dtype=str)

In [53]:
# Lemmatize the token list of every tweet.
df['tweet'] = df['tweet'].apply(lemmatization)
df.head()

Unnamed: 0,label,tweet
0,0,"[wallace, apparently, apple, put, cap, sum, ne..."
1,0,"[jewel, washable, apple, iphones, expensive, w..."
2,0,"[washable, apple, iphones, expensive, went, ht..."
3,0,"[kill, apple, httptcojwzc, rip, apple]"
4,4,"[apple, get, swipe, iphone, crack, iphone]"


#### TF-IDF vectorization:
As you can see, the tweet column now contains arrays of tokens. I first have to convert each one back to a string.

In [54]:
def string_maker(tweet):
    """Join the tokens of ``tweet`` into one space-separated string."""
    separator = ' '
    return separator.join(tweet)

In [55]:
# Turn each token sequence back into a single string for the vectorizer.
df['tweet'] = df['tweet'].apply(string_maker)
df.head()

Unnamed: 0,label,tweet
0,0,wallace apparently apple put cap sum new updat...
1,0,jewel washable apple iphones expensive went ht...
2,0,washable apple iphones expensive went htcgalax...
3,0,kill apple httptcojwzc rip apple
4,4,apple get swipe iphone crack iphone


Now I can do tfidf vectorization.

In [56]:
# Targets are the 'label' column as-is.
y_1 = df['label']

# Features are TF-IDF weights learned from, and computed for, every tweet.
# NOTE(review): the vectorizer is fitted on all rows before any train/test
# split, so its IDF statistics include rows later used for testing.
vectorizer = TfidfVectorizer()
X_1 = vectorizer.fit_transform(df['tweet'])

### Training naive Bayes using 10 random shuffle splits (`ShuffleSplit`, 27% held out per split) on SandersPosNeg

In [86]:
# Ten random train/test partitions; 27% of the rows are held out in each.
cross_validation = ShuffleSplit(n_splits=10, test_size=0.27, random_state=0)

# Default multinomial naive Bayes; NB_result is the mean score over the splits.
clf = MultinomialNB()
NB_result = cross_val_score(clf, X_1, y_1, cv=cross_validation).mean()

print('accuracy:', NB_result*100)

accuracy: 82.90030211480361


**As you can see I got the accuracy of 82.9% on SandersPosNeg dataset**

### Doing the same steps for OMD

In [15]:
# Read the OMD file (no header row, Mac Roman encoding) and name its first
# column 'tweet' in the same expression.
df2 = (
    pd.read_csv("OMD.csv", sep="\t", header=None, encoding='mac_roman')
    .rename(columns={0: "tweet"})
)
df2.head()

Unnamed: 0,tweet
0,"0,Watching by myself #tweetdebate Not drinkin..."
1,"0,@ahg3 @MichDot Yeah, slime was actually my s..."
2,"0,Preparing to have a heart attack #tweetdebate,"
3,"0,no debate moderators under 50, sorry #tweet..."
4,"0,@current Now staring at black screen on http..."


In [16]:
def label(tweet):
    """Parse the integer label that precedes the first comma of a raw row.

    Parameters
    ----------
    tweet : str
        Raw row text of the form ``"<label>,<tweet text>"``.

    Returns
    -------
    int
        The text before the first comma, converted with ``int``.

    Raises
    ------
    ValueError
        If ``tweet`` contains no comma, or the text before the first comma
        is not a valid integer.
    """
    prefix, comma, _rest = tweet.partition(',')
    if not comma:
        # Without this guard, str.find would return -1 and the slice
        # tweet[:-1] would silently drop the last character of the row.
        raise ValueError(f"no ',' separating label from text in {tweet!r}")
    return int(prefix)

In [17]:
# Derive the numeric label from the prefix of each raw row.
df2['label'] = df2['tweet'].apply(label)
df2.head()

Unnamed: 0,tweet,label
0,"0,Watching by myself #tweetdebate Not drinkin...",0
1,"0,@ahg3 @MichDot Yeah, slime was actually my s...",0
2,"0,Preparing to have a heart attack #tweetdebate,",0
3,"0,no debate moderators under 50, sorry #tweet...",0
4,"0,@current Now staring at black screen on http...",0


In [18]:
# Strip punctuation and digits from every row; this also removes the
# leading "<label>," prefix characters from the text.
df2 = df2.assign(tweet=df2['tweet'].apply(punctuation))
df2.head()

Unnamed: 0,tweet,label
0,Watching by myself tweetdebate Not drinking ...,0
1,ahg MichDot Yeah slime was actually my second ...,0
2,Preparing to have a heart attack tweetdebate,0
3,no debate moderators under sorry tweetdebate,0
4,current Now staring at black screen on httpwww...,0


In [20]:
# Lower-case each tweet.
df2['tweet'] = df2['tweet'].apply(str.lower)
df2.head()

Unnamed: 0,tweet,label
0,watching by myself tweetdebate not drinking ...,0
1,ahg michdot yeah slime was actually my second ...,0
2,preparing to have a heart attack tweetdebate,0
3,no debate moderators under sorry tweetdebate,0
4,current now staring at black screen on httpwww...,0


In [22]:
# Normalise spacing in every tweet; df2 is rebound to the updated frame.
df2 = df2.assign(tweet=df2['tweet'].apply(whitespace))
df2.head()

Unnamed: 0,tweet,label
0,watching by myself tweetdebate not drinking wa...,0
1,ahg michdot yeah slime was actually my second ...,0
2,preparing to have a heart attack tweetdebate,0
3,no debate moderators under sorry tweetdebate,0
4,current now staring at black screen on httpwww...,0


In [24]:
# Replace each tweet string with its NLTK token list.
df2['tweet'] = df2['tweet'].apply(word_tokenize)
df2.head()

Unnamed: 0,tweet,label
0,"[watching, by, myself, tweetdebate, not, drink...",0
1,"[ahg, michdot, yeah, slime, was, actually, my,...",0
2,"[preparing, to, have, a, heart, attack, tweetd...",0
3,"[no, debate, moderators, under, sorry, tweetde...",0
4,"[current, now, staring, at, black, screen, on,...",0


In [26]:
# Run the spell-corrector over the token list of every tweet.
df2['tweet'] = df2['tweet'].apply(spelling)
df2.head()

Unnamed: 0,tweet,label
0,"[watching, by, myself, tweetdebate, not, drink...",0
1,"[ah, microdot, yeah, slime, was, actually, my,...",0
2,"[preparing, to, have, a, heart, attack, tweetd...",0
3,"[no, debate, moderators, under, sorry, tweetde...",0
4,"[current, now, staring, at, black, screen, on,...",0


In [30]:
# Drop stop words from the token list of every tweet.
df2['tweet'] = df2['tweet'].apply(stopwords_f)
df2.head()

Unnamed: 0,tweet,label
0,"[watching, tweetdebate, drinking, waiting, sta...",0
1,"[ah, microdot, yeah, slime, actually, second, ...",0
2,"[preparing, heart, attack, tweetdebate]",0
3,"[debate, moderators, sorry, tweetdebate]",0
4,"[current, staring, black, screen, httpwwwcurre...",0


In [31]:
# Lemmatize the token list of every tweet.
df2['tweet'] = df2['tweet'].apply(lemmatization)
df2.head()

Unnamed: 0,tweet,label
0,"[watching, tweetdebate, drinking, waiting, sta...",0
1,"[ah, microdot, yeah, slime, actually, second, ...",0
2,"[preparing, heart, attack, tweetdebate]",0
3,"[debate, moderator, sorry, tweetdebate]",0
4,"[current, staring, black, screen, httpwwwcurre...",0


In [32]:
# Turn each token sequence back into a single string for the vectorizer.
df2['tweet'] = df2['tweet'].apply(string_maker)
df2.head()

Unnamed: 0,tweet,label
0,watching tweetdebate drinking waiting start cr...,0
1,ah microdot yeah slime actually second choice ...,0
2,preparing heart attack tweetdebate,0
3,debate moderator sorry tweetdebate,0
4,current staring black screen httpwwwcurrentcom...,0


In [33]:
# Targets are the parsed 'label' column.
y = df2['label']

# A fresh vectorizer replaces the earlier one; features are TF-IDF weights
# learned from, and computed for, every OMD tweet.
# NOTE(review): fitting happens on all rows before any train/test split, so
# the IDF statistics include rows later used for testing.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df2['tweet'])

### Training naive Bayes using 10 random shuffle splits (`ShuffleSplit`, 5% held out per split) on OMD

In [77]:
# Ten random train/test partitions; 5% of the rows are held out in each.
cross_validation = ShuffleSplit(n_splits=10, test_size=0.05, random_state=0)

# Multinomial naive Bayes with smoothing alpha=0.4; NB_result is the mean
# score over the splits.
clf = MultinomialNB(alpha=0.4)
NB_result = cross_val_score(clf, X, y, cv=cross_validation).mean()

print('accuracy:', NB_result*100)

accuracy: 76.35416666666669


**I got an accuracy of 76.4% on the OMD dataset**