## Bring in the txt files into Pandas dataframe

In [162]:
# import dependencies
import pandas as pd
from collections import defaultdict
from pathlib import Path
import nltk as nl
from nltk.tokenize import word_tokenize
import re
from nltk.tokenize.casual import TweetTokenizer
import numpy as np
import math

In [3]:
nl.download('punkt')

[nltk_data] Downloading package punkt to C:\Users\Hari
[nltk_data]     Ravella\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [293]:
import os 
os.chdir("C:\\Users\\Hari Ravella\\Downloads\\sentiment_classification-master")
#set directory path
my_dir_path = "tweet/train/positive"

In [294]:
# create list to store text
results = defaultdict(list)

In [295]:
# loop through files and append text to list
for file in Path(my_dir_path).iterdir():
    with open(file, "r", encoding="utf8") as file_open:
        results["text"].append(file_open.read())

In [296]:
# read the list in as a dataframe
df_pos = pd.DataFrame(results)

In [297]:
# take a look at dataframe
df_pos.head()

Unnamed: 0,text
0,@SouthwestAir I would appreciate that. Thank you.\n
1,@USAirways thank you very much.\n
2,@JetBlue I'm all set. About to fly. Not bad for a first date with a giant metal bird machine. She even brought snacks.\n
3,@SouthwestAir I got a flight at 11:55am on Thursday but looking for something tomorrow anything available?\n
4,@AmericanAir you're my early frontrunner for best airline! #oscars2016\n


In [298]:
#set directory path
my_dir_path_neg = "tweet/train/negative"

# create list to store text
results_neg = defaultdict(list)

# loop through files and append text to list
for file in Path(my_dir_path_neg).iterdir():
    with open(file, "r", encoding="utf8") as file_open:
        results_neg["text"].append(file_open.read())
        
# read the list in as a dataframe
df_neg = pd.DataFrame(results_neg)
df_neg.head()

Unnamed: 0,text
0,@united Really....you charge me $25 to check a bag and then you put it on a different flight....still Don't have my bag!!!\n
1,.@JetBlue thanks for making an effort. Credit where credit is due: flight 795 delayed 5 hours instead of 8 hours. #fwiw #loweredexpectat...
2,@united plz don't advertise wifi if it's not gonna work thanks #worstflightever\n
3,@SouthwestAir - 800 is not int'l friendly\n
4,@USAirways thanks for a subpar travel experience and it's not even over yet #stepitup\n


In [300]:
#add sentiment to both datasets and then combine them for test data 1 for positive and 0 for negative
df_pos['Sentiment']=1
df_neg['Sentiment']=0
frames = [df_pos, df_neg]
df = pd.concat(frames)

In [301]:
df.shape

(4181, 2)

## Create Vocabulary

In [302]:
# increase column width to see more of the tweets
pd.set_option('max_colwidth', 140)

# reshuffle the tweets to see both pos and neg in random order
df = df.sample(frac=1).reset_index(drop=True)

# explore top 5 rows
df.head(5)

Unnamed: 0,text,Sentiment
0,@united They finally gave in a let him on. After they threatened to send him back to Vegas on coach. Thnx.\n,1
1,@SouthwestAir I got it added thank you! :)\n,1
2,@AmericanAir lost my cats missed their flights kept them crated 30 hrs for a would-be 5 hr trip. You'll never touch my pets again.\n,0
3,@united you already have vomit so you are halfway there\n,0
4,@united - after having to now TAG MY OWN bags at the airport I was hoping they would actually arrive WITH me - here's hoping they arrive\n,0


In [303]:
# Remove any markup tags (HTML), all the mentions of handles(starts with '@') and '#' character
def cleantweettext(raw_html):
    pattern = re.compile('<.*?>')
    cleantext = re.sub(pattern, '', raw_html)
    cleantext = " ".join(filter(lambda x:x[0]!='@', cleantext.split()))
    cleantext = cleantext.replace('#', '')
    return cleantext

In [304]:
def removeat(text):
    atlist=[]
    for word in text:
        pattern = re.compile('^@')
        if re.match(pattern,word):
            #cleantext1 = re.sub(pattern, word[1:], word)
            atlist.append(word[1:])
        else:
            atlist.append(word)
    return atlist

In [305]:
def tolower(text):
    lowerlist=[]
    for word in text:
        pattern = re.compile('[A-Z][a-z]+')
        if re.match(pattern,word):
            cleantext1 = re.sub(pattern, word.lower(), word)
            lowerlist.append(cleantext1)
        else:
            lowerlist.append(word)
    return lowerlist

In [306]:
cleantweet= []
for doc in df.text:
    cleantweet.append(cleantweettext(doc))


tokentweet=[]
df.text= cleantweet
for doc in df.text:
    tokentweet.append(TweetTokenizer().tokenize(doc))
    
df.text= tokentweet

In [307]:
removeattweet=[]
for doc in df.text:
    removeattweet.append(removeat(doc))
df.text =removeattweet

In [308]:
lowertweet=[]
for doc in df.text:
    lowertweet.append(tolower(doc))
df.text = lowertweet

In [309]:
tweets=[]
for x in df.text:
    tweet = ''
    for word in x:
        tweet += word+' '
    tweets.append(word_tokenize(tweet))
df.text= tweets

In [310]:
#stemming
stemtweets=[]
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords=False)
#ps= PorterStemmer()
for x in df.text:
    stemtweet=''
    for word in x:
        stemtweet=stemtweet+stemmer.stem(word)+' '
    stemtweets.append(word_tokenize(stemtweet))
df['stemmed']=stemtweets

In [311]:
### Finalize both the stemmed and unstemmed dataframes
df_unstemmed = df.drop(['stemmed'], axis=1)
df_unstemmed.head()

Unnamed: 0,text,Sentiment
0,"[they, finally, gave, in, a, let, him, on, ., after, they, threatened, to, send, him, back, to, vegas, on, coach, ., thnx, .]",1
1,"[I, got, it, added, thank, you, !, :, )]",1
2,"[lost, my, cats, missed, their, flights, kept, them, crated, 30, hrs, for, a, would-be, 5, hr, trip, ., you'll, 'll, never, touch, my, p...",0
3,"[you, already, have, vomit, so, you, are, halfway, there]",0
4,"[-, after, having, to, now, TAG, MY, OWN, bags, at, the, airport, I, was, hoping, they, would, actually, arrive, WITH, me, -, here, 's, ...",0


In [312]:
# create a df with stemmed text
df_stemmed = df.drop(['text'], axis=1)
df_stemmed.head()

Unnamed: 0,Sentiment,stemmed
0,1,"[they, final, gave, in, a, let, him, on, ., after, they, threaten, to, send, him, back, to, vega, on, coach, ., thnx, .]"
1,1,"[i, got, it, ad, thank, you, !, :, )]"
2,0,"[lost, my, cat, miss, their, flight, kept, them, crate, 30, hrs, for, a, would-b, 5, hr, trip, ., you, 'll, ll, never, touch, my, pet, a..."
3,0,"[you, alreadi, have, vomit, so, you, are, halfway, there]"
4,0,"[-, after, have, to, now, tag, my, own, bag, at, the, airport, i, was, hope, they, would, actual, arriv, with, me, -, here, 's, hope, th..."


## Extract Features

In [313]:
# import dependencies
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

##### Frequency Count


In [323]:
# initiate count vectorizer
def dummy_fun(doc):
    return doc

def InitializeVectorization(text, kind):
    if kind == 'binary':
        vectorizer = CountVectorizer(binary = True, analyzer='word', tokenizer=dummy_fun, preprocessor=dummy_fun, token_pattern=None)  
    else:
        vectorizer = CountVectorizer(analyzer='word', tokenizer=dummy_fun, preprocessor=dummy_fun, token_pattern=None)  
    return vectorizer
    
def InitializeVocab(vectorizer, text):
    vectorizer.fit(text)
    freqVocab = vectorizer.vocabulary_
    train_vector = vectorizer.transform(text)
    len(freqVocab)
    return freqVocab   


## Training

### Frequency - No stemming

In [324]:
vectorizer = VectorizationProc(df['text'], 'binary')
freqVocab = InitializeVocab(vectorizer, df['text'])

#Create bigdoc that contains words in V, their corresponding frequencies for each class
#1.Transform pos and neg tweets into seprate vectors
train_pos_vector1 = vectorizer.transform(df[df['Sentiment']==1]['text'])
train_neg_vector1 = vectorizer.transform(df[df['Sentiment']==0]['text'])

#2. column sum of vectors(word per column)
sum_pos = train_pos_vector1.sum(axis = 0)
sum_neg = train_neg_vector1.sum(axis = 0)

#3. Initialize bigdoc as a dataframe
bigdoc = pd.DataFrame(index = list(set(freqVocab.keys())), columns = ['pos', 'neg'])

#4. get the corresponding frequency from the above matrx and set it to bigdoc
for word in freqVocab.keys():
    index = freqVocab.get(word)
    bigdoc.at[word, 'pos'] = sum_pos[:, index].item()
    bigdoc.at[word, 'neg'] = sum_neg[:, index].item()

In [331]:
bigdoc.head(10)

Unnamed: 0,pos,neg
please,19,97
b40,0,1
lights,0,2
thur,0,1
hover,0,1
cabo,1,0
hates,0,2
model,0,4
except,0,7
full,3,21


In [328]:
import math
def Naivebayes(data,category,vector,bigvec):
    logprob = bigvec.copy()
    priors = []
    for cat in category:
        ndoc= len(data)
        nc= len(data[data['Sentiment']== cat])
        prior = nc/ndoc
        print(prior)    
        priors.append(prior)
        
        if cat == 0:
            colname = 'neg'
        else:
            colname = 'pos'
        
        denominator = bigvec[colname].sum() + len(bigvec) #denominator for likelihood
        logprob[colname] = bigvec[colname].apply(lambda x:math.log((x+1)/denominator)) #likelihood
           
    return [logprob,priors]        

In [329]:
result = Naivebayes(df, [0,1], freqVector,bigdoc)

0.7175316909830184
0.28246830901698156


In [None]:
def TestNaiveBayes(testdoc, logprior, loglikelihood, category, V) returns best c
for each class c 2 C
sum[c]  logprior[c]
for each position i in testdoc
word testdoc[i]
if word 2 V
sum[c] sum[c]+ loglikelihood[word,c]
return argmaxc sum[c]