## Bring in the txt files into Pandas dataframe

In [162]:
# import dependencies
import pandas as pd
from collections import defaultdict
from pathlib import Path
import nltk as nl
from nltk.tokenize import word_tokenize
import re
from nltk.tokenize.casual import TweetTokenizer
import numpy as np

In [3]:
nl.download('punkt')

[nltk_data] Downloading package punkt to C:\Users\Hari
[nltk_data]     Ravella\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
import os 
os.chdir("C:\\Users\\Hari Ravella\\Downloads\\sentiment_classification-master")
#set directory path
my_dir_path = "tweet/train/positive"

In [5]:
# create list to store text
results = defaultdict(list)

In [6]:
# loop through files and append text to list
for file in Path(my_dir_path).iterdir():
    with open(file, "r", encoding="utf8") as file_open:
        results["text"].append(file_open.read())

In [7]:
# read the list in as a dataframe
df_pos = pd.DataFrame(results)

In [8]:
# take a look at dataframe
df_pos.head()

Unnamed: 0,text
0,@SouthwestAir I would appreciate that. Thank ...
1,@USAirways thank you very much.\n
2,@JetBlue I'm all set. About to fly. Not bad fo...
3,@SouthwestAir I got a flight at 11:55am on Thu...
4,@AmericanAir you're my early frontrunner for b...


In [9]:
#set directory path
my_dir_path_neg = "tweet/train/negative"

# create list to store text
results_neg = defaultdict(list)

# loop through files and append text to list
for file in Path(my_dir_path_neg).iterdir():
    with open(file, "r", encoding="utf8") as file_open:
        results_neg["text"].append(file_open.read())
        
# read the list in as a dataframe
df_neg = pd.DataFrame(results_neg)
df_neg.head()

Unnamed: 0,text
0,@united Really....you charge me $25 to check a...
1,.@JetBlue thanks for making an effort. Credit ...
2,@united plz don't advertise wifi if it's not g...
3,@SouthwestAir - 800 is not int'l friendly\n
4,@USAirways thanks for a subpar travel experien...


In [10]:
#add sentiment to both datasets and then combine them for test data 1 for positive and 0 for negative
df_pos['Sentiment']=1
df_neg['Sentiment']=0
frames = [df_pos, df_neg]
df = pd.concat(frames)

In [11]:
df.shape

(4181, 2)

## Create Vocabulary

In [12]:
# increase column width to see more of the tweets
pd.set_option('max_colwidth', 140)

# reshuffle the tweets to see both pos and neg in random order
df = df.sample(frac=1).reset_index(drop=True)

# explore top 5 rows
df.head(5)

Unnamed: 0,text,Sentiment
0,@USAirways customer service at its best! Rachel S. took great care of us at the PHX airport. http://t.co/HG7vEqhGHy\n,1
1,@united counter agents at RDU deserve a medal. #thankyou\n,1
2,@SouthwestAir is that the same reliable system couldn't find my info and then said it refund my credit card ?\n,0
3,@AmericanAir i got a new reservation for tomorrow. Thanks!\n,1
4,@USAirways @nm4agoodlife 5 hours on hold and no answer . Guess the synergy of a merger was really planned out\n,0


In [13]:
# Remove any markup tags (HTML), all the mentions of handles(starts with '@') and '#' character
def cleantweettext(raw_html):
    pattern = re.compile('<.*?>')
    cleantext = re.sub(pattern, '', raw_html)
    cleantext = " ".join(filter(lambda x:x[0]!='@', cleantext.split()))
    cleantext = cleantext.replace('#', '')
    return cleantext

In [14]:
def removeat(text):
    atlist=[]
    for word in text:
        pattern = re.compile('^@')
        if re.match(pattern,word):
            #cleantext1 = re.sub(pattern, word[1:], word)
            atlist.append(word[1:])
        else:
            atlist.append(word)
    return atlist

In [15]:
def tolower(text):
    lowerlist=[]
    for word in text:
        pattern = re.compile('[A-Z][a-z]+')
        if re.match(pattern,word):
            cleantext1 = re.sub(pattern, word.lower(), word)
            lowerlist.append(cleantext1)
        else:
            lowerlist.append(word)
    return lowerlist

In [16]:
cleantweet= []
for doc in df.text:
    cleantweet.append(cleantweettext(doc))


tokentweet=[]
df.text= cleantweet
for doc in df.text:
    tokentweet.append(TweetTokenizer().tokenize(doc))
    
df.text= tokentweet

In [17]:
removeattweet=[]
for doc in df.text:
    removeattweet.append(removeat(doc))
df.text =removeattweet

In [18]:
removeattweet[2]

['is',
 'that',
 'the',
 'same',
 'reliable',
 'system',
 "couldn't",
 'find',
 'my',
 'info',
 'and',
 'then',
 'said',
 'it',
 'refund',
 'my',
 'credit',
 'card',
 '?']

In [19]:
lowertweet=[]
for doc in df.text:
    lowertweet.append(tolower(doc))
df.text = lowertweet

In [20]:
lowertweet[2]

['is',
 'that',
 'the',
 'same',
 'reliable',
 'system',
 "couldn't",
 'find',
 'my',
 'info',
 'and',
 'then',
 'said',
 'it',
 'refund',
 'my',
 'credit',
 'card',
 '?']

In [21]:
tweets=[]
for x in df.text:
    tweet = ''
    for word in x:
        tweet += word+' '
    tweets.append(word_tokenize(tweet))
df.text= tweets

In [22]:
df.sample(frac=1).reset_index(drop=True)

Unnamed: 0,text,Sentiment
0,"[got, it, ., thanks, the, quick, reply, .]",1
1,"[I, ’, ve, filled, out, the, form, twice, ., no, email, ., I, have, a, lost, item, code, ., can, you, verify, it, was, received, ?]",0
2,"[bluemanity, loves, this, ., have, a, great, time, flying, this]",1
3,"[ill, check, it, out, appreciate, the, response, regardless, .]",1
4,"[you, service, agents, at, MCO, are, great, but, their, are, not, enough, of, them, working, right, now, !]",0
5,"[agent, in, LAS, letting, 20, customers, know, they, ca, n't, help, them, rebook, delayed, flight, to, DEN, unfriendlyskies, http, :, //...",0
6,"[I, 'm, trying, to, register, since, 12:00, do, n't, want, to, be, separated, from, my, brother, during, the, 15hours, flight, !, there'...",0
7,"[been, on, hold, over, an, hr, to, rebook, a, cancelled, flighted, flight, ., do, you, have, anyone, working, ?, ?, ?]",0
8,"[should, 've, been, on, one, of, your, flights, instead, ..., has, now, lost, our, bags, to, add, insult, to, injury]",0
9,"[I, love, the, admiral, clubs, !, thanks, hey, can, you, follow, me, ?]",1


In [23]:
#stemming
stemtweets=[]
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords=False)
#ps= PorterStemmer()
for x in df.text:
    stemtweet=''
    for word in x:
        stemtweet=stemtweet+stemmer.stem(word)+' '
    stemtweets.append(word_tokenize(stemtweet))
df['stemmed']=stemtweets

In [24]:
### Finalize both the stemmed and unstemmed dataframes
df_unstemmed = df.drop(['stemmed'], axis=1)
df_unstemmed.head()

Unnamed: 0,text,Sentiment
0,"[customer, service, at, its, best, !, rachel, S, ., took, great, care, of, us, at, the, PHX, airport, ., http, :, //t.co/HG7vEqhGHy]",1
1,"[counter, agents, at, RDU, deserve, a, medal, ., thankyou]",1
2,"[is, that, the, same, reliable, system, could, n't, find, my, info, and, then, said, it, refund, my, credit, card, ?]",0
3,"[i, got, a, new, reservation, for, tomorrow, ., thanks, !]",1
4,"[5, hours, on, hold, and, no, answer, ., guess, the, synergy, of, a, merger, was, really, planned, out]",0


In [25]:
# create a df with stemmed text
df_stemmed = df.drop(['text'], axis=1)
df_stemmed.head()

Unnamed: 0,Sentiment,stemmed
0,1,"[custom, servic, at, it, best, !, rachel, s, ., took, great, care, of, us, at, the, phx, airport, ., http, :, //t.co/hg7veqhghi]"
1,1,"[counter, agent, at, rdu, deserv, a, medal, ., thankyou]"
2,0,"[is, that, the, same, reliabl, system, could, n't, find, my, info, and, then, said, it, refund, my, credit, card, ?]"
3,1,"[i, got, a, new, reserv, for, tomorrow, ., thank, !]"
4,0,"[5, hour, on, hold, and, no, answer, ., guess, the, synergi, of, a, merger, was, realli, plan, out]"


## Extract Features

In [26]:
# import dependencies
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

##### Frequency Count


In [153]:
#extract text from df
text = df['text']

# initiate count vectorizer
def dummy_fun(doc):
    return doc

vectorizer = CountVectorizer(analyzer='word', tokenizer=dummy_fun, preprocessor=dummy_fun, token_pattern=None)  
vectorizer.fit(text)
freqVocab = vectorizer.vocabulary_
train_vector = vectorizer.transform(text)
len(freqVocab)

7674

In [154]:
train_vector.shape

(4181, 7674)

##### Binary Representation

In [69]:
# binaryVector = pd.Series(text).apply(pd.value_counts).fillna(0).astype(int)
# binaryVector.dtypes

In [152]:
#extract text from df
text = df['text']

# initiate count vectorizer
def dummy_fun(doc):
    return doc

vectorizer2 = CountVectorizer(binary = True, analyzer='word', tokenizer=dummy_fun, preprocessor=dummy_fun, token_pattern=None)  
vectorizer2.fit(text)
binaryVocab = vectorizer2.vocabulary_
train_binary_vector = vectorizer2.transform(text)
len(binaryVocab)

7674

In [79]:
train_binary_vector.toarray()

array([[1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

## Training

### Frequency - No stemming

In [155]:
# calculate the no. of negative sentiments
neg =  df['Sentiment']==0
NTrain = df[neg]
print(neg_count.shape)
NTrain.reset_index(drop = True, inplace = True)

# calculate the no. of positive sentiments
pos = df['Sentiment']==1
PTrain = df[pos]
print(pos_count.shape)
PTrain.reset_index(drop = True, inplace = True)

# calculate the prior for each class
priors = []
priors.append(PTrain.shape[0]/df.shape[0])
priors.append(NTrain.shape[0]/df.shape[0])

(3000, 3)
(1181, 3)


In [156]:
#Transform pos and neg tweets into seprate vectors
train_pos_vector1 = vectorizer.transform(PTrain['text'])
train_pos_vector1.shape

In [159]:
train_neg_vector1 = vectorizer.transform(NTrain['text'])
train_neg_vector1.shape

(3000, 7674)

In [212]:
sum_pos = train_pos_vector1.sum(axis = 0)
sum_neg = train_neg_vector1.sum(axis = 0)

In [214]:
bigdoc = pd.DataFrame(index = list(set(freqVocab.keys())), columns = ['pos', 'neg'])
#bigdoc.index = list(set(freqVocab.keys()))


In [220]:
#bigdoc

In [218]:
for word in freqVocab.keys():
    index = freqVocab.get(word)
    bigdoc.at[word, 'pos'] = sum_pos[:, index]
    bigdoc.at[word, 'neg'] = sum_neg[:, index]

In [84]:
# initiate count vectorizer



In [204]:
import math
def Naivebayes(data,category,vector,bigvec):
    priors=[]
    for cat in category:
        ndoc= len(data)
        nc= len(data['Sentiment']== cat)
        prior = nc/ndoc
        print(prior)
    
        priors.append(prior)
        vocab= vector
        is_cat =  data['Sentiment']== cat
        bigdoc= data[is_cat]
        bigdocvec= bigvec
        case_list= []
        for word2 in bigdocvec:
            for word in vocab:
                if(word==word2):
                    logprob= math.log(bigdocvec[word2]+1/(sum(vocab.values())+1))
            case= {word2:logprob}
            case_list.append(case)
            
        return [case_list,prior]        

In [205]:
result = Naivebayes(df, [0,1], freqVector,bigdocVector)

1.0
