## Bring in the txt files into Pandas dataframe

In [1]:
# import dependencies
import pandas as pd
from collections import defaultdict
from pathlib import Path
import nltk as nl
from nltk.tokenize import word_tokenize
import re
from nltk.tokenize.casual import TweetTokenizer

In [2]:
nl.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\600846\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [3]:
#set directory path
my_dir_path = "tweet/train/positive"

In [4]:
# create list to store text
results = defaultdict(list)

In [5]:
# loop through files and append text to list
for file in Path(my_dir_path).iterdir():
    with open(file, "r", encoding="utf8") as file_open:
        results["text"].append(file_open.read())

In [6]:
# read the list in as a dataframe
df_pos = pd.DataFrame(results)

In [7]:
# take a look at dataframe
df_pos.head()

Unnamed: 0,text
0,@SouthwestAir I would appreciate that. Thank ...
1,@USAirways thank you very much.\n
2,@JetBlue I'm all set. About to fly. Not bad fo...
3,@SouthwestAir I got a flight at 11:55am on Thu...
4,@AmericanAir you're my early frontrunner for b...


In [8]:
#set directory path
my_dir_path_neg = "tweet/train/negative"

# create list to store text
results_neg = defaultdict(list)

# loop through files and append text to list
for file in Path(my_dir_path_neg).iterdir():
    with open(file, "r", encoding="utf8") as file_open:
        results_neg["text"].append(file_open.read())
        
# read the list in as a dataframe
df_neg = pd.DataFrame(results_neg)
df_neg.head()

Unnamed: 0,text
0,@united Really....you charge me $25 to check a...
1,.@JetBlue thanks for making an effort. Credit ...
2,@united plz don't advertise wifi if it's not g...
3,@SouthwestAir - 800 is not int'l friendly\n
4,@USAirways thanks for a subpar travel experien...


In [9]:
#add sentiment to both datasets and then combine them for test data 1 for positive and 0 for negative
df_pos['Sentiment']=1
df_neg['Sentiment']=0
frames = [df_pos, df_neg]
df = pd.concat(frames)

In [10]:
df.shape

(4181, 2)

## Create Vocabulary - No Stemming

In [12]:
# increase column width to see more of the tweets
pd.set_option('max_colwidth', 140)

# reshuffle the tweets to see both pos and neg in random order
df = df.sample(frac=1).reset_index(drop=True)

# explore top 5 rows
df.head(5)

Unnamed: 0,text,Sentiment
0,@united makes total sense except flight wasn't full :) I've got empty seats around me &amp; overheads were more than half open when I bo...,0
1,@JetBlue Crisis averted! Flight #69 from BOS to FLL is boarding. Let's hope the new pilots aren't Clarence Oveur and Roger Murdock. :-)\n,1
2,@united thank you\n,1
3,@SouthwestAir @Kaneshow @InternJohnRadio @mrerickv THIS IS EVERYTHING... now return that jet so we can go to Miami!\n,0
4,@SouthwestAir you're really going to let @delta and @virginamerica get the best of you? http://t.co/vUdWJm1lYB\n,0


In [13]:
# Remove any markup tags (HTML), all the mentions of handles(starts with '@') and '#' character
def cleantweettext(raw_html):
    pattern = re.compile('<.*?>')
    cleantext = re.sub(pattern, '', raw_html)
    cleantext = " ".join(filter(lambda x:x[0]!='@', cleantext.split()))
    cleantext = cleantext.replace('#', '')
    return cleantext

In [14]:
def removeat(text):
    atlist=[]
    for word in text:
        pattern = re.compile('^@')
        if re.match(pattern,word):
            #cleantext1 = re.sub(pattern, word[1:], word)
            atlist.append(word[1:])
        else:
            atlist.append(word)
    return atlist

In [15]:
def tolower(text):
    lowerlist=[]
    for word in text:
        pattern = re.compile('[A-Z][a-z]+')
        if re.match(pattern,word):
            cleantext1 = re.sub(pattern, word.lower(), word)
            lowerlist.append(cleantext1)
        else:
            lowerlist.append(word)
    return lowerlist

In [16]:
cleantweet= []
for doc in df.text:
    cleantweet.append(cleanhtml(doc))


tokentweet=[]
df.text= cleantweet
for doc in df.text:
    tokentweet.append(TweetTokenizer().tokenize(doc))
    
df.text= tokentweet

In [17]:
removeattweet=[]
for doc in df.text:
    removeattweet.append(removeat(doc))
df.text =removeattweet

In [21]:
removeattweet[2]

['united', 'thank', 'you']

In [19]:
lowertweet=[]
for doc in df.text:
    lowertweet.append(tolower(doc))
df.text = lowertweet

In [22]:
lowertweet[2]

['united', 'thank', 'you']

In [23]:
tweets=[]
for x in df.text:
    tweet = ''
    for word in x:
        tweet += word+' '
    tweets.append(word_tokenize(tweet))
df.text= tweets

In [24]:
df.sample(frac=1).reset_index(drop=True)

Unnamed: 0,text,Sentiment
0,"[USAirways, 2, days, in, a, row, I, call, and, still, ca, n't, get, anyone, on, the, phone, ., do, you, actually, have, any, employees, ?]",0
1,"[USAirways, call, dropped, &, no, call, back, ..., another, 45min, -, hour, for, another, rep, ., the, worst, CS, ever, online, by, phon...",0
2,"[united, yes, supposed, to, be, here, by, 6PM, ., I, have, a, board, meeting, tomorrow, ., fingers, crossed, .]",0
3,"[jetbluejetblue, thanks, to, the, gent, on, the, phone, who, fixed, my, BOS-MCO, flight, and, the, fee, waiver, !, A320, now, :, ), #, f...",1
4,"[americanairamericanair, AA, 100, -, good, job, overselling, this, flight, ., delayed, 90, minutes, to, deplane, the, overflow, passenge...",0
5,"[southwestairsouthwestair, hey, ..., why, do, n't, you, add, the, intl, number, to, your, error, when, checking, in, ., going, on, 6, hr...",0
6,"[southwestairsouthwestair, been, waiting, for, 70, minutes, on, hold, because, yall, cancelled, flightled, my, return, flight, ., answer...",0
7,"[southwestairsouthwestair, how, about, #, destinationdragon, tix, for, tonite, for, the, inconvenience, since, I, am, not, there, to, pl...",0
8,"[americanairamericanair, thanks, for, your, canned, response, that, makes, it, look, like, you, care, about, your, customers, ., I, 'm, ...",0
9,"[., USAirways, trying, to, get, a, partner, PNR, and, have, spent, more, than, 1, hour, on, hold, ., I, know, its, snowing, somewhere, b...",0


(3000, 3)


In [88]:
#stemming
stemtweets=[]
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords=False)
#ps= PorterStemmer()
for x in df.text:
    stemtweet=''
    for word in x:
        stemtweet=stemtweet+stemmer.stem(word)+' '
    stemtweets.append(word_tokenize(stemtweet))
df['stemmed']=stemtweets

## Extract Features

In [89]:
# import dependencies
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

##### Frequency Count


In [90]:
#extract text from df
text = df['text']


# initiate count vectorizer
def dummy_fun(doc):
    return doc

vectorizer = CountVectorizer(
    analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None)  
vectorizer.fit(text)
freqVector = vectorizer.vocabulary_
len(freqVector)

7984

##### Binary Representation

In [91]:
binaryVector = pd.Series(text).apply(pd.value_counts).fillna(0).astype(int)
binaryVector.shape

(4181, 7984)

## Training

In [104]:
# calculate the prior for each class

# calculate the no. of negative sentiments
neg =  df['Sentiment']==0
neg_count = df[neg]
print(neg_count.shape)

# calculate the no. of positive sentiments
pos = df['Sentiment']==1
pos_count = df[pos]
print(pos_count.shape)


(3000, 3)
(1181, 3)
