## Bring in the txt files into Pandas dataframe

In [298]:
# import dependencies
import pandas as pd
from collections import defaultdict
from pathlib import Path
import nltk as nl
from nltk.tokenize import word_tokenize
import re
from nltk.tokenize.casual import TweetTokenizer

In [299]:
nl.download('punkt')

[nltk_data] Downloading package punkt to /Users/gkbytes/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [300]:
#set directory path
my_dir_path = "tweet/train/positive"

In [301]:
# create list to store text
results = defaultdict(list)

In [302]:
# loop through files and append text to list
for file in Path(my_dir_path).iterdir():
    with open(file, "r", encoding="utf8") as file_open:
        results["text"].append(file_open.read())

In [303]:
# read the list in as a dataframe
df_pos = pd.DataFrame(results)

In [304]:
# take a look at dataframe
df_pos.head()

Unnamed: 0,text
0,@united thanks for moving my dad on to my my mom's flight. You helped make his birthday start with #FriendlyFriday Awesomeness! 4 paws u...
1,@united Terrific. Many thanks. Looking forward to being back on UA tomorrow. Had a great flight up to Vancouver.\n
2,@AmericanAir thanks! Flight 2160 today. Great crew!\n
3,@JetBlue Thank you. Fingers crossed.\n
4,“@JetBlue: Our fleet's on fleek. http://t.co/3kVkd8yRxa” + lol wow\n


In [305]:
#set directory path
my_dir_path_neg = "tweet/train/negative"

# create list to store text
results_neg = defaultdict(list)

## loop through files and append text to list
for file in Path(my_dir_path_neg).iterdir():
    with open(file, "r", encoding="utf8") as file_open:
        results_neg["text"].append(file_open.read())
        
# read the list in as a dataframe
df_neg = pd.DataFrame(results_neg)
df_neg.head()

Unnamed: 0,text
0,@VirginAmerica Is it me or is your website down? BTW your new website isn't a great user experience. Time for another redesign.\n
1,@SouthwestAir wifi on my plane but I gotta pay for it? Help your broke homegirl out✈️📱\n
2,@SouthwestAir my flight is Cancelled Flightled due to weather. What next ?\n
3,@USAirways how about an update? 2hrs delayed this obviously isnt up 2 date havent boarded http://t.co/L7lWjaZiOA\n
4,@AmericanAir\n Your response could have made all the difference. It could have made the situation better. NO TRUST...GET LOST like my ba...


In [306]:
#add sentiment to both datasets and then combine them for test data 1 for positive and 0 for negative
df_pos['Sentiment']=1
df_neg['Sentiment']=0
frames = [df_pos, df_neg]
df = pd.concat(frames)

In [310]:
df.shape

(4181, 2)

## Create Vocabulary - No Stemming

In [309]:
# increase column width to see more of the tweets
pd.set_option('max_colwidth', 140)

# reshuffle the tweets to see both pos and neg in random order
df = df.sample(frac=1).reset_index(drop=True)

# explore top 20 rows
df.head(20)

Unnamed: 0,text,Sentiment
0,@USAirways call dropped &amp; no call back...another 45min-hour for another rep. The worst CS ever online by phone &amp; in person.\n,0
1,@VirginAmerica got it squared away. Someone picked up as soon as I tweeted. Should have tweeted sooner. 😉\n,1
2,@JetBlue but thank you! Love an anxious flyer.\n,1
3,:( RT @JetBlue: Our fleet's on fleek. http://t.co/ncguZDgDaQ\n,0
4,@USAirways @jhughes1025 grrrrrrrr. Couldn't book a flight via calling either :(\n,0
5,@JetBlue when your flights delayed :)))&gt;&gt;&gt;\n,0
6,@AmericanAir has the worst flight change policy. No mercy no sympathy such a bummer when you can't go to funerals or see friends bc of i...,0
7,@SouthwestAir because according to the flight attendant my husband doesn't talk english when the fact is that he understand and talks it\n,0
8,@united had been suffering immensely ever since this merge .. Hopefully I'll never have to fly @united ever again ..\n,0
9,@USAirways why aren't you updating flight status/delays?\n,0


In [311]:
# Remove any markup tags (HTML)
def cleanhtml(raw_html):
    pattern = re.compile('<.*?>')
    cleantext = re.sub(pattern, '', raw_html)
    return cleantext

In [312]:
def removeat(text):
    atlist=[]
    for word in text:
        pattern = re.compile('^@')
        if re.match(pattern,word):
            #cleantext1 = re.sub(pattern, word[1:], word)
            atlist.append(word[1:])
        else:
            atlist.append(word)
    return atlist

In [313]:
def tolower(text):
    lowerlist=[]
    for word in text:
        pattern = re.compile('[A-Z][a-z]+')
        if re.match(pattern,word):
            cleantext1 = re.sub(pattern, word.lower(), word)
            lowerlist.append(cleantext1)
        else:
            lowerlist.append(word)
    return lowerlist

In [314]:
cleantweet= []
for doc in df.text:
    cleantweet.append(cleanhtml(doc))


tokentweet=[]
df.text= cleantweet
for doc in df.text:
    tokentweet.append(TweetTokenizer().tokenize(doc))
    
df.text= tokentweet

In [315]:
removeattweet=[]
for doc in df.text:
    removeattweet.append(removeat(doc))
df.text =removeattweet

In [316]:
removeattweet[1]

['VirginAmerica',
 'got',
 'it',
 'squared',
 'away',
 '.',
 'Someone',
 'picked',
 'up',
 'as',
 'soon',
 'as',
 'I',
 'tweeted',
 '.',
 'Should',
 'have',
 'tweeted',
 'sooner',
 '.',
 '😉']

In [317]:
lowertweet=[]
for doc in df.text:
    lowertweet.append(tolower(doc))
df.text = lowertweet

In [318]:
lowertweet[1]

['virginamericavirginamerica',
 'got',
 'it',
 'squared',
 'away',
 '.',
 'someone',
 'picked',
 'up',
 'as',
 'soon',
 'as',
 'I',
 'tweeted',
 '.',
 'should',
 'have',
 'tweeted',
 'sooner',
 '.',
 '😉']

In [319]:
tweets=[]
for x in df.text:
    tweet = ''
    for word in x:
        tweet += word+' '
    tweets.append(word_tokenize(tweet))
df.text= tweets

In [321]:
df.sample(frac=1).reset_index(drop=True)

Unnamed: 0,text,Sentiment
0,"[united, agent, split, up, my, reservation, ?, now, ca, n't, cancelled, flight, and, refund, credit, for, 2wks, ?, why]",0
1,"[jetbluejetblue, flight, attendant, wendi, on, flt, 127, on, 2/17, newark, to, orlando, ., 👍, 👍]",1
2,"[americanairamericanair, oh, no, she, left, the, counter, and, went, downstairs, ., then, she, told, elise, an, agent, that, joined, her...",0
3,"[americanairamericanair, cancelled, flights, my, flight, does, n't, send, an, email, text, or, call, ., then, puts, me, on, way, earlier...",0
4,"[USAirways, #, DividendRewards, urgently, need, to, speak, with, a, CS, rep, ., my, 50K, miles, havent, shown, &, I, 've, fulfilled, all...",0
...,...,...
4176,"[southwestairsouthwestair, can, you, please, DM, me, who, I, can, speak, with, regarding, my, receipts, and, who, I, can, email, a, pres...",0
4177,"[americanairamericanair, great, thanks, !]",1
4178,"[USAirways, call, dropped, &, no, call, back, ..., another, 45min, -, hour, for, another, rep, ., the, worst, CS, ever, online, by, phon...",0
4179,"[united, thank, you, for, getting, our, daughter, home, when, americanair, cancelled, flightled, all, their, flights, to, nashville]",1
