## Bring in the txt files into Pandas dataframe

In [1]:
# import dependencies
import pandas as pd
import os
from collections import defaultdict
from pathlib import Path
import nltk as nl
from nltk.tokenize import word_tokenize
import re
from nltk.tokenize.casual import TweetTokenizer

In [2]:
nl.download('punkt')

[nltk_data] Downloading package punkt to C:\Users\Hari
[nltk_data]     Ravella\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
os.chdir("D:\\Spring 2020\\assignments\\sentiment_classification\\tweet")

In [4]:
#set directory path
my_dir_path = "train\\positive"

In [5]:
# create list to store text
results = defaultdict(list)

In [6]:
# loop through files and append text to list
for file in Path(my_dir_path).iterdir():
    with open(file, "r", encoding="utf8") as file_open:
        results["text"].append(file_open.read())

In [7]:
# read the list in as a dataframe
df_pos = pd.DataFrame(results)

In [8]:
# take a look at dataframe
df_pos.head()

Unnamed: 0,text
0,@SouthwestAir I would appreciate that. Thank ...
1,@USAirways thank you very much.\n
2,@JetBlue I'm all set. About to fly. Not bad fo...
3,@SouthwestAir I got a flight at 11:55am on Thu...
4,@AmericanAir you're my early frontrunner for b...


In [9]:
#set directory path
my_dir_path_neg = "train\\negative"

# create list to store text
results_neg = defaultdict(list)

## loop through files and append text to list
for file in Path(my_dir_path_neg).iterdir():
    with open(file, "r", encoding="utf8") as file_open:
        results_neg["text"].append(file_open.read())
        
# read the list in as a dataframe
df_neg = pd.DataFrame(results_neg)
df_neg.head()

Unnamed: 0,text
0,@united Really....you charge me $25 to check a...
1,.@JetBlue thanks for making an effort. Credit ...
2,@united plz don't advertise wifi if it's not g...
3,@SouthwestAir - 800 is not int'l friendly\n
4,@USAirways thanks for a subpar travel experien...


In [10]:
#add sentiment to both datasets and then combine them for test data 1 for positive and 0 for negative
df_pos['Sentiment']=1
df_neg['Sentiment']=0
frames = [df_pos, df_neg]
df = pd.concat(frames)

In [11]:
df.shape

(4181, 2)

## Create Vocabulary - No Stemming

In [12]:
# increase column width to see more of the tweets
pd.set_option('max_colwidth', 140)

# reshuffle the tweets to see both pos and neg in random order
df = df.sample(frac=1).reset_index(drop=True)

# explore top 20 rows
#df.head(20)

In [13]:
# Remove any markup tags (HTML), all the mentions of handles(starts with '@') and '#' character
def cleantweettext(raw_html):
    pattern = re.compile('<.*?>')
    cleantext = re.sub(pattern, '', raw_html)
    cleantext = " ".join(filter(lambda x:x[0]!='@', cleantext.split()))
    cleantext = cleantext.replace('#', '')
    return cleantext

In [14]:
cleantweet= []
for doc in df.text:
    cleantweet.append(cleantweettext(doc))
df.text= cleantweet


In [15]:
#Tokenize using emoji tokenizer
tokentweet=[]
for doc in df.text:
    tokentweet.append(TweetTokenizer().tokenize(doc))
    
df.text= tokentweet

In [16]:
df.text[:5]

0                      [On, hold, for, 45, minutes, trying, to, rebook, a, Cancelled, Flightled, flight, with, a, companion, ticket, ., Help, ?]
1                                                                [Got, help, from, a, nice, lady, on, the, phone, in, Georgia, ., Thank, you, !]
2                                                                                 [coming, up, on, 2hrs, on, still, haven't, spoken, to, a, rep]
3    [attendants, at, the, gate, ., Enjoyed, watching, the, other, flight, to, our, destination, take, off, before, having, a, horrible, atte...
4                       [-, had, to, call, to, find, out, I, was, re-booked, tomorrow, why, ask, for, my, number, an, email, for, day, of, info]
Name: text, dtype: object

In [17]:
def tolower(text):
    lowerlist=[]
    for word in text:
        pattern = re.compile('[A-Z][a-z]+')
        if re.match(pattern,word):
            cleantext1 = re.sub(pattern, word.lower(), word)
            lowerlist.append(cleantext1)
        else:
            lowerlist.append(word)
    return lowerlist

In [18]:
lowertweet=[]
for doc in df.text:
    lowertweet.append(tolower(doc))
df.text = lowertweet

In [19]:
df.text[:5]

0                      [on, hold, for, 45, minutes, trying, to, rebook, a, cancelled, flightled, flight, with, a, companion, ticket, ., help, ?]
1                                                                [got, help, from, a, nice, lady, on, the, phone, in, georgia, ., thank, you, !]
2                                                                                 [coming, up, on, 2hrs, on, still, haven't, spoken, to, a, rep]
3    [attendants, at, the, gate, ., enjoyed, watching, the, other, flight, to, our, destination, take, off, before, having, a, horrible, atte...
4                       [-, had, to, call, to, find, out, I, was, re-booked, tomorrow, why, ask, for, my, number, an, email, for, day, of, info]
Name: text, dtype: object

In [20]:
#Combining emoji tokenizer and the word tokenizer to handle tokenization at aphostophes
tweets=[]
for x in df.text:
    tweet = ''
    for word in x:
        tweet += word+' '
    tweets.append(word_tokenize(tweet))
df.text= tweets

In [21]:
df.sample(frac=1).reset_index(drop=True)

Unnamed: 0,text,Sentiment
0,"[just, sent, you, a, message, on, facebook, how, do, I, follow, up, a, complaint, re, ., missing, clothing, out, of, checked, baggage, ?]",0
1,"[you, service, agents, at, MCO, are, great, but, their, are, not, enough, of, them, working, right, now, !]",0
2,"[no, worries, your, customer, service, gets, a, bad, wrap, but, just, spoke, w, agent, who, saved, me, huge, amounts, of, time, &, apolo...",1
3,"[FYI, your, customer, service, rep, carol, is, an, absolute, delight, ., so, pleasant, to, with, with, and, rebooked, me, in, lightning,...",1
4,"[I, need, a, flight, out, tonight, ., isn't't, there, anything, else, ?]",0
5,"[I, was, happy, to, purchase, the, upgrade, ., if, only, it, was, avail, on, my, next, flight, .]",0
6,"[I, just, received, notification, of, in-flight, wi-fi-wi-fi, for, UA863, from, to, amazing, !]",1
7,"[where, are, your, tickets, offices, in, boston, ?, impossible, to, book, by, phone, or, use, vouchers, on, website, what, a, headache, ...",0
8,"[``, that's, 's, unusual, ``, means, we, screwed, up, but, will, never, admit, to, it, !, goingforgreatnessfailgoingforgreatnessfailgoin...",0
9,"[I, look, forward, to, those, direct, flights, to, california, to, see, my, family, more, often, ., thank, you, byebyeusairline]",1
