# Sentiment Analysis
## The FAST AI way

In [4]:
from fastai.text import *

In [6]:
path_to_file = "twitter-airline-sentiment/"
file_name = "Tweets.csv"

In [21]:
# Read csv
raw_data = pd.read_csv(path_to_file + file_name)

In [22]:
raw_data.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [23]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 15 columns):
tweet_id                        14640 non-null int64
airline_sentiment               14640 non-null object
airline_sentiment_confidence    14640 non-null float64
negativereason                  9178 non-null object
negativereason_confidence       10522 non-null float64
airline                         14640 non-null object
airline_sentiment_gold          40 non-null object
name                            14640 non-null object
negativereason_gold             32 non-null object
retweet_count                   14640 non-null int64
text                            14640 non-null object
tweet_coord                     1019 non-null object
tweet_created                   14640 non-null object
tweet_location                  9907 non-null object
user_timezone                   9820 non-null object
dtypes: float64(2), int64(2), object(11)
memory usage: 1.7+ MB


In [24]:
# the goal if the challenge is to find classify a tweet sentiment based on text
# so we can discard the rest of the data
filtered_data  = raw_data[['airline_sentiment', 'text']]
filtered_data.head()

Unnamed: 0,airline_sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...


In [25]:
# convert sentiment text into numeric
def s_t_n(sentiment):
    if sentiment == "negative":
        return -1
    elif sentiment == "positive":
        return 1
    else:
        return 0

filtered_data['labels'] = filtered_data['airline_sentiment'].apply(lambda x: s_t_n(x))
filtered_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,airline_sentiment,text,labels
0,neutral,@VirginAmerica What @dhepburn said.,0
1,positive,@VirginAmerica plus you've added commercials t...,1
2,neutral,@VirginAmerica I didn't today... Must mean I n...,0
3,negative,@VirginAmerica it's really aggressive to blast...,-1
4,negative,@VirginAmerica and it's a really big bad thing...,-1


In [26]:
del filtered_data['airline_sentiment']

In [60]:
# save the filted data
filtered_data.to_csv("/".join([path_to_file, "filtered.csv"]) ,header=False, index=False)

In [56]:
print('/'.join([path_to_file, "n"]))

twitter-airline-sentiment/Tweets.csv/n


## Language Model Tokens

In [7]:
BOS = 'xbos' # beginning of sentence tag
FLD = 'xfld' # data field tag

In [40]:
filtered_tweets = pd.read_csv(path_to_file + "filtered.csv", header=None)

In [145]:
# plagerising from Jeremy my fixup replaces imoticons with something else
re1 = re.compile(r'  +')
def fixup(x):
    x = x.replace("@", " ").replace("http", "urlink")
    return re1.sub(' ', html.unescape(x))

In [146]:
def get_tokens(df):
    labels = df.iloc[:,1].values.astype(np.int64)
    texts = f'\n{BOS} {FLD} 1 ' + "\n\n".join(df[0].astype(str))
    texts = fixup(texts)
    
    tok = Tokenizer().process_all(partition_by_cores(texts, 4))
    return tok, list(labels)

In [159]:
tok, labels = get_tokens(filtered_tweets)

In [160]:
' '.join(tok[0])



In [161]:
np.save(path_to_file + 'lang_tokens.npy', tok)

In [162]:
tok = np.load(path_to_file + "lang_tokens.npy")

In [163]:
freq = Counter(p for o in tok for p in o)
freq.most_common(25)

[('xxmaj', 23280),
 ('\n \n ', 14674),
 ('.', 13809),
 ('to', 8648),
 ('xxup', 7085),
 ('i', 6759),
 ('the', 6059),
 ('!', 4847),
 ('?', 4512),
 ('a', 4507),
 ('/', 4427),
 ('you', 4404),
 ('united', 4161),
 (',', 4146),
 ('for', 3996),
 ('flight', 3932),
 ('on', 3811),
 ('and', 3730),
 ('#', 3651),
 ('my', 3287),
 ('usairways', 3049),
 ('americanair', 2964),
 ('is', 2933),
 ('in', 2579),
 ('southwestair', 2457)]

In [164]:
max_vocab = 60000
min_freq = 2

In [166]:
itos = [o for o, c in freq.most_common(max_vocab) if c>min_freq]
itos.insert(0, '_unk_')
itos.insert(1, '_pad_')

In [167]:
stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})
len(itos)

4693

In [169]:
tok_lm = np.array([[stoi[o] for o in p] for p in tok])

In [170]:
' '.join(str(o) for o in tok_lm)

'[0, 0, 0, 164, 98, 2, 66, 0, 247, 4, 3, 98, 575, 13, 112, 1175, 2528, 5, 8, 222, 62, 0, 4, 3, 98, 7, 96, 30, 117, 62, 2, 802, 591, 7, 94, 5, 176, 169, 214, 9, 3, 98, 27, 41, 157, 3757, 5, 0, 0, 60, 1023, 60, 25, 35, 3173, 312, 3758, 78, 69, 33, 516, 2781, 3, 98, 19, 27, 41, 11, 157, 490, 236, 498, 102, 27, 3, 98, 457, 86, 304, 119, 315, 11, 17, 16, 218, 34, 96, 30, 33, 44, 2114, 4, 305, 27, 41, 157, 8, 138, 236, 498, 102, 147, 6, 1977, 3, 98, 190, 15, 1504, 334, 67, 7, 127, 6, 2299, 44, 228, 0, 0, 231, 241, 649, 151, 480, 295, 3, 98, 2, 157, 306, 11, 3174, 1767, 16, 2, 3759, 2, 361, 2, 3760, 0, 15, 83, 4, 592, 32, 12, 12, 48, 12, 0, 3, 98, 2, 257, 15, 7, 0, 714, 47, 6, 56, 7, 6, 43, 9, 6, 0, 3, 98, 27, 36, 380, 15, 19, 593, 57, 111, 376, 4, 2, 13, 154, 198, 158, 5, 31, 4, 3, 98, 96, 13, 134, 34, 2529, 24, 8, 635, 2115, 803, 29, 1505, 1768, 2530, 289, 39, 454, 3, 98, 7, 1435, 715, 0, 4, 61, 203, 239, 185, 0, 0, 4, 6, 2116, 3, 98, 2, 44, 24, 594, 11, 139, 674, 9, 2, 243, 1506, 102, 21, 

In [172]:
np.save(path_to_file + 'tok_ids.npy', tok)
pickle.dump(itos, open(path_to_file + 'itos.pkl', 'wb'))

In [173]:
tok_lm = np.load(path_to_file + 'tok_ids.npy')
itos = pickle.load(open(path_to_file + 'itos.pkl', 'rb'))

In [174]:
vs=len(itos)
vs,len(tok_lm)

(4693, 4)

In [179]:
tok_lm[0]

[' \n ',
 'xbos',
 'xfld',
 '1',
 'virginamerica',
 'xxmaj',
 'what',
 'dhepburn',
 'said',
 '.',
 '\n \n ',
 'virginamerica',
 'plus',
 'you',
 "'ve",
 'added',
 'commercials',
 'to',
 'the',
 'experience',
 '...',
 'tacky',
 '.',
 '\n \n ',
 'virginamerica',
 'i',
 'did',
 "n't",
 'today',
 '...',
 'xxmaj',
 'must',
 'mean',
 'i',
 'need',
 'to',
 'take',
 'another',
 'trip',
 '!',
 '\n \n ',
 'virginamerica',
 'it',
 "'s",
 'really',
 'aggressive',
 'to',
 'blast',
 'obnoxious',
 '"',
 'entertainment',
 '"',
 'in',
 'your',
 'guests',
 "'",
 'faces',
 '&',
 'they',
 'have',
 'little',
 'recourse',
 '\n \n ',
 'virginamerica',
 'and',
 'it',
 "'s",
 'a',
 'really',
 'big',
 'bad',
 'thing',
 'about',
 'it',
 '\n \n ',
 'virginamerica',
 'seriously',
 'would',
 'pay',
 '$',
 '30',
 'a',
 'flight',
 'for',
 'seats',
 'that',
 'did',
 "n't",
 'have',
 'this',
 'playing',
 '.',
 '\n ',
 'it',
 "'s",
 'really',
 'the',
 'only',
 'bad',
 'thing',
 'about',
 'flying',
 'xxup',
 'va',
 '\n \