In [144]:
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import stopwords
import string
import numpy as np
import pandas as pd
import nltk
from sklearn.ensemble import RandomForestClassifier
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from nltk.corpus import wordnet
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.svm import SVC

In [145]:
training_data = pd.read_csv('0000000000002747_training_twitter_x_y_train.csv')
testing_data = pd.read_csv('0000000000002747_test_twitter_x_test.csv')

In [146]:
training_data.head(5)

Unnamed: 0,tweet_id,airline_sentiment,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,567900433542488064,negative,Southwest,,ColeyGirouard,,0,"@SouthwestAir I am scheduled for the morning, ...",,2015-02-17 20:16:29 -0800,Washington D.C.,Atlantic Time (Canada)
1,569989168903819264,positive,Southwest,,WalterFaddoul,,0,@SouthwestAir seeing your workers time in and ...,,2015-02-23 14:36:22 -0800,"Indianapolis, Indiana; USA",Central Time (US & Canada)
2,568089179520954368,positive,United,,LocalKyle,,0,@united Flew ORD to Miami and back and had gr...,,2015-02-18 08:46:29 -0800,Illinois,Central Time (US & Canada)
3,568928195581513728,negative,Southwest,,amccarthy19,,0,@SouthwestAir @dultch97 that's horse radish 😤🐴,,2015-02-20 16:20:26 -0800,,Atlantic Time (Canada)
4,568594180014014464,negative,United,,J_Okayy,,0,@united so our flight into ORD was delayed bec...,,2015-02-19 18:13:11 -0800,,Eastern Time (US & Canada)


In [147]:
training_data.describe(include="all")

Unnamed: 0,tweet_id,airline_sentiment,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
count,10980.0,10980,10980,31,10980,24,10980.0,10980,776,10980,7430,7403
unique,,3,6,3,6438,11,,10851,632,10758,2658,78
top,,negative,United,negative,JetBlueNews,Customer Service Issue,,@united thanks,"[0.0, 0.0]",2015-02-23 06:57:24 -0800,"New York, NY",Eastern Time (US & Canada)
freq,,6851,2928,24,43,9,,6,131,3,125,2819
mean,5.692169e+17,,,,,,0.080965,,,,,
std,779543800000000.0,,,,,,0.740303,,,,,
min,5.675883e+17,,,,,,0.0,,,,,
25%,5.685584e+17,,,,,,0.0,,,,,
50%,5.694753e+17,,,,,,0.0,,,,,
75%,5.698902e+17,,,,,,0.0,,,,,


In [148]:
train_tweets = training_data["text"].values
sentiments = training_data["airline_sentiment"].values

In [149]:
print(len(train_tweets))
print(len(sentiments))

10980
10980


In [150]:
train_documents = []
for i in range(len(train_tweets)):
    train_documents.append((word_tokenize(train_tweets[i]),sentiments[i]))

In [151]:
print(train_documents[0])
print(len(train_documents))

(['@', 'SouthwestAir', 'I', 'am', 'scheduled', 'for', 'the', 'morning', ',', '2', 'days', 'after', 'the', 'fact', ',', 'yes..not', 'sure', 'why', 'my', 'evening', 'flight', 'was', 'the', 'only', 'one', 'Cancelled', 'Flightled'], 'negative')
10980


In [152]:
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [153]:
stops = set(stopwords.words('english'))
punctuations = list(string.punctuation)
stops.update(punctuations)

In [154]:
lemmatizer = WordNetLemmatizer()

In [155]:
def clean_tweet(words):
    output_words = []
    for w in words:
        if w.lower() not in stops and w.lower().isalpha():
            pos = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w, pos = get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words

In [156]:
train_documents = [(clean_tweet(tweet), sentiment) for tweet, sentiment in train_documents]

In [157]:
print(train_documents[0])
print(len(train_documents))

(['southwestair', 'schedule', 'morning', 'day', 'fact', 'sure', 'even', 'flight', 'one', 'cancelled', 'flightled'], 'negative')
10980


In [158]:
sentiments = [sentiment for tweet, sentiment in train_documents]

In [159]:
train_tweet_documents = [" ".join(tweet) for tweet, sentiment in train_documents]

In [160]:
print(train_tweet_documents[0])
print(len(train_documents))

southwestair schedule morning day fact sure even flight one cancelled flightled
10980


## Testing Data Cleaning

In [161]:
testing_data.head(5)

Unnamed: 0,tweet_id,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,569682010270101504,American,,zsalim03,,0,@AmericanAir In car gng to DFW. Pulled over 1h...,,2015-02-22 18:15:50 -0800,Texas,Central Time (US & Canada)
1,569608307184242688,American,,sa_craig,,0,"@AmericanAir after all, the plane didn’t land ...",,2015-02-22 13:22:57 -0800,"College Station, TX",Central Time (US & Canada)
2,567879304593408001,Southwest,,DanaChristos,,1,@SouthwestAir can't believe how many paying cu...,,2015-02-17 18:52:31 -0800,CT,Eastern Time (US & Canada)
3,569757651539660801,US Airways,,rossj987,,0,@USAirways I can legitimately say that I would...,,2015-02-22 23:16:24 -0800,"Washington, D.C.",Eastern Time (US & Canada)
4,569900705852608513,American,,tranpham18,,0,@AmericanAir still no response from AA. great ...,,2015-02-23 08:44:51 -0800,New York City,Eastern Time (US & Canada)


In [162]:
testing_data.describe(include="all")

Unnamed: 0,tweet_id,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
count,3660.0,3660,9,3660,8,3660.0,3660,243,3660,2477,2417
unique,,6,2,2805,6,,3650,209,3635,1258,59
top,,United,negative,JetBlueNews,Customer Service Issue,,@AmericanAir I purchased Main Cabin XT for f-1...,"[0.0, 0.0]",2015-02-23 15:25:46 -0800,USA,Eastern Time (US & Canada)
freq,,894,8,20,3,,2,33,3,43,925
mean,5.692226e+17,,,,,0.087705,,,,,
std,777903000000000.0,,,,,0.762048,,,,,
min,5.675924e+17,,,,,0.0,,,,,
25%,5.685633e+17,,,,,0.0,,,,,
50%,5.694842e+17,,,,,0.0,,,,,
75%,5.698927e+17,,,,,0.0,,,,,


In [163]:
test_tweets = testing_data["text"].values

In [164]:
print(len(test_tweets))

3660


In [165]:
test_documents = []
for i in range(len(test_tweets)):
    test_documents.append(word_tokenize(test_tweets[i]))

In [167]:
print(test_documents[0])
print(len(test_documents))

['@', 'AmericanAir', 'In', 'car', 'gng', 'to', 'DFW', '.', 'Pulled', 'over', '1hr', 'ago', '-', 'very', 'icy', 'roads', '.', 'On-hold', 'with', 'AA', 'since', '1hr', '.', 'Ca', "n't", 'reach', 'arpt', 'for', 'AA2450', '.', 'Wat', '2', 'do', '?']
3660


In [168]:
test_documents = [clean_tweet(tweet) for tweet in test_documents]

In [169]:
print(test_documents[0])
print(len(test_documents))

['americanair', 'car', 'gng', 'dfw', 'pulled', 'ago', 'icy', 'road', 'aa', 'since', 'ca', 'reach', 'arpt', 'wat']
3660


In [170]:
test_tweet_documents = [" ".join(tweet) for tweet in test_documents]

In [171]:
print(test_tweet_documents[0])
print(len(test_documents))

americanair car gng dfw pulled ago icy road aa since ca reach arpt wat
3660


In [302]:
count_vect = CountVectorizer(max_features=1100)

In [303]:
x_train = count_vect.fit_transform(train_tweet_documents)
y_train = sentiments

x_test = count_vect.transform(test_tweet_documents)

In [304]:
print(len(train_tweet_documents))
print(len(test_tweet_documents))

10980
3660


In [305]:
clf = RandomForestClassifier(n_estimators=2000, n_jobs=-1)
clf.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=2000,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [306]:
y_test = clf.predict(x_test)

In [307]:
y_test

array(['negative', 'negative', 'negative', ..., 'neutral', 'positive',
       'negative'], dtype='<U8')

In [308]:
clf.score(x_train, y_train)

0.9908925318761385

In [309]:
np.savetxt("prediction.csv", y_test, fmt='%s')