In [74]:
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import stopwords
import string
import numpy as np
import pandas as pd
import nltk
from sklearn.ensemble import RandomForestClassifier
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from nltk.corpus import wordnet
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.svm import SVC

In [2]:
training_data = pd.read_csv('0000000000002747_training_twitter_x_y_train.csv')
testing_data = pd.read_csv('0000000000002747_test_twitter_x_test.csv')

In [4]:
training_data.head(5)

Unnamed: 0,tweet_id,airline_sentiment,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,567900433542488064,negative,Southwest,,ColeyGirouard,,0,"@SouthwestAir I am scheduled for the morning, ...",,2015-02-17 20:16:29 -0800,Washington D.C.,Atlantic Time (Canada)
1,569989168903819264,positive,Southwest,,WalterFaddoul,,0,@SouthwestAir seeing your workers time in and ...,,2015-02-23 14:36:22 -0800,"Indianapolis, Indiana; USA",Central Time (US & Canada)
2,568089179520954368,positive,United,,LocalKyle,,0,@united Flew ORD to Miami and back and had gr...,,2015-02-18 08:46:29 -0800,Illinois,Central Time (US & Canada)
3,568928195581513728,negative,Southwest,,amccarthy19,,0,@SouthwestAir @dultch97 that's horse radish 😤🐴,,2015-02-20 16:20:26 -0800,,Atlantic Time (Canada)
4,568594180014014464,negative,United,,J_Okayy,,0,@united so our flight into ORD was delayed bec...,,2015-02-19 18:13:11 -0800,,Eastern Time (US & Canada)


In [5]:
training_data.describe(include="all")

Unnamed: 0,tweet_id,airline_sentiment,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
count,10980.0,10980,10980,31,10980,24,10980.0,10980,776,10980,7430,7403
unique,,3,6,3,6438,11,,10851,632,10758,2658,78
top,,negative,United,negative,JetBlueNews,Customer Service Issue,,@united thanks,"[0.0, 0.0]",2015-02-23 06:57:24 -0800,"New York, NY",Eastern Time (US & Canada)
freq,,6851,2928,24,43,9,,6,131,3,125,2819
mean,5.692169e+17,,,,,,0.080965,,,,,
std,779543800000000.0,,,,,,0.740303,,,,,
min,5.675883e+17,,,,,,0.0,,,,,
25%,5.685584e+17,,,,,,0.0,,,,,
50%,5.694753e+17,,,,,,0.0,,,,,
75%,5.698902e+17,,,,,,0.0,,,,,


In [36]:
tweets = training_data["text"].values
sentiments = training_data["airline_sentiment"].values

In [37]:
print(len(tweets))
print(len(sentiments))

10980
10980


In [38]:
train_documents = []
for i in range(len(tweets)):
    train_documents.append((word_tokenize(tweets[i]),sentiments[i]))

In [39]:
print(train_documents[0])

(['@', 'SouthwestAir', 'I', 'am', 'scheduled', 'for', 'the', 'morning', ',', '2', 'days', 'after', 'the', 'fact', ',', 'yes..not', 'sure', 'why', 'my', 'evening', 'flight', 'was', 'the', 'only', 'one', 'Cancelled', 'Flightled'], 'negative')


In [40]:
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [41]:
stops = set(stopwords.words('english'))
punctuations = list(string.punctuation)
stops.update(punctuations)

In [42]:
lemmatizer = WordNetLemmatizer()

In [43]:
def clean_tweet(words):
    output_words = []
    for w in words:
        if w.lower() not in stops:
            pos = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w, pos = get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words

In [44]:
train_documents = [(clean_tweet(tweet), sentiment) for tweet, sentiment in train_documents]

In [49]:
print(train_documents[0])

(['southwestair', 'schedule', 'morning', '2', 'day', 'fact', 'yes..not', 'sure', 'even', 'flight', 'one', 'cancelled', 'flightled'], 'negative')


In [50]:
sentiments = [sentiment for tweet, sentiment in train_documents]

In [51]:
tweet_documents = [" ".join(tweet) for tweet, sentiment in train_documents]

## Testing Data Cleaning

In [53]:
testing_data.head(5)

Unnamed: 0,tweet_id,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,569682010270101504,American,,zsalim03,,0,@AmericanAir In car gng to DFW. Pulled over 1h...,,2015-02-22 18:15:50 -0800,Texas,Central Time (US & Canada)
1,569608307184242688,American,,sa_craig,,0,"@AmericanAir after all, the plane didn’t land ...",,2015-02-22 13:22:57 -0800,"College Station, TX",Central Time (US & Canada)
2,567879304593408001,Southwest,,DanaChristos,,1,@SouthwestAir can't believe how many paying cu...,,2015-02-17 18:52:31 -0800,CT,Eastern Time (US & Canada)
3,569757651539660801,US Airways,,rossj987,,0,@USAirways I can legitimately say that I would...,,2015-02-22 23:16:24 -0800,"Washington, D.C.",Eastern Time (US & Canada)
4,569900705852608513,American,,tranpham18,,0,@AmericanAir still no response from AA. great ...,,2015-02-23 08:44:51 -0800,New York City,Eastern Time (US & Canada)


In [69]:
testing_data.describe(include="all")

Unnamed: 0,tweet_id,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
count,3660.0,3660,9,3660,8,3660.0,3660,243,3660,2477,2417
unique,,6,2,2805,6,,3650,209,3635,1258,59
top,,United,negative,JetBlueNews,Customer Service Issue,,@AmericanAir I purchased Main Cabin XT for f-1...,"[0.0, 0.0]",2015-02-23 15:25:46 -0800,USA,Eastern Time (US & Canada)
freq,,894,8,20,3,,2,33,3,43,925
mean,5.692226e+17,,,,,0.087705,,,,,
std,777903000000000.0,,,,,0.762048,,,,,
min,5.675924e+17,,,,,0.0,,,,,
25%,5.685633e+17,,,,,0.0,,,,,
50%,5.694842e+17,,,,,0.0,,,,,
75%,5.698927e+17,,,,,0.0,,,,,


In [55]:
tweets = testing_data["text"].values

In [57]:
print(len(tweets))

3660


In [62]:
test_documents = []
for i in range(len(tweets)):
    test_documents.append(word_tokenize(tweets[i]))

In [63]:
print(test_documents[0])

['@', 'AmericanAir', 'In', 'car', 'gng', 'to', 'DFW', '.', 'Pulled', 'over', '1hr', 'ago', '-', 'very', 'icy', 'roads', '.', 'On-hold', 'with', 'AA', 'since', '1hr', '.', 'Ca', "n't", 'reach', 'arpt', 'for', 'AA2450', '.', 'Wat', '2', 'do', '?']


In [64]:
test_documents = [clean_tweet(tweet) for tweet in test_documents]

In [65]:
print(test_documents[0])

['americanair', 'car', 'gng', 'dfw', 'pulled', '1hr', 'ago', 'icy', 'road', 'on-hold', 'aa', 'since', '1hr', 'ca', "n't", 'reach', 'arpt', 'aa2450', 'wat', '2']


In [66]:
test_tweet_documents = [" ".join(tweet) for tweet in test_documents]

In [67]:
print(test_tweet_documents[0])

americanair car gng dfw pulled 1hr ago icy road on-hold aa since 1hr ca n't reach arpt aa2450 wat 2


In [68]:
count_vect = TfidfVectorizer(max_features=5000, max_df=0.8, min_df=0.001)

In [70]:
x_train = count_vect.fit_transform(tweet_documents)
y_train = sentiments

x_test=count_vect.transform(test_tweet_documents)

In [72]:
count_vect.get_feature_names()

['000',
 '10',
 '100',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '1hr',
 '1k',
 '1st',
 '20',
 '200',
 '2015',
 '21',
 '22',
 '23',
 '24',
 '25',
 '26',
 '27',
 '2hrs',
 '2nd',
 '30',
 '35',
 '36',
 '3rd',
 '40',
 '400',
 '45',
 '50',
 '500',
 '60',
 '70',
 '700',
 '728',
 '75',
 '800',
 '90',
 'aa',
 'able',
 'absolute',
 'absolutely',
 'accept',
 'acceptable',
 'access',
 'accommodate',
 'account',
 'act',
 'actual',
 'actually',
 'add',
 'additional',
 'address',
 'advantage',
 'advise',
 'advisory',
 'afternoon',
 'age',
 'agent',
 'ago',
 'ah',
 'ahead',
 'air',
 'aircraft',
 'airline',
 'airlines',
 'airplane',
 'airport',
 'airway',
 'airways',
 'alert',
 'all',
 'allow',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'alternate',
 'although',
 'always',
 'amaze',
 'amazing',
 'america',
 'american',
 'americanair',
 'americanairlines',
 'amount',
 'amp',
 'angry',
 'announce',
 'announcement',
 'another',
 'answer',
 'anymore',
 'anyone',
 'anything'

In [75]:
svc = SVC()
svc.fit(x_train, y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [76]:
y_test = svc.predict(x_test)

In [77]:
y_test

array(['negative', 'negative', 'negative', ..., 'negative', 'negative',
       'negative'], dtype='<U8')

In [79]:
svc.score(x_test, y_test)

1.0

In [80]:
np.savetxt("prediction.csv", y_test, fmt='%s')