In [78]:
import pandas as pd
df = pd.read_csv('datasets/Tweets.csv', header=0, sep=',')
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [79]:
df = df[['airline_sentiment', 'text']]
df.head()

Unnamed: 0,airline_sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...


In [80]:
import re

def norm1(row):
    return re.sub('[@`\'()\"#/?!.,+=-\=\+-<>()]', '', row['text'])

df['norm'] = df.apply(norm1, axis=1)
df.drop(['text'], inplace=True, axis=1)
df.head()

Unnamed: 0,airline_sentiment,norm
0,neutral,VirginAmerica What dhepburn said
1,positive,VirginAmerica plus youve added commercials to ...
2,neutral,VirginAmerica I didnt today Must mean I need t...
3,negative,VirginAmerica its really aggressive to blast o...
4,negative,VirginAmerica and its a really big bad thing a...


In [81]:
sentiment_map = {
    'positive': 1,
    'neutral': 0,
    'negative': -1
}
df['sentiment'] = df['airline_sentiment'].map(sentiment_map)
df.drop(columns=['airline_sentiment'], inplace=True)
df.head()

Unnamed: 0,norm,sentiment
0,VirginAmerica What dhepburn said,0
1,VirginAmerica plus youve added commercials to ...,1
2,VirginAmerica I didnt today Must mean I need t...,0
3,VirginAmerica its really aggressive to blast o...,-1
4,VirginAmerica and its a really big bad thing a...,-1


In [82]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(binary=True)
norm = df['norm']
cv.fit(norm)
X = cv.transform(norm)
y = df[['sentiment']]
X.shape

(14640, 14502)

In [90]:
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y)


num_classes = 3
y_train = to_categorical(y_train, num_classes)
y_test = to_categorical(y_test, num_classes)
y_train

array([[0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       ...,
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.]], dtype=float32)

In [85]:
from keras import models
from keras.layers import Dense, Dropout

model = models.Sequential()
model.add(Dense(100, activation='relu', input_shape=(14502,)))
model.add(Dropout(0.2))
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(num_classes, activation='sigmoid'))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 100)               1450300   
_________________________________________________________________
dropout_3 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 100)               10100     
_________________________________________________________________
dropout_4 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 3)                 303       
Total params: 1,460,703
Trainable params: 1,460,703
Non-trainable params: 0
_________________________________________________________________


In [86]:
model.compile(
    optimizer = "adam",
    loss = "binary_crossentropy",
    metrics = ["accuracy"]
)

W0728 16:56:09.692133 139812312352384 deprecation_wrapper.py:119] From /home/tumnus/venv/lib/python3.7/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0728 16:56:09.711279 139812312352384 deprecation_wrapper.py:119] From /home/tumnus/venv/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:3376: The name tf.log is deprecated. Please use tf.math.log instead.

W0728 16:56:09.715909 139812312352384 deprecation.py:323] From /home/tumnus/venv/lib/python3.7/site-packages/tensorflow/python/ops/nn_impl.py:180: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [87]:
results = model.fit(
  X_train, y_train,
  epochs = 2,
  batch_size = 128,
  validation_data = (X_test, y_test)
)

Train on 10980 samples, validate on 3660 samples
Epoch 1/2
Epoch 2/2


In [88]:
score = model.evaluate(X_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Test loss: 0.30098136829548194
Test accuracy: 0.8765938158895149
