In [176]:
import pandas as pd

# Load the labelled training tweets and the unlabelled test tweets.
# Both files are comma-separated and decoded as latin1.
train_df = pd.read_csv(
    'datasets/twitter-sentiment/train.csv',
    sep=',',
    encoding='latin1',
)
test_df = pd.read_csv(
    'datasets/twitter-sentiment/test.csv',
    sep=',',
    encoding='latin1',
)

train_df.head()

Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,is so sad for my APL frie...
1,2,0,I missed the New Moon trail...
2,3,1,omg its already 7:30 :O
3,4,0,.. Omgaga. Im sooo im gunna CRy. I'...
4,5,0,i think mi bf is cheating on me!!! ...


In [177]:
# Keep the test-set ids aside: the ItemID column is dropped from test_df
# in the next step, but it is needed again to build the submission.
test_label = test_df['ItemID']

In [156]:
# Keep only the columns the model needs. The explicit .copy() makes each
# result an independent frame, so the column assignment done during text
# cleaning does not raise SettingWithCopyWarning.
train_df = train_df[['Sentiment', 'SentimentText']].copy()
test_df = test_df[['SentimentText']].copy()

In [157]:
# Clean chars
import re

def normal(row):
    """Return the row's 'SentimentText' with everything except ASCII letters and spaces removed.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row passed by ``apply(..., axis=1)``)
        Must expose a ``'SentimentText'`` entry.

    Returns
    -------
    str
        The cleaned text. Missing values (None / NaN) yield an empty string;
        other non-string values are converted with ``str()`` before cleaning.
    """
    text = row['SentimentText']
    if not isinstance(text, str):
        # Empty CSV cells are read as NaN, and re.sub raises TypeError on
        # non-string input, which would abort the whole apply().
        if pd.isna(text):
            return ''
        text = str(text)
    return re.sub('[^a-zA-Z ]', '', text)

# Replace the raw tweet text with its cleaned version in both frames.
# assign() returns a new frame instead of writing into a frame that was
# produced by column selection, so no SettingWithCopyWarning is raised.
train_df = train_df.assign(SentimentText=train_df.apply(normal, axis=1))
test_df = test_df.assign(SentimentText=test_df.apply(normal, axis=1))
train_df.head()

Unnamed: 0,Sentiment,SentimentText
0,0,is so sad for my APL friend
1,0,I missed the New Moon trailer
2,1,omg its already O
3,0,Omgaga Im sooo im gunna CRy Ive be...
4,0,i think mi bf is cheating on me TT


In [162]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words encoding of the cleaned tweets (binary=True is passed to the vectorizer).
cv = CountVectorizer(binary=True)

norm = train_df['SentimentText']
norm2 = test_df['SentimentText']

# The vocabulary is fitted on the training tweets only; the test tweets are
# then projected onto that same vocabulary.
cv.fit(norm)
X = cv.transform(norm)
test_X = cv.transform(norm2)

# Training targets.
y = train_df['Sentiment']

test_X.shape

(299989, 113701)

In [165]:
# List the distinct label values present in the training data.
train_df.Sentiment.unique()

array([0, 1])

In [166]:
from sklearn.model_selection import train_test_split

# Hold out part of the labelled data; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=10,
)

In [167]:
from keras import models
from keras.layers import Dense, Dropout

# Feed-forward binary classifier over the bag-of-words features:
# two 300-unit ReLU layers with dropout, then one sigmoid output unit.
# The input width is read from the feature matrix instead of being
# hard-coded, so the model keeps working if the vocabulary size changes.
n_features = X.shape[1]

model = models.Sequential()
model.add(Dense(300, activation='relu', input_shape=(n_features,)))
model.add(Dropout(0.2))
model.add(Dense(300, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_22 (Dense)             (None, 300)               34110600  
_________________________________________________________________
dropout_15 (Dropout)         (None, 300)               0         
_________________________________________________________________
dense_23 (Dense)             (None, 300)               90300     
_________________________________________________________________
dropout_16 (Dropout)         (None, 300)               0         
_________________________________________________________________
dense_24 (Dense)             (None, 1)                 301       
Total params: 34,201,201
Trainable params: 34,201,201
Non-trainable params: 0
_________________________________________________________________


In [168]:
# Configure training: Adam optimizer, binary cross-entropy loss for the
# single sigmoid output, and accuracy as the reported metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [170]:
# Train for a single epoch in mini-batches of 300 rows; the held-out split
# is passed as validation data.
results = model.fit(
    X_train, y_train,
    batch_size=300,
    epochs=1,
    validation_data=(X_test, y_test),
)

Train on 74991 samples, validate on 24998 samples
Epoch 1/1


In [171]:
# Evaluate on the held-out split; score[0] is the loss and score[1] the
# accuracy metric requested at compile time.
score = model.evaluate(X_test, y_test, verbose=0)
test_loss, test_accuracy = score[0], score[1]
print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)

Test loss: 0.4791162698061354
Test accuracy: 0.7663813105096091


In [172]:
# Run the trained network on the vectorized test tweets; the final layer is a
# sigmoid, so these are raw scores that still need to be turned into labels.
predicate = model.predict(test_X)

In [173]:
# Turn the sigmoid scores into hard 0/1 labels by thresholding at 0.5.
# A single vectorized comparison replaces the per-element Python round()
# call (slow on a large frame; DataFrame.applymap is deprecated in recent
# pandas). A score of exactly 0.5 still maps to 0, as round() did, and the
# explicit cast guarantees integer labels whatever the prediction dtype.
res = pd.DataFrame((predicate > 0.5).astype(int))

In [174]:
# Sanity check: dimensions of the predicted-label frame.
res.shape

(299989, 1)

In [178]:
# Sanity check: the saved ids must have as many rows as the predictions,
# since the two are combined column-wise into the submission.
test_label.shape

(299989,)

In [179]:
import numpy as np
# Pair every test ItemID with its predicted label; both inputs are
# flattened to 1-D arrays so they line up row by row.
item_ids = test_label.values.ravel()
predicted = res.values.ravel()
submission = pd.DataFrame({"ItemID": item_ids, "Sentiment": predicted})
submission.head()

Unnamed: 0,ItemID,Sentiment
0,1,0
1,2,0
2,3,0
3,4,0
4,5,0


In [180]:
# Write the submission to disk; index=False leaves the DataFrame index out of the file.
submission.to_csv("TwitterResult.csv", index=False)