In [63]:
import numpy as np
import pandas as pd
import re
import nltk 
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Embedding

In [64]:
cols = ['OriginalTweet', 'Sentiment']

In [65]:
df_train = pd.read_csv('data/Corona_NLP_train.csv', encoding='latin1', usecols=cols)

In [66]:
df_train.head(3)

Unnamed: 0,OriginalTweet,Sentiment
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,advice Talk to your neighbours family to excha...,Positive
2,Coronavirus Australia: Woolworths to give elde...,Positive


In [67]:
df_train['Sentiment'].unique()

array(['Neutral', 'Positive', 'Extremely Negative', 'Negative',
       'Extremely Positive'], dtype=object)

In [68]:
#encode the target variable
df_train['Sentiment'] = df_train['Sentiment'].replace({'Extremely Negative':0, 'Negative':1, 'Neutral':2, 'Positive':3, 'Extremely Positive':4})
df_train.head(3)

Unnamed: 0,OriginalTweet,Sentiment
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,2
1,advice Talk to your neighbours family to excha...,3
2,Coronavirus Australia: Woolworths to give elde...,3


In [69]:
#preprocessing
#get rid of links
df_train['OriginalTweet'] = df_train['OriginalTweet'].apply(lambda x: re.sub(r'http\S+', '', x))
#remove special characters
df_train['OriginalTweet'] = df_train['OriginalTweet'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))
#remove numbers
df_train['OriginalTweet'] = df_train['OriginalTweet'].apply(lambda x: re.sub(r'\d+', '', x))
#remove stopwords
stop = stopwords.words('english')
df_train['OriginalTweet'] = df_train['OriginalTweet'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
#lemmatization
lemmatizer = WordNetLemmatizer()
df_train['OriginalTweet'] = df_train['OriginalTweet'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))
df_train.head(3)

Unnamed: 0,OriginalTweet,Sentiment
0,MeNyrbie PhilGahan Chrisitv,2
1,advice Talk neighbour family exchange phone nu...,3
2,Coronavirus Australia Woolworths give elderly ...,3


In [70]:
df_train.loc[2, 'OriginalTweet']

'Coronavirus Australia Woolworths give elderly disabled dedicated shopping hour amid COVID outbreak'

In [71]:
vocab_size = 50000
encoded_docs = [one_hot(d, vocab_size) for d in df_train['OriginalTweet']]

In [72]:
encoded_docs[0]

[28769, 33453, 7569]

In [73]:
#input length is the max length of the sentence
max_length = df_train['OriginalTweet'].str.len().max()
embeded_vecotr_size = 32
model = Sequential()
model.add(Embedding(vocab_size, embeded_vecotr_size, input_length=max_length))
model.add(Flatten())
model.add(Dense(5, activation='softmax'))
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()


Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 286, 32)           1600000   
                                                                 
 flatten_7 (Flatten)         (None, 9152)              0         
                                                                 
 dense_19 (Dense)            (None, 5)                 45765     
                                                                 
Total params: 1,645,765
Trainable params: 1,645,765
Non-trainable params: 0
_________________________________________________________________


In [74]:
#pad the sentences to the max length
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)

[[28769 33453  7569 ...     0     0     0]
 [20884 47202 43399 ...     0     0     0]
 [19764 37775 11539 ...     0     0     0]
 ...
 [35025  7451  4617 ...     0     0     0]
 [46077 37302   623 ...     0     0     0]
 [43573  5893 38660 ...     0     0     0]]


In [75]:
X = padded_docs
y = df_train['Sentiment']

In [76]:
#split the data into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [77]:
#train the model
model.fit(X_train, y_train, epochs=10, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10

KeyboardInterrupt: 

In [78]:
#evaluate the model
loss, accuracy = model.evaluate(X_test, y_test, verbose=1)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 58.187562
