 # TP1. Fully Connected Networks
 
 #### Université Jean-Monnet, 2019-2020

## Part 3. Classification on Text Data(Sentiment Analysis)

In [39]:
import numpy as np
import pandas as pd 

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re
from keras import optimizers

**Question 1: Load the Sentiment.csv file, only keep columns `text` and `sentiment` and print out first 10 rows.**

In [2]:
sentiment_data = pd.read_csv(r"F:\MLDM\3rd Semester\Deep Learning and Applications\Session 1\TP\data\Sentiment.csv")
sentiment_data = sentiment_data[["text", "sentiment"]]
sentiment_data[:10]

Unnamed: 0,text,sentiment
0,RT @NancyLeeGrahn: How did everyone feel about...,Neutral
1,RT @ScottWalker: Didn't catch the full #GOPdeb...,Positive
2,RT @TJMShow: No mention of Tamir Rice and the ...,Neutral
3,RT @RobGeorge: That Carly Fiorina is trending ...,Positive
4,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,Positive
5,"RT @GregAbbott_TX: @TedCruz: ""On my first day ...",Positive
6,RT @warriorwoman91: I liked her and was happy ...,Negative
7,Going on #MSNBC Live with @ThomasARoberts arou...,Neutral
8,Deer in the headlights RT @lizzwinstead: Ben C...,Negative
9,RT @NancyOsborne180: Last night's debate prove...,Negative


**Question 2. Remove all rows with label Neutral**

In [3]:
sentiment_data = sentiment_data[sentiment_data.sentiment != "Neutral"]
sentiment_data

Unnamed: 0,text,sentiment
1,RT @ScottWalker: Didn't catch the full #GOPdeb...,Positive
3,RT @RobGeorge: That Carly Fiorina is trending ...,Positive
4,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,Positive
5,"RT @GregAbbott_TX: @TedCruz: ""On my first day ...",Positive
6,RT @warriorwoman91: I liked her and was happy ...,Negative
...,...,...
13866,RT @cappy_yarbrough: Love to see men who will ...,Negative
13867,RT @georgehenryw: Who thought Huckabee exceede...,Positive
13868,"RT @Lrihendry: #TedCruz As President, I will a...",Positive
13869,RT @JRehling: #GOPDebate Donald Trump says tha...,Negative


**Question 3. Print the number of Positive and Negative rows**

In [4]:
positive = sentiment_data[sentiment_data.sentiment == "Positive"]
negative = sentiment_data[sentiment_data.sentiment == "Negative"]
print(np.shape(positive))
print(np.shape(negative))

(2236, 2)
(8493, 2)


We see that the number of Negative rows is higher than the number of Positive rows. Today, we only forcus on balanced data, and so we would like to make the two equal.

**Question 4. Remove some Negative rows so that #Positive and Negative rows are equal**

In [5]:
negative = negative[:2236]
print(np.shape(positive))
print(np.shape(negative))
balanced_sentiment_data = pd.concat([positive, negative])

print(np.shape(balanced_sentiment_data))
balanced_sentiment_data = balanced_sentiment_data.sample(frac=1)
print(balanced_sentiment_data[:10])
data = balanced_sentiment_data

(2236, 2)
(2236, 2)
(4472, 2)
                                                    text sentiment
10290  “@msgoddessrises: There's some Cakebread in th...  Positive
1229   #GOPDebate Such flailing&amp;whining about "po...  Negative
10315  All this science talk is hot!!!! @BenCarson201...  Positive
2995   RT @TheBaxterBean:  Fact: Scott Walker Lies Mo...  Negative
3192   Imagine a #Trump Administration dealing with a...  Negative
12873  Wow #Trump you've taking advantage of the laws...  Positive
12111  RT @RWSurferGirl: The candidates don't have to...  Positive
390    @ChrisChristie @RandPaul Americans fear NOT. P...  Negative
12418  RT @RWSurferGirl: Thanks Fox News, you're rais...  Positive
707    How everyone watching the #GOPDebate last nigh...  Negative


We convert all data into lower case and remove all special characters

In [7]:
data['text'] = data['text'].apply(lambda x: x.lower())
data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))
for idx,row in data.iterrows():
    row[0] = row[0].replace('rt',' ')

Here are two examples of the texts

In [8]:
print(data.iloc[1].text)
print(data.iloc[2].text)
print(data['text'])

gopdebate such flailingampwhining about political correctness as a way to avoid answering any substantive q on warimmigrationmisogyny
all this science talk is hot bencarson2016 gopdebates
10290    msgoddessrises theres some cakebread in the fr...
1229     gopdebate such flailingampwhining about politi...
10315    all this science talk is hot bencarson2016 gop...
2995       thebaxterbean  fact scott walker lies more o...
3192     imagine a trump administration dealing with a ...
                               ...                        
2455       spwmthe3rd polisciumn did ted cruz just miss...
3655       jjauthor now if only realdonaldtrump will go...
8899       rwsurfergirl trump has got it right nobody w...
3781       swayzeguy foxnews hosts were fist bumping ea...
11079      lrihendry tedcruz headed into the presidenti...
Name: text, Length: 4472, dtype: object


We would like to map each sentence to an array of tokens, each word is a token. To make the array having fixed length, we pad enough 0 to the begining of each array

In [9]:
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['text'].values)

X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X)

In [10]:
X.shape
print(tokenizer.word_index)

{'gopdebate': 1, 'the': 2, 'to': 3, 'a': 4, 'and': 5, 'of': 6, 'i': 7, 'is': 8, 'gopdebates': 9, 'trump': 10, 'in': 11, 'for': 12, 'on': 13, 'it': 14, 'you': 15, 'realdonaldtrump': 16, 'this': 17, 'was': 18, 'that': 19, 'not': 20, 'about': 21, 'amp': 22, 'last': 23, 'at': 24, 'debate': 25, 'he': 26, 'be': 27, 'rwsurfergirl': 28, 'foxnews': 29, 'night': 30, 'have': 31, 'my': 32, 'are': 33, 'but': 34, 'as': 35, 'megynkelly': 36, 'with': 37, 'like': 38, 'they': 39, 'fox': 40, 'gop': 41, 'from': 42, 'up': 43, 'who': 44, 'do': 45, 'would': 46, 'if': 47, 'what': 48, 'candidates': 49, 'when': 50, 'all': 51, 'news': 52, 'rubio': 53, 'has': 54, 'so': 55, 'how': 56, 'think': 57, 'will': 58, 'we': 59, 'just': 60, 'get': 61, 'his': 62, 'cruz': 63, 'im': 64, 'me': 65, 'tedcruz': 66, 'more': 67, 'by': 68, 'its': 69, 'one': 70, 'need': 71, 'no': 72, 'donald': 73, 'out': 74, 'your': 75, 'said': 76, 'did': 77, 'can': 78, 'only': 79, 'carson': 80, 'job': 81, 'people': 82, 'time': 83, 'these': 84, 'pa': 

So the length of each array is 29. Here is the array corresponding to the two sentences above

In [11]:
print(X[0])
print(X[1])
print(data.iloc[1].text)
print(data.iloc[2].text)

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0 1355  618  167   11    2    9  353   14 1356 1946  110  272  141
  127]
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    1  812   21  236  561   35    4  267    3 1586 1049  142  949 1180
   13]
gopdebate such flailingampwhining about political correctness as a way to avoid answering any substantive q on warimmigrationmisogyny
all this science talk is hot bencarson2016 gopdebates


**Question 5. Make label data corresponding to X**

In [12]:
y =  pd.get_dummies(data.sentiment)
# y = to_categorical(y)
print(y[:3])
print(np.shape(y))

       Negative  Positive
10290         0         1
1229          1         0
10315         0         1
(4472, 2)


**Question 6. Split train/test sets randomly with ratio 2:1**

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

**Question 7. Build a quick Fully Connected network to obtain 55\% accuracy on test data**

In [47]:
model = Sequential()
model.add(Dense(12, input_dim=29, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(12, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(2, activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
print(model.summary())

Model: "sequential_14"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_62 (Dense)             (None, 12)                360       
_________________________________________________________________
dropout_49 (Dropout)         (None, 12)                0         
_________________________________________________________________
dense_63 (Dense)             (None, 12)                156       
_________________________________________________________________
dropout_50 (Dropout)         (None, 12)                0         
_________________________________________________________________
dense_64 (Dense)             (None, 2)                 26        
Total params: 542
Trainable params: 542
Non-trainable params: 0
_________________________________________________________________
None


In [48]:
model.fit(X_train, y_train,
          epochs=50,
          batch_size=128)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.callbacks.History at 0x222a99d0940>

In [49]:
_, train_acc = model.evaluate(X_train, y_train, batch_size=1000)
_, test_acc = model.evaluate(X_test, y_test, batch_size=1000)
print('train_acc', train_acc)
print('test_acc', test_acc)

train_acc 0.5720961093902588
test_acc 0.5697832107543945


**Question 8. Improve the architecture to achieve 65\% accuracy**

In [2]:
# model = Sequential()
# model.add(Dense(24, input_dim=29, activation='relu'))
# model.add(Dropout(0.2))
# model.add(Dense(12, activation='relu'))
# model.add(Dropout(0.1))
# model.add(Dense(6, activation='relu'))
# model.add(Dense(2, activation='softmax'))

# model.compile(loss='categorical_crossentropy',
#               optimizer='RMSprop',
#               metrics=['accuracy'])
# print(model.summary())

# model = Sequential()
# model.add(Dense(50, input_dim=29, activation='relu'))
# model.add(Dropout(0.3))

# model.add(Dense(2, activation='softmax'))

# sgd = optimizers.Nadam(learning_rate=0.002, beta_1=0.9, beta_2=0.999)

model = Sequential()
model.add(Dense(32, input_dim=29, activation='relu'))
# model.add(Dropout(0.03))
# model.add(Dense(17, input_dim=29, activation='relu'))
# model.add(Dense(20, input_dim=29, activation='relu'))

model.add(Dense(10, activation='relu'))
model.add(Dense(2, activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer="AdaDelta",
#               optimizer="adam",
              metrics=['accuracy'])
print(model.summary())

NameError: name 'Sequential' is not defined

In [277]:
model.fit(X_train, y_train,
          epochs=100,
          batch_size=128)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.callbacks.History at 0x222f225ccf8>

In [278]:
_, train_acc = model.evaluate(X_train, y_train, batch_size=16)
_, test_acc = model.evaluate(X_test, y_test, batch_size=16)
print('train_acc', train_acc)
print('test_acc', test_acc)

train_acc 0.6992656588554382
test_acc 0.5962059497833252


In [279]:
model = Sequential()
model.add(Dense(32, input_dim=29, activation='relu'))
model.add(Dropout(0.03))

model.add(Dense(10, activation='relu'))
model.add(Dense(2, activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer="AdaDelta",
#               optimizer="adam",
              metrics=['accuracy'])
print(model.summary())

model.fit(X_train, y_train,
          epochs=100,
          batch_size=128)

Model: "sequential_97"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_299 (Dense)            (None, 32)                960       
_________________________________________________________________
dropout_134 (Dropout)        (None, 32)                0         
_________________________________________________________________
dense_300 (Dense)            (None, 10)                330       
_________________________________________________________________
dense_301 (Dense)            (None, 2)                 22        
Total params: 1,312
Trainable params: 1,312
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 

<keras.callbacks.callbacks.History at 0x222f503c4e0>

In [280]:
_, train_acc = model.evaluate(X_train, y_train, batch_size=16)
_, test_acc = model.evaluate(X_test, y_test, batch_size=16)
print('train_acc', train_acc)
print('test_acc', test_acc)

train_acc 0.692923903465271
test_acc 0.574525773525238
