# Sentiment Prediction Using Deep Learning - Convolutional Neural Network

## Importing data

In [1]:
import pandas as pd

# The CSV ships without a header row, so both column names are supplied here.
df = pd.read_csv(
    "./data/dataset.csv",
    header=None,
    encoding='latin-1',
    names=["Sentiment", "Headlines"],
)
# Turn the textual labels into integer class ids with one mapping:
# negative -> 0, neutral -> 1, positive -> 2.
df['Sentiment'] = df['Sentiment'].replace({"negative": 0, "neutral": 1, "positive": 2})
df

Unnamed: 0,Sentiment,Headlines
0,1,"According to Gran , the company has no plans t..."
1,1,Technopolis plans to develop in stages an area...
2,0,The international electronic industry company ...
3,2,With the new production plant the company woul...
4,2,According to the company 's updated strategy f...
...,...,...
4841,0,LONDON MarketWatch -- Share prices ended lower...
4842,1,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4843,0,Operating profit fell to EUR 35.4 mn from EUR ...
4844,0,Net sales of the Paper segment decreased to EU...


In [2]:
# Class balance of the labelled dataset (rows per sentiment id).
df['Sentiment'].value_counts()

1    2879
2    1363
0     604
Name: Sentiment, dtype: int64

In [3]:
import sys

# Make the project-local helpers in ./lib importable; the guard keeps
# sys.path free of duplicate entries when this cell is re-run.
if './lib' not in sys.path:
    sys.path.insert(0, './lib')

# Import exactly the helpers this notebook uses instead of a wildcard, so it
# is obvious where each name comes from and nothing else leaks into the
# notebook namespace.
from sentiment_module import cleaning_data, cluster_extraction, combine_sentiments

# Run the project's cleaning helper on every headline; the result (a list of
# tokens per row, as the preview below shows) is kept in a new column.
df['Splitted'] = df['Headlines'].apply(cleaning_data)

[nltk_data] Downloading package stopwords to C:\Users\Long's
[nltk_data]     XPS13\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# First five rows, now including the tokenised 'Splitted' column.
df.head()

Unnamed: 0,Sentiment,Headlines,Splitted
0,1,"According to Gran , the company has no plans t...","[according, to, gran, the, company, has, no, p..."
1,1,Technopolis plans to develop in stages an area...,"[technopolis, plans, to, develop, in, stages, ..."
2,0,The international electronic industry company ...,"[the, international, electronic, industry, com..."
3,2,With the new production plant the company woul...,"[with, the, new, production, plant, the, compa..."
4,2,According to the company 's updated strategy f...,"[according, to, the, company, s, updated, stra..."


## Splitting data

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs.
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=0,
)

In [6]:
# Flatten the per-headline token lists of the training split into one list.
training_words = [
    token
    for headline_tokens in df_train['Splitted']
    for token in headline_tokens
]

In [7]:
# Peek at the first ten training tokens.
training_words[:10]

['the',
 'third',
 'quarter',
 'result',
 'also',
 'includes',
 'a',
 'euro',
 'provision',
 'for']

In [8]:
# Flatten the per-headline token lists of the test split into one list.
test_words = [
    token
    for headline_tokens in df_test['Splitted']
    for token in headline_tokens
]

In [9]:
# Peek at the first ten test tokens.
test_words[:10]

['the',
 'bristol',
 'port',
 'company',
 'has',
 'sealed',
 'a',
 'one',
 'million',
 'pound']

## Loading Google News Word2Vec model 

In [10]:
from gensim import models
# Location of the pretrained Google News vectors (gzip-compressed binary file).
# NOTE(review): the file name suggests 300-dimensional vectors, which is the
# width the embedding matrix later assumes — confirm.
word2vec_path = './data/GoogleNews-vectors-negative300.bin.gz'
# Load the vectors from the binary word2vec format; the resulting object is
# queried later with `word in word2vec` and `word2vec[word]`.
word2vec = models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

## Tokenizing & Padding Sequences

In [11]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Word-level tokenizer fitted on the raw training headlines only.
# NOTE(review): num_words is the total number of training tokens, not the
# vocabulary size; presumably this never drops a word — confirm the intent.
tokenizer = Tokenizer(
    num_words=len(training_words),
    char_level=False,
)
tokenizer.fit_on_texts(df_train['Headlines'].tolist())
train_word_index = tokenizer.word_index

# Integer-encode the training headlines and record the longest sequence; that
# length is the fixed input width used for every later padding call.
train_sequences = tokenizer.texts_to_sequences(df_train['Headlines'].tolist())
max_seq_len = max(map(len, train_sequences))

# Bring every training sequence to max_seq_len.
train_data = pad_sequences(sequences=train_sequences, maxlen=max_seq_len)

In [12]:
# Shape of the padded training matrix: one row per training headline, one
# column per position up to max_seq_len.
train_data.shape

(3876, 71)

In [13]:
# Encode the held-out headlines with the tokenizer that was fitted on the
# training split, then pad them to the training sequence length.
test_sequences = tokenizer.texts_to_sequences(df_test['Headlines'].tolist())
test_data = pad_sequences(sequences=test_sequences, maxlen=max_seq_len)

## Embedding

In [14]:
import numpy as np

# Dedicated, seeded generator so the random vectors handed to words missing
# from word2vec are identical on every fresh-kernel run.
oov_rng = np.random.default_rng(0)

# Row i holds the vector of the word whose tokenizer index is i. The extra
# first row is never written in the loop and stays all-zero
# (presumably the padding index — confirm against the tokenizer).
train_embeddings = np.zeros((len(train_word_index)+1, 300))
for word, index in train_word_index.items():
    if word in word2vec:
        # Pretrained vector available.
        train_embeddings[index, :] = word2vec[word]
    else:
        # Out-of-vocabulary word: fall back to uniform noise in [0, 1).
        train_embeddings[index, :] = oov_rng.random(300)

In [15]:
# Shape of the embedding matrix: (vocabulary size + 1, 300).
train_embeddings.shape

(9020, 300)

## CNN Modeling

In [16]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
# Embedding comes from tensorflow.keras like every other layer, so the model
# is not assembled from two different Keras packages.
from tensorflow.keras.layers import Dense, Dropout, Conv1D, GlobalMaxPooling1D, Embedding

In [17]:
# Use the tensorflow.keras implementation so the labels are prepared with the
# same Keras package the model itself is built from.
from tensorflow.keras.utils import to_categorical

# Padded integer sequences are the model inputs.
X_train = train_data
X_test = test_data

# One-hot encode the integer sentiment ids into three columns, one per class.
y_train = to_categorical(df_train['Sentiment'], num_classes=3)
y_test = to_categorical(df_test['Sentiment'], num_classes=3)

In [18]:
# Seed TensorFlow's global random generator before any layer is created so
# that weight initialisation is repeatable on a fresh kernel. Exact metrics
# may still vary slightly between hardware/back-ends.
tf.random.set_seed(0)

model = Sequential()

# Embedding layer initialised from the matrix built earlier and frozen
# (trainable=False), so the word vectors are not updated during training.
model.add(Embedding(input_dim=len(train_word_index)+1,
                    output_dim=300,
                    weights=[train_embeddings],
                    input_length=max_seq_len,
                    trainable=False))
# 200 convolution filters, each looking at windows of 3 consecutive tokens.
model.add(Conv1D(filters = 200, kernel_size = 3, padding='valid', activation = 'relu'))
# Keep the strongest response of every filter over the whole sequence.
model.add(GlobalMaxPooling1D())
model.add(Dense(128, activation='relu'))
# Three output units, one per sentiment class.
model.add(Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x17bd5bb9dc0>

In [19]:
# Sanity check: inputs and one-hot labels must have the same number of rows.
print(X_train.shape, y_train.shape, sep="\n")

(3876, 71)
(3876, 3)


In [20]:
# Loss and accuracy on the held-out split (returned as [loss, accuracy]).
model.evaluate(X_test, y_test, verbose=1)



[0.8279296159744263, 0.7845360636711121]

# Predicting

## CNBC data

### Headlines

In [21]:
%store -r df1
hl1_sequences = tokenizer.texts_to_sequences(df1['Headlines'].tolist())
# Pad Sequence
hl1_data = pad_sequences(hl1_sequences, maxlen=max_seq_len)
y_pred_hl1 = model.predict(hl1_data)

In [22]:
# Class probabilities for the first ten CNBC headlines.
y_pred_hl1[:10]

array([[1.30593675e-04, 1.06171094e-04, 9.99763191e-01],
       [8.76965292e-04, 9.98603165e-01, 5.19873225e-04],
       [8.57736841e-02, 5.61626395e-03, 9.08610046e-01],
       [2.43258327e-02, 6.52187407e-01, 3.23486745e-01],
       [9.72132385e-03, 9.69221056e-01, 2.10576281e-02],
       [4.05852776e-03, 9.95160997e-01, 7.80438771e-04],
       [2.70037475e-04, 9.99677300e-01, 5.26908443e-05],
       [2.91122943e-01, 7.08683908e-01, 1.93108179e-04],
       [1.52586075e-02, 5.91985881e-01, 3.92755538e-01],
       [7.84892961e-03, 8.89970481e-01, 1.02180615e-01]], dtype=float32)

In [23]:
from sentiment_module import cluster_extraction
# Convert the per-class probabilities into one label per CNBC headline.
# NOTE(review): judging by the displayed outputs this picks the
# highest-probability class per row — confirm in sentiment_module.
hl_sentiment = cluster_extraction(y_pred_hl1)

In [24]:
# Labels of the first ten CNBC headlines.
hl_sentiment[:10]

[2, 1, 2, 1, 1, 1, 1, 1, 1, 1]

### Description

In [25]:
# Same pipeline as for the headlines, applied to the CNBC descriptions.
# NOTE(review): descriptions longer than max_seq_len tokens are cut down by
# pad_sequences — confirm that losing part of the text is acceptable.
ds1_sequences = tokenizer.texts_to_sequences(df1['Description'].tolist())
ds1_data = pad_sequences(sequences=ds1_sequences, maxlen=max_seq_len)
y_pred_ds1 = model.predict(x=ds1_data)

In [26]:
# Class probabilities for the first ten CNBC descriptions.
y_pred_ds1[:10]

array([[8.2729312e-06, 9.9977845e-01, 2.1337251e-04],
       [2.4510868e-04, 9.9876726e-01, 9.8757981e-04],
       [1.2591290e-03, 9.9517488e-01, 3.5658858e-03],
       [7.2648941e-04, 9.9526066e-01, 4.0129069e-03],
       [9.7124004e-01, 9.4486261e-03, 1.9311352e-02],
       [2.4510868e-04, 9.9876726e-01, 9.8757981e-04],
       [1.4582376e-05, 9.9805403e-01, 1.9314173e-03],
       [9.8307067e-01, 1.5465150e-02, 1.4641940e-03],
       [1.8112676e-03, 6.0057271e-01, 3.9761603e-01],
       [1.6419889e-03, 7.7933502e-01, 2.1902300e-01]], dtype=float32)

In [27]:
# One label per CNBC description, followed by a preview of the first ten.
ds_sentiment = cluster_extraction(y_pred_ds1)
ds_sentiment[:10]

[1, 1, 1, 1, 0, 1, 1, 0, 1, 1]

### Combine

In [28]:
# Merge headline and description labels into one label per CNBC article
# using the project helper, then preview the first ten.
cnn_c_sentiment = combine_sentiments(hl_sentiment, ds_sentiment)
cnn_c_sentiment[:10]

[2, 1, 2, 1, 0, 1, 1, 0, 1, 1]

In [29]:
# Store the CNN labels in the shared CNBC result frame:
# restore the frame from IPython's %store, add this model's column, and
# write it back so it persists across kernels.
%store -r final_df1
final_df1['cnn_sentiment'] = cnn_c_sentiment
%store final_df1

Stored 'final_df1' (DataFrame)


## Reuters data

### Headlines

In [30]:
%store -r df2
hl2_sequences = tokenizer.texts_to_sequences(df2['Headlines'].tolist())
# Pad Sequence
hl2_data = pad_sequences(hl2_sequences, maxlen=max_seq_len)
y_pred_hl2 = model.predict(hl2_data)

In [31]:
# Class probabilities for the first ten Reuters headlines.
y_pred_hl2[:10]

array([[1.2337724e-05, 9.9997222e-01, 1.5466285e-05],
       [9.8997140e-01, 4.3338984e-03, 5.6946725e-03],
       [6.0721417e-03, 9.7219616e-01, 2.1731671e-02],
       [1.2982792e-01, 8.3164650e-01, 3.8525585e-02],
       [8.3935866e-03, 8.5302156e-01, 1.3858491e-01],
       [2.3936755e-03, 9.9640840e-01, 1.1979013e-03],
       [9.9466473e-04, 9.9819750e-01, 8.0779655e-04],
       [1.2796985e-01, 7.4257767e-01, 1.2945250e-01],
       [3.2251229e-04, 9.9966753e-01, 9.9892359e-06],
       [1.3614590e-01, 1.3370870e-01, 7.3014539e-01]], dtype=float32)

In [32]:
from sentiment_module import cluster_extraction
# Convert the per-class probabilities into one label per Reuters headline.
# Note that this rebinds hl_sentiment, which previously held the CNBC labels.
hl_sentiment = cluster_extraction(y_pred_hl2)

In [33]:
# Labels of the first ten Reuters headlines.
hl_sentiment[:10]

[1, 0, 1, 1, 1, 1, 1, 1, 1, 2]

### Description

In [34]:
# Same pipeline as for the headlines, applied to the Reuters descriptions.
# NOTE(review): descriptions longer than max_seq_len tokens are cut down by
# pad_sequences — confirm that losing part of the text is acceptable.
ds2_sequences = tokenizer.texts_to_sequences(df2['Description'].tolist())
ds2_data = pad_sequences(sequences=ds2_sequences, maxlen=max_seq_len)
y_pred_ds2 = model.predict(x=ds2_data)

In [35]:
# Class probabilities for the first ten Reuters descriptions.
y_pred_ds2[:10]

array([[2.1355215e-02, 6.8908483e-01, 2.8955993e-01],
       [4.9470669e-01, 4.0708533e-01, 9.8208003e-02],
       [6.2387860e-01, 3.7132826e-01, 4.7931010e-03],
       [1.4085230e-02, 9.8535776e-01, 5.5695890e-04],
       [3.0549955e-02, 1.8481511e-01, 7.8463489e-01],
       [9.7510433e-01, 2.4155397e-02, 7.4017962e-04],
       [1.2952825e-01, 2.3824328e-03, 8.6808926e-01],
       [1.0938160e-02, 9.8649967e-01, 2.5621513e-03],
       [3.0353493e-03, 9.5955366e-01, 3.7410989e-02],
       [9.4615109e-03, 1.1750282e-04, 9.9042094e-01]], dtype=float32)

In [36]:
# One label per Reuters description, followed by a preview of the first ten.
ds_sentiment = cluster_extraction(y_pred_ds2)
ds_sentiment[:10]

[1, 0, 0, 1, 2, 0, 2, 1, 1, 2]

### Combine

In [37]:
# Merge headline and description labels into one label per Reuters article
# using the project helper, then preview the first ten.
cnn_r_sentiment = combine_sentiments(hl_sentiment, ds_sentiment)
cnn_r_sentiment[:10]

[1, 0, 0, 1, 2, 0, 2, 1, 1, 2]

In [38]:
# Store the CNN labels in the shared Reuters result frame:
# restore the frame from IPython's %store, add this model's column, and
# write it back so it persists across kernels.
%store -r final_df2
final_df2['cnn_sentiment'] = cnn_r_sentiment
%store final_df2

Stored 'final_df2' (DataFrame)


## The Guardian data

### Headlines

In [39]:
%store -r df3
hl3_sequences = tokenizer.texts_to_sequences(df3['Headlines'].tolist())
# Pad Sequence
hl3_data = pad_sequences(hl3_sequences, maxlen=max_seq_len)
y_pred_hl3 = model.predict(hl3_data)

In [40]:
# Class probabilities for the first ten Guardian headlines.
y_pred_hl3[:10]

array([[1.0293522e-02, 9.8063022e-01, 9.0762125e-03],
       [1.6725290e-04, 9.9478418e-01, 5.0486671e-03],
       [2.9536288e-05, 9.9613994e-01, 3.8304848e-03],
       [3.7760653e-03, 9.5738083e-01, 3.8843133e-02],
       [2.8704008e-04, 9.9109811e-01, 8.6148502e-03],
       [2.7328570e-05, 9.8566890e-01, 1.4303677e-02],
       [1.7643274e-01, 8.2212013e-01, 1.4471314e-03],
       [1.8365324e-02, 9.0260494e-01, 7.9029776e-02],
       [4.3315251e-04, 9.9953413e-01, 3.2760443e-05],
       [8.1195042e-04, 9.9402314e-01, 5.1649171e-03]], dtype=float32)

In [41]:
from sentiment_module import cluster_extraction
# Convert the per-class probabilities into one label per Guardian headline.
# Note that this rebinds hl_sentiment, which previously held the Reuters labels.
hl_sentiment = cluster_extraction(y_pred_hl3)

In [42]:
# Labels of the first ten Guardian headlines.
hl_sentiment[:10]

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

### Combine

In [43]:
# Only headlines are scored for The Guardian in this notebook, so the headline
# labels are used as the final labels without any combining step.
cnn_g_sentiment = hl_sentiment
cnn_g_sentiment[:10]

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [45]:
# Store the CNN labels in the shared Guardian result frame:
# restore the frame from IPython's %store, add this model's column, and
# write it back so it persists across kernels.
%store -r final_df3
final_df3['cnn_sentiment'] = cnn_g_sentiment
%store final_df3

Stored 'final_df3' (DataFrame)


In [46]:
import os

# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing; exist_ok keeps the cell safe to re-run.
os.makedirs("./output", exist_ok=True)

# One CSV per news source, written without the DataFrame index column.
final_df1.to_csv("./output/cnbc.csv", index = False)
final_df2.to_csv("./output/reuters.csv", index = False)
final_df3.to_csv("./output/guardian.csv", index = False)