Exploring The Data


In [0]:
# Import the pandas library to read  dataset
import pandas as pd
# Get the package from sklearn for preparing  dataset to train and test the model with
from sklearn.model_selection import train_test_split
#Import the numpy library to work with and manipulate the data
import numpy as np
#Import the nltk to work with data preprocessing
import nltk 

Mount Drive

In [0]:
#mount google drive into project
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Reading Dataset


In [0]:
#read the dataset from google drive
dataset =  pd.read_csv('/content/drive/My Drive/train.tsv', sep='\t')
dataset = dataset.dropna()
#show first 5 rows of the dataset
dataset.head(5)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [0]:

dataset.shape 

(156060, 4)

In [0]:
#shows the values of each sentiments
dataset.Sentiment.value_counts()


2    79582
3    32927
1    27273
4     9206
0     7072
Name: Sentiment, dtype: int64

Adjustable Parameters

In [0]:
#parameters to adjust to see the impact on outcome
remove_fPunct = True
fTokenizaton = True
fStopwords = True
fStemming = False
fLemmatization = True



Data Cleaning | Punctuations

In [0]:
#print the string punctuation
import string
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [0]:
# remove the punctuations
import nltk 
def remove_punctuation(text):
  text_nonpunctations = "".join([a for a in text if a not in string.punctuation])
  return text_nonpunctations

In [0]:
if remove_fPunct:
  dataset['Phrase'] = dataset['Phrase'].apply(lambda x: remove_punctuation(x)) 

Data Cleaning | Tokenization

In [0]:
#tokenization
import re

def tokenize(text):
  tokens = re.split('\W+', text)
  return tokens 

if fTokenizaton:
  dataset['Phrase'] = dataset['Phrase'].apply(lambda x: tokenize(x.lower()))

In [0]:
#remove the stopwords
import nltk 

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
stopwords = nltk.corpus.stopwords.words('english')
stopwords[0:10]


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [0]:
def remove_stopwords(txt_tokenized):
  txt_clean = [word for word in txt_tokenized if word not in stopwords]
  return txt_clean

if fStopwords:
  dataset['Phrase'] = dataset['Phrase'].apply(lambda x: remove_stopwords(x))

Data Cleaning | Stemming

In [0]:
#stemming
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [0]:
def stemming(tokenized_text):
  text = [ps.stem(word) for word in tokenized_text]
  return text

In [0]:
if fStemming:
  dataset['Phrase'] = dataset['Phrase'].apply(lambda x: stemming(x))

Data Cleaning | Lemmatization

In [0]:
#lemmatization
wn = nltk.WordNetLemmatizer()
ps = nltk.PorterStemmer()

def lemmatization(token_txt):
  text = [wn.lemmatize(word) for word in token_txt]
  return text


In [0]:
if fLemmatization:
  dataset['Phrase'] = dataset['Phrase'].apply(lambda x: lemmatization(x))

In [0]:
dataset.head(10)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,"[series, escapade, demonstrating, adage, good,...",1
1,2,1,"[series, escapade, demonstrating, adage, good,...",2
2,3,1,[series],2
3,4,1,[],2
4,5,1,[series],2
5,6,1,"[escapade, demonstrating, adage, good, goose]",2
6,7,1,[],2
7,8,1,"[escapade, demonstrating, adage, good, goose]",2
8,9,1,[escapade],2
9,10,1,"[demonstrating, adage, good, goose]",2


Splitting The Dataset

In [0]:
#split the dataset into training and testing
X_train, X_test, Y_train, Y_test = train_test_split(dataset['Phrase'], dataset['Sentiment'], test_size=0.3, random_state=2003)
documents = []
X_train = np.array(X_train.values.tolist())
Y_train = np.array(Y_train.values.tolist())
for i in range(len(X_train)):
  documents.append([list(X_train[i]), Y_train[i]]) 

X_test = np.array(X_test.values.tolist())
Y_test = np.array(Y_test.values.tolist())
for i in range(len(X_test)):
  documents.append([list(X_test[i]), Y_test[i]]) 

print(documents[0][0])

dataset = pd.DataFrame(documents, columns=['text', 'sentiment']) 
dataset['join'] = dataset.text.apply(' '.join)
dataset.head()

['age']


Unnamed: 0,text,sentiment,join
0,[age],2,age
1,"[gorgeous, epic]",4,gorgeous epic
2,"[fan, grossout, comedy]",2,fan grossout comedy
3,"[filmmaker, ascends, literally, olympus, art, ...",4,filmmaker ascends literally olympus art world
4,"[twisting, mystery]",2,twisting mystery


In [0]:
X_train, X_test, Y_train, Y_test = train_test_split(dataset['join'],  dataset['sentiment'], test_size=0.3, random_state=2003)

In [0]:
from sklearn.feature_extraction.text import CountVectorizer , TfidfVectorizer 
from keras.utils import to_categorical

vectorizer = TfidfVectorizer(max_features = 2500)#, # ngram_range=(1, 1)) 
X = vectorizer.fit_transform(dataset["join"]) 
Y = dataset['sentiment'] 
 
X_train = vectorizer.transform(X_train).toarray()
Y_train = Y_train 
X_test = vectorizer.transform(X_test).toarray()
Y_test = Y_test



In [0]:
Y_test

13510     2
61932     0
82549     1
137718    3
121990    2
         ..
94224     2
135456    2
154729    1
23031     1
57870     2
Name: sentiment, Length: 46818, dtype: int64

In [0]:
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv1D, MaxPooling1D
from keras import backend as K

In [0]:
X_train.shape

(109242, 2500)

In [0]:
Y_train = keras.utils.to_categorical(Y_train, num_classes)
Y_test = keras.utils.to_categorical(Y_test, num_classes)
Y_test

array([[0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       ...,
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.]], dtype=float32)

In [0]:
from keras import backend as K
#method to count the recall
def recall(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall
#method to count the precision 
def precision(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision
#method to count the f1 score
def f1(y_true, y_pred):
    precision = precision_measure(y_true, y_pred)
    recall = recall_measure(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))


In [0]:
#defining the model
model = Sequential()
model.add(Conv1D(filters=64, kernel_size=3,
                 activation='relu',
                 input_shape=(2500,1)))

#convolution layer
model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
#max pooling layer
model.add(MaxPooling1D(pool_size=1))

model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
model.add(MaxPooling1D(pool_size=1))

#dropout layer
model.add(Dropout(rate = 0.25))
#flattern layer
model.add(Flatten())

#dence layer
model.add(Dense(10, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))
#print the model summary for understanding each layer
print(model.summary())

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_29 (Conv1D)           (None, 2498, 64)          256       
_________________________________________________________________
conv1d_30 (Conv1D)           (None, 2496, 64)          12352     
_________________________________________________________________
max_pooling1d_15 (MaxPooling (None, 2496, 64)          0         
_________________________________________________________________
conv1d_31 (Conv1D)           (None, 2494, 64)          12352     
_________________________________________________________________
conv1d_32 (Conv1D)           (None, 2492, 64)          12352     
_________________________________________________________________
max_pooling1d_16 (MaxPooling (None, 2492, 64)          0         
_________________________________________________________________
dropout_8 (Dropout)          (None, 2492, 64)         

In [0]:
#defining the learning rate
learning_rate=0.001
#defining the optimizer
Optimizer=keras.optimizers.Adam(lr=learning_rate)
#compile the model
model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=Optimizer,
              metrics=['accuracy',f1,precision,recall])

In [0]:
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

In [0]:
batchsize=64
Epochs=25
model.fit(X_train, Y_train,
          batch_size=batchsize,
          epochs=Epochs)

score = model.evaluate(X_test, Y_test, verbose=0)


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [0]:
from google.colab import drive
drive.mount('/content/drive')

In [0]:
print('Test loss:', score[0])
print('Test accuracy:', score[1])
print('F1 score:', score[2])
print('precision:', score[3])
print('recall', score[4])

Test loss: 1.005148004602503
Test accuracy: 0.6132684010423342
F1 score: 0.5907722033579055
precision: 0.6460010933394291
recall 0.545837071211927


In [0]:
#save the model
from keras.models import load_model

model.save('my_model.h5')