# **Multi-class Classification using a multilayered 1D convolution network in PyTorch**

**Prerequisites**
> First, connect to a GPU runtime (Runtime > Change runtime type > Hardware accelerator > GPU > Save, then connect)

> Next, let's import the following libraries to work with our dataset: 

In [0]:
# Import the pandas library to read our dataset
import pandas as pd

# Get the train/test split package from sklearn for preparing our dataset to
# train and test the model with
from sklearn.model_selection import train_test_split

# Import the numpy library to work with and manipulate the data
import numpy as np

**Processing our dataset**
> First, let's read and clean the dataset

In [0]:
# import data from google drive
data = pd.read_csv('https://raw.githubusercontent.com/cacoderquan/Sentiment-Analysis-on-the-Rotten-Tomatoes-movie-review-dataset/master/train.tsv', sep='\t')

In [0]:
featuresofdata=data[['PhraseId', 'SentenceId','Phrase','Sentiment']] # please note I am not using sentiment. It is just for treating it as data.
labelsofdata=data[['Sentiment']]

In [0]:
# # split data in the beginning as per the instruction using train_test_split in 70:30 ration and random state 2003.
trainX, trainy,testX,testy=train_test_split(featuresofdata, labelsofdata, test_size=0.3,random_state=2003)

In [38]:
# Check the head of the dataframe
data.head(10)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2
5,6,1,of escapades demonstrating the adage that what...,2
6,7,1,of,2
7,8,1,escapades demonstrating the adage that what is...,2
8,9,1,escapades,2
9,10,1,demonstrating the adage that what is good for ...,2


In [39]:
# check the shape of df
data.shape

(156060, 4)

In [0]:
# Get number of unique sentences
numSentences = data['SentenceId'].max()

In [0]:
# extract full sentences only from the dataset
fullSentences = []
curSentence = 0
for i in range(data.shape[0]):
  if data['SentenceId'][i]> curSentence:
    fullSentences.append((data['Phrase'][i], data['Sentiment'][i]))
    curSentence = curSentence +1

In [42]:
len(fullSentences)

8544

In [0]:
# put data into a df
fullSentDf = pd.DataFrame(fullSentences,
                                columns=['Phrase', 'Sentiment'])

In [44]:
# Check class imbalance in tokenized sentences
data['Sentiment'].value_counts()

2    79582
3    32927
1    27273
4     9206
0     7072
Name: Sentiment, dtype: int64

In [45]:
# Check class imbalance in full sentences
fullSentDf['Sentiment'].value_counts()

3    2325
1    2203
2    1659
4    1282
0    1075
Name: Sentiment, dtype: int64

In [46]:
import nltk
import random
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [0]:
documents = []
#convert data into format for the previous labs

#use full dataset
#for i in range(data.shape[0]):
#  tmpWords = word_tokenize(data['Phrase'][i])
#  documents.append((tmpWords, data['Sentiment'][i]))

# Use only complete sentences
for i in range(fullSentDf.shape[0]):
  tmpWords = word_tokenize(fullSentDf['Phrase'][i])
  documents.append((tmpWords, fullSentDf['Sentiment'][i]))

In [48]:
random.seed(9001)
random.shuffle(documents)
print(documents[1][0])

['For', 'its', '100', 'minutes', 'running', 'time', ',', 'you', "'ll", 'wait', 'in', 'vain', 'for', 'a', 'movie', 'to', 'happen', '.']


In [49]:
len(documents)

8544

In [50]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer, LancasterStemmer
porter = PorterStemmer()
lancaster=LancasterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()
stopwords_en = stopwords.words("english")
punctuations="?:!.,;'\"-()"

#parameters to adjust to see the impact on outcome
remove_stopwords = True
useStemming = True
useLemma = False
removePuncs = True

for l in range(len(documents)):
  label = documents[l][1]
  tmpReview = []
  for w in documents[l][0]:
    newWord = w
    if remove_stopwords and (w in stopwords_en):
      continue
    if removePuncs and (w in punctuations):
      continue
    if useStemming:
      #newWord = porter.stem(newWord)
      newWord = lancaster.stem(newWord)
    if useLemma:
      newWord = wordnet_lemmatizer.lemmatize(newWord)
    tmpReview.append(newWord)
  documents[l] = (' '.join(tmpReview), label)
print(documents[2])

('ian holm conqu frant earthy napoleon', 3)


In [0]:
all_data = pd.DataFrame(documents,
                                columns=['text', 'sentiment'])
# Splits the dataset so 70% is used for training and 30% for testing
x_train_raw, x_test_raw, y_train_raw, y_test_raw = train_test_split(all_data['text'], all_data['sentiment'], test_size=0.3)

In [52]:
len(x_train_raw)

5980

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Transform each text into a vector of word counts
vectorizer = CountVectorizer(stop_words="english",
                             ngram_range=(1, 2))
#vectorizer = TfidfVectorizer(stop_words="english",
#                             ngram_range=(1, 2))
#X = vectorizer.fit_transform(all_data["text"])
#Y = all_data['sentiment']
x_train = vectorizer.fit_transform(x_train_raw)
y_train = y_train_raw
x_test = vectorizer.transform(x_test_raw)
y_test = y_test_raw

In [0]:
# Converts the datasets to numpy arrays to work with our PyTorch model
x_train_np = x_train.toarray()
y_train_np = np.array(y_train)

# Convert the testing data
x_test_np = x_test.toarray()
y_test_np = np.array(y_test)

In [55]:
x_train_np.shape

(5980, 50491)

**Creating the network**
> First, let's import the following libraries to build our network with:

In [0]:
# Import the pytorch library
import torch
torch.manual_seed(2003)

# Import the 1D convolution layer
# Since we’re inputting a 1-dimensional row of data, we can’t use 2D or 3D
from torch.nn import Conv1d

# Import the max pooling layer
from torch.nn import MaxPool1d

# Import the flatten layer
from torch.nn import Flatten

# Import the linear layer
from torch.nn import Linear

# Import the ReLU & Softmax activation function
from torch.nn.functional import relu, softmax, sigmoid

# Import the DataLoader and TensorDataset libraries from PyTorch
# to work with our datasets
from torch.utils.data import DataLoader, TensorDataset

> Next, let's define our model

In [0]:
# Our class MUST be a subclass of torch.nn.Module
class CnnClassifier(torch.nn.Module):
  # Define the initialization method
  def __init__(self, batch_size, inputs, outputs):

    # Initialize the superclass and store the parameters
    super(CnnClassifier, self).__init__()
    self.batch_size = batch_size
    self.inputs = inputs
    self.outputs = outputs

    # Define the input layer
    # (input channels, output channels, kernel size)
    self.input_layer = Conv1d(inputs, batch_size, 1)

    # Define a max pooling layer
    # (kernel size)
    self.max_pooling_layer = MaxPool1d(1)

    # Define another convolution layer
    self.conv_layer = Conv1d(batch_size, 128, 1)

    # Define a flatten layer
    self.flatten_layer = Flatten()

    # Define a linear layer
    # (inputs, outputs)
    self.linear_layer = Linear(128, 64)

    # Finally, define the output layer
    self.output_layer = Linear(64, outputs)

  # Define a method to feed inputs through the model
  def feed(self, input):
    # Reshape the entry so it can be fed to the input layer
    # Although we’re using 1D convolution, it still expects a 3D array to 
    # process in a 1D fashion
    input = input.reshape((self.batch_size, self.inputs, 1))

    # Get the output of the first layer and run it through the
    # the ReLU activation function
    output = relu(self.input_layer(input))

    # Get the output of the max pooling layer
    output = self.max_pooling_layer(output)

    # Get the output of the second convolution layer and run it
    # through the ReLU activation function
    output = relu(self.conv_layer(output))

    # Get the output of the flatten layer
    output = self.flatten_layer(output)

    # Get the output of the linear layer and run it through the
    # ReLU activation function
    output = self.linear_layer(output)

    # Finally, get the output of the output layer and return it
    output = self.output_layer(output)
    
    #We use softmax for multi-class classification
    output = softmax(output)
   
    return output

**Training the model**
> First, let's import the optimizer and performance measure we'll be using

In [0]:
# Import the SGD (stochastic gradient descent) package from pytorch for
# our optimizer
from torch.optim import SGD, Adam

# Import the L1Loss (mean absolute error loss) package from pytorch for
# our performance measure
from torch.nn import L1Loss, CrossEntropyLoss


> Next, let's define our model

In [59]:
# Define the batch size we'd like to use
batch_size = 256

# (batch size, X columns, Y columns)
model = CnnClassifier(batch_size, x_train.shape[1], 5)

# Set the model to use the GPU for processing
model.cuda()

CnnClassifier(
  (input_layer): Conv1d(50491, 256, kernel_size=(1,), stride=(1,))
  (max_pooling_layer): MaxPool1d(kernel_size=1, stride=1, padding=0, dilation=1, ceil_mode=False)
  (conv_layer): Conv1d(256, 128, kernel_size=(1,), stride=(1,))
  (flatten_layer): Flatten()
  (linear_layer): Linear(in_features=128, out_features=64, bias=True)
  (output_layer): Linear(in_features=64, out_features=5, bias=True)
)

> Next, let's create a method for running the batches of data through our model (this is using code from the first lab)

In [0]:
def categorical_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    max_preds = preds.argmax(dim = 1, keepdim = True) # get the index of the max probability
    correct = max_preds.squeeze(1).eq(y)
    return (correct.sum() / torch.FloatTensor([y.shape[0]])) / torch.FloatTensor([y.shape[0]])

In [61]:
from sklearn.preprocessing import LabelBinarizer
indicies=range(5)
encoder=LabelBinarizer()
labels=encoder.fit_transform(indicies)
print(labels[1])

[0 1 0 0 0]


In [0]:
# This method will return the average L1 loss and R^2 score 
# of the passed model on the passed DataLoader
def model_loss(model, dataset, train = False, optimizer = None):
  # Cycle through the batches and get the average L1 loss
  performance = L1Loss()
  criterion = CrossEntropyLoss()
  avg_loss = 0
  avg_accu = 0
  count = 0
  
  for input, output in iter(dataset):
    # Get the model's predictions for the training dataset
    predictions = model.feed(input)
    
    # Get the model's loss
    loss = performance(predictions, output)
    
    tmp_accu = categorical_accuracy(predictions, output)
    
    if(train):
      # Clear any errors so they don't cummulate
      optimizer.zero_grad()

      # Compute the gradients for our optimizer
      loss.backward()

      # Use the optimizer to update the model's parameters based on the
      # gradients
      optimizer.step()

    # Store the loss and update the counter
    avg_loss += loss.item()

    # Accumulate performance metrices
    avg_accu += tmp_accu.item()
    count += 1
    
  return avg_loss / count, avg_accu / count



> Finally, let's train the model (this is using code from the first lab)



In [69]:
def performtraining():
  # Define the number of epochs to train for
  epochs = 10

  # Define the performance measure and optimizer
  optimizer = Adam(model.parameters(), lr=1e-5)
  #optimizer = Adam(model.parameters())

  # Convert the training set into torch variables for our model using the GPU
  # as floats. The reshape is to remove a warning pytorch outputs otherwise.
  inputs = torch.from_numpy(x_train_np).cuda().float()
  outputs = torch.from_numpy(y_train_np.reshape(y_train_np.shape[0], 1)).cuda().float()

  # Create a DataLoader instance to work with our batches
  tensor = TensorDataset(inputs, outputs)
  loader = DataLoader(tensor, batch_size, shuffle=True, drop_last=True)

  # Start the training loop
  for epoch in range(epochs):
    # Cycle through the batches and get the average loss
    avg_loss, avg_accu = model_loss(model, loader, train=True, optimizer=optimizer)
    print("Epoch " + str(epoch + 1) + ":\n\tLoss = " + str(avg_loss))
    print("Accuracy = " + str(avg_accu))
    
performtraining()  

  return F.l1_loss(input, target, reduction=self.reduction)


Epoch 1:
	Loss = 1.9084579063498455
Accuracy = 0.262398097826087
Epoch 2:
	Loss = 1.9099524591280066
Accuracy = 0.26154891304347827
Epoch 3:
	Loss = 1.9078804461852363
Accuracy = 0.26154891304347827
Epoch 4:
	Loss = 1.9080842474232549
Accuracy = 0.26290760869565216
Epoch 5:
	Loss = 1.906555730363597
Accuracy = 0.2630774456521739
Epoch 6:
	Loss = 1.9077785637067712
Accuracy = 0.26222826086956524
Epoch 7:
	Loss = 1.908118232436802
Accuracy = 0.2630774456521739
Epoch 8:
	Loss = 1.9087975854459016
Accuracy = 0.26154891304347827
Epoch 9:
	Loss = 1.9120584519013115
Accuracy = 0.26205842391304346
Epoch 10:
	Loss = 1.905298932738926
Accuracy = 0.26222826086956524


In [64]:
#saving model to disk
torch.save(model, '1099371_1dconv_reg.pt')

  "type " + obj.__name__ + ". It won't be checked "


In [65]:
# loading model from disk
model = torch.load('1099371_1dconv_reg.pt')
model.eval()

CnnClassifier(
  (input_layer): Conv1d(50491, 256, kernel_size=(1,), stride=(1,))
  (max_pooling_layer): MaxPool1d(kernel_size=1, stride=1, padding=0, dilation=1, ceil_mode=False)
  (conv_layer): Conv1d(256, 128, kernel_size=(1,), stride=(1,))
  (flatten_layer): Flatten()
  (linear_layer): Linear(in_features=128, out_features=64, bias=True)
  (output_layer): Linear(in_features=64, out_features=5, bias=True)
)

> Lastly, we will test the model to see how it performs on the testing dataset

In [68]:
def performtesting():
  # Convert the testing set into torch variables for our model using the GPU as floats
  inputs = torch.from_numpy(x_test_np).cuda().float()
  outputs = torch.from_numpy(y_test_np.reshape(y_test_np.shape[0], 1)).cuda().float()

  # Create a DataLoader instance to work with our batches
  tensor = TensorDataset(inputs, outputs)
  loader = DataLoader(tensor, batch_size, shuffle=True, drop_last=True)
  avg_loss, avg_accu = model_loss(model, loader)
  
  print("The model's L1 loss is: " + str(avg_loss))
  print("The model's Accuracy is: " + str(avg_accu))

performtesting()

  return F.l1_loss(input, target, reduction=self.reduction)


The model's L1 loss is: 1.9233594059944152
The model's Accuracy is: 0.248046875
