<a href="https://colab.research.google.com/github/miguelangel43/IS_NLP_Assignment/blob/main/NLP_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import files
# Colab-only step: upload the dataset file from the local machine.
# The "Read dataset" cell below expects it to be named yelp_20200.csv.
uploaded = files.upload()

Saving yelp_20200.csv to yelp_20200.csv


In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
# Fetch the NLTK resources used by the preprocessing cells below:
# 'wordnet' backs WordNetLemmatizer, 'stopwords' backs stopwords.words('english').
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Load the Yelp reviews from the CSV uploaded in the first cell
df = pd.read_csv("yelp_20200.csv")

# Pull the two columns we need out as plain Python lists:
# the review bodies are the model input, the star ratings are the targets
text_unprocessed, labels = df['text'].tolist(), df['stars'].tolist()

In [None]:
# --------------------------------------------------------------------
# Preprocessing the text
# --------------------------------------------------------------------

# 1. Tokenization
# Lower-case every review and split it into runs of word characters (\w+).
# Punctuation never matches the pattern, so it is dropped as a side effect.
tokenizer = RegexpTokenizer(r'\w+')
texts_tok = [tokenizer.tokenize(review.lower()) for review in tqdm(text_unprocessed)]

100%|██████████| 20200/20200 [00:00<00:00, 22114.64it/s]


In [None]:
# 2. Remove stop words
# The last 63 entries of NLTK's English stop-word list are kept in the text
# because they are considered useful for sentiment analysis.
# NOTE(review): the slice depends on the order/length of the installed NLTK
# list -- confirm it still excludes the intended words after an NLTK upgrade.
stwords = set(stopwords.words('english')[: -63])


def remove_stopwords(text):
    """Return the tokens of `text` that are not in `stwords`, order preserved."""
    kept = []
    for token in text:
        if token not in stwords:
            kept.append(token)
    return kept


# Filter every review, then keep only its first 20 remaining tokens
# (the reviews are shortened because of memory issues further down).
texts_st = [remove_stopwords(tokens)[:20] for tokens in tqdm(texts_tok)]


100%|██████████| 20200/20200 [00:00<00:00, 50365.32it/s]


In [None]:
# 3. Lemmatization
lemmatizer = WordNetLemmatizer()


def word_lemmatizer(text):
    """Return a new list with every token of `text` passed through the WordNet lemmatizer."""
    return list(map(lemmatizer.lemmatize, text))


texts_lem = [word_lemmatizer(tokens) for tokens in tqdm(texts_st)]

100%|██████████| 20200/20200 [00:01<00:00, 12459.90it/s]


In [None]:
# 4. Stemming
stemmer = PorterStemmer()


def word_stemmer(text):
    """Return a new list with every token of `text` reduced to its Porter stem."""
    return list(map(stemmer.stem, text))


# `texts` is the fully preprocessed corpus used by all later cells
texts = [word_stemmer(tokens) for tokens in tqdm(texts_lem)]

100%|██████████| 20200/20200 [00:08<00:00, 2486.35it/s]


In [None]:
# NOTE(review): `labels_test` is first assigned in the "Preparing the input for
# training" cell further down, so on a fresh kernel run top-to-bottom this cell
# raises NameError. Move it below that cell (or delete it) before sharing.
len(labels_test)

200

In [None]:
# --------------------------------------------------------------------
# Preparing the input for training
# --------------------------------------------------------------------

SIZE_TRAIN_DATASET = 20000
SIZE_TEST_DATASET = 200

# Positional split: the first SIZE_TRAIN_DATASET reviews are the training set,
# the next SIZE_TEST_DATASET reviews are the held-out test set.
_train_span = slice(0, SIZE_TRAIN_DATASET)
_test_span = slice(SIZE_TRAIN_DATASET, SIZE_TRAIN_DATASET + SIZE_TEST_DATASET)

texts_train, labels_train = texts[_train_span], labels[_train_span]
texts_test, labels_test = texts[_test_span], labels[_test_span]

def word2indexMapping(textfile):
    """Build a word -> index dictionary from an iterable of words.

    Each distinct word gets the next free integer, starting at 0, in order of
    first appearance; repeated words keep the index of their first occurrence.
    """
    # dict.fromkeys() drops duplicates while preserving first-seen order
    return {word: index for index, word in enumerate(dict.fromkeys(textfile))}

# Flatten the training reviews into one long word list; the vocabulary is
# built from the training split only.
texts_words = []
for review_tokens in texts_train:
    texts_words.extend(review_tokens)

w2i = word2indexMapping(texts_words)

def onehot(length, position):
    """Return a 1-D integer vector of `length` entries with a 1 at `position`.

    This helper exists because materialising a full np.eye / np.identity
    matrix for the whole vocabulary consumes too much memory; only the single
    requested row is built here.

    Args:
        length: number of entries in the vector.
        position: index that should be set to 1.

    Returns:
        np.ndarray of integer dtype and shape (length,). If `position` equals
        none of 0..length-1 (e.g. it is negative or >= length) the vector is
        all zeros.
    """
    # Vectorised equality test instead of a Python-level loop over every index.
    return (np.arange(length) == position).astype(int)

#https://pytorch.org/tutorials/beginner/data_loading_tutorial.html

from torch.utils.data import Dataset, DataLoader
import torch
import numpy as np

class YelpDataset(Dataset):
    """Map-style dataset that serves each review as a padded one-hot matrix.

    Item `idx` is a pair `(data, label)`:
      * `data` is a float32 array of shape (max_length, len(w2i)). Row i is the
        one-hot encoding of the i-th in-vocabulary token of `texts[idx]`;
        tokens missing from `w2i` are skipped, reviews longer than
        `max_length` are truncated and shorter ones are padded with zero rows.
      * `label` is `labels[idx]`, returned unchanged.

    Args:
        texts: sequence of token lists, one per review.
        labels: sequence of labels aligned with `texts`.
        w2i: word -> column index mapping; indices are assumed to lie in
            [0, len(w2i)), as produced by word2indexMapping.
        max_length: number of rows (token slots) in every returned matrix.
    """

    def __init__(self, texts, labels, w2i, max_length=20):
        self.texts = texts
        self.labels = labels
        self.w2i = w2i
        self.length_dict = len(self.w2i)  # vocabulary size == one-hot width
        self.max_length = max_length

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Look up only the tokens that will actually be kept.
        word_ids = [self.w2i[word] for word in self.texts[idx] if word in self.w2i]
        word_ids = word_ids[:self.max_length]

        # Allocate the padded matrix once and switch on one cell per token,
        # instead of building a full one-hot row per token in Python.
        data = np.zeros((self.max_length, self.length_dict), dtype=np.float32)
        if word_ids:
            data[np.arange(len(word_ids)), word_ids] = 1.0

        return data, self.labels[idx]

# Both splits share the vocabulary built from the training reviews, so test
# tokens that never occurred in training are skipped by the dataset.
yelp_dataset_train = YelpDataset(texts_train, labels_train, w2i)
yelp_dataset_test = YelpDataset(texts_test, labels_test, w2i)

In [None]:
# --------------------------------------------------------------------
# Convolutional Neural Network
# --------------------------------------------------------------------

import torch.nn as nn
import torch.nn.functional as F

class ConvolutionalNeuralNetwork(nn.Module):
    """Two convolution/pooling stages followed by one linear classifier.

    Input batches are expected as (batch, input_length, input_dim): each of
    the `input_length` word slots is one Conv1d input channel and the
    convolutions slide along the one-hot (vocabulary) axis.

    Args:
        num_classes: number of output logits.
        input_dim: width of one one-hot row (vocabulary size). It determines
            the size of the final linear layer.
        input_length: number of word slots per review (Conv1d input channels).

    Raises:
        ValueError: if `input_dim` is too small to survive both
            convolution + pooling stages.
    """

    def __init__(self, num_classes=5, input_dim=yelp_dataset_train.length_dict, input_length=20):
        super(ConvolutionalNeuralNetwork, self).__init__()
        # input_length input channels, 256 output channels, width-7 kernels
        self.conv1 = nn.Conv1d(input_length, 256, kernel_size=7)
        self.conv2 = nn.Conv1d(256, 256, kernel_size=7)

        # Derive the flattened feature count from input_dim instead of
        # hard-coding it, so the model matches whatever vocabulary was built.
        pooled_length = self._pooled_length(self._pooled_length(input_dim))
        if pooled_length <= 0:
            raise ValueError(
                "input_dim=%r is too small for two kernel-7 convolutions with pool-3" % (input_dim,)
            )
        self.fc1 = nn.Linear(self.conv2.out_channels * pooled_length, num_classes)

    @staticmethod
    def _pooled_length(length, kernel_size=7, pool_size=3):
        """Length left after an unpadded stride-1 convolution and a max-pool of width `pool_size`."""
        return max((length - (kernel_size - 1)) // pool_size, 0)

    def forward(self, x):
        x = F.max_pool1d(F.relu(self.conv1(x)), 3)
        x = F.max_pool1d(F.relu(self.conv2(x)), 3)
        # Flatten everything except the batch dimension for the classifier
        x = x.view(-1, self.num_flat_features(x))
        x = self.fc1(x)
        return x

    def num_flat_features(self, x):
        """Return the product of all dimensions of `x` except the first (batch) one."""
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features
# %%
cnn = ConvolutionalNeuralNetwork()
# Use the first GPU when one is available and fall back to the CPU otherwise,
# so the notebook also runs on a runtime without CUDA.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
cnn.to(device)
# %%
from torch.optim import Adam
from IPython.display import clear_output
#from tqdm import tqdm

def trainCNN(model, train_dataset, epochs=3, batchsize=32, learning_rate=0.0001):
    """Train `model` in place with Adam and cross-entropy loss.

    Args:
        model: network to optimise; batches are moved to the device its
            parameters already live on.
        train_dataset: dataset yielding (input, label) pairs. Labels are
            expected to start at 1; they are shifted by -1 to form the class
            indices passed to the loss.
        epochs: number of full passes over `train_dataset`.
        batchsize: number of samples per optimisation step.
        learning_rate: Adam learning rate.
    """
    train_dataloader = DataLoader(train_dataset, batch_size=batchsize)
    optimizer = Adam(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    # Follow the model's own device rather than a notebook-level global.
    model_device = next(model.parameters()).device
    model.train()

    for epoch in range(epochs):
        for counter, (inp, labels) in enumerate(train_dataloader):
            # Redraw the progress lines in place; the epoch is printed after the
            # clear so it stays visible next to the batch counter.
            clear_output(wait=True)
            print('epoch: ', epoch + 1)
            print(counter)

            inp, labels = inp.to(model_device), labels.to(model_device)

            out = model(inp.float())
            loss = criterion(out, labels - 1)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    print('training succesful!')

# Train with the default hyper-parameters of trainCNN
trainCNN(model=cnn, train_dataset=yelp_dataset_train)

624
training succesful!


In [None]:
#torch.save(cnn.state_dict(), 'cnn_t1.pt')

In [None]:
#files.download('cnn_t1.pt')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# --------------------------------------------------------------------
# Testing
# --------------------------------------------------------------------

def test(model, test_dataset, batchsize=1):
    print('Testing...')
    test_dataloader = DataLoader(test_dataset, batch_size=batchsize)
    correct = 0
    total = 0

    for data, label in test_dataloader:
        data, label = data.to(device), label.to(device)
        out = model(data.float())
        if torch.argmax(out) == label-1:
            correct += 1
        total += 1
    print('Correct: ', correct)
    print('Total: ', total)
    print('Accuracy: ', correct/total)

In [None]:
# NOTE(review): cnn_t1.pt is only written by the torch.save cell above, which is
# currently commented out -- make sure the checkpoint exists before running this.
model = ConvolutionalNeuralNetwork()
# map_location lets a checkpoint saved on a GPU be restored on a CPU-only runtime
model.load_state_dict(torch.load("cnn_t1.pt", map_location=device))
model.eval()
model.to(device)

In [None]:
# Testing CNN
# NOTE(review): this evaluates the in-memory `cnn` trained above, not the
# `model` restored from cnn_t1.pt in the previous cell -- confirm which is intended.
test(model=cnn, test_dataset=yelp_dataset_test)

Testing...
Correct:  76
Total:  200
Accuracy:  0.38
