# <span style="color:turquoise">Text classification with PyTorch</span>


An example of using natural language processing for sentiment analysis. <br> We build a binary classifier that predicts whether a movie review is positive or negative.




__Dataset:__ IMDB movie reviews from Kaggle<br>
__Model:__ bag-of-words + RNN(?)


### <span style="color:teal">Todo:</span>

- ~~Read dataset~~
- ~~Preprocess text~~
- ~~Split into train, validation, and test sets~~
- Vectorize text
- Make model
- Make training function
- Make evaluation function
- Train
- Evaluate

In [82]:
import torch
import csv
import random

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

## <span style="color:teal">Read the data and split it into training, validation, and test sets</span>

In [83]:
class Reviews():
    """Reads the movie-review CSV and splits it into train/validation/test subsets."""

    def __init__(self):
        # Placeholders for the three partitions; no method of this class fills them.
        self.train = {}
        self.val = {}
        self.test = {}
        # Maps the sentiment string found in the CSV to its integer class label.
        self.LABELS = {"positive": 1, "negative": 0}
        # Number of reviews of each sentiment seen by the most recent read_data() call.
        self.COUNT = {"positive": 0, "negative": 0}

    def read_data(self, path="IMDB_Dataset.csv", seed=None):
        """Load the reviews from a CSV file and return them in shuffled order.

        The first row of the file is treated as a header and skipped. In every
        other row, column 0 is the review text and column 1 is the sentiment
        string, which must be a key of ``self.LABELS``.

        Args:
            path: Location of the CSV file.
            seed: Optional seed for the shuffle. ``None`` (the default) uses the
                global ``random`` state, so the order differs between runs
                unless that state was seeded by the caller.

        Returns:
            A list of ``[review_text, label]`` pairs, where ``label`` is the
            integer from ``self.LABELS``.

        Raises:
            KeyError: If a row carries a sentiment that is not in ``self.LABELS``.
        """
        # Reset in place so a second call does not double the per-class totals.
        for sentiment in self.COUNT:
            self.COUNT[sentiment] = 0

        dataset = []

        # An explicit encoding keeps the read independent of the platform default.
        with open(path, newline='', encoding="utf-8") as f:
            datareader = csv.reader(f, delimiter=',')
            next(datareader, None)  # skip the header row

            for row in datareader:
                if not row:
                    # csv.reader yields [] for blank lines; there is nothing to label.
                    continue
                dataset.append([row[0], self.LABELS[row[1]]])
                self.COUNT[row[1]] += 1

        shuffler = random if seed is None else random.Random(seed)
        shuffler.shuffle(dataset)

        return dataset

    def split_dataset(self, dataset, split=None):
        """Randomly partition ``dataset`` into train, validation and test subsets.

        Args:
            dataset: Any sized, indexable collection accepted by
                ``torch.utils.data.random_split``.
            split: Lengths of the subsets, forwarded to ``random_split``
                unchanged. When omitted, a 60/20/20 split is derived from
                ``len(dataset)``; the test subset absorbs any rounding
                remainder so the lengths always sum to the dataset size.

        Returns:
            The list of ``Subset`` objects produced by ``random_split``. The
            generator is seeded with 42, so the same input order always gives
            the same partition.
        """
        if split is None:
            total = len(dataset)
            # Integer arithmetic avoids float truncation surprises.
            n_train = total * 6 // 10
            n_val = total * 2 // 10
            split = [n_train, n_val, total - n_train - n_val]

        return torch.utils.data.random_split(
            dataset,
            split,
            generator=torch.Generator().manual_seed(42))

In [84]:
# Load the shuffled reviews and keep the per-class totals gathered while reading.
rev = Reviews()
data = rev.read_data()
pos_count, neg_count = rev.COUNT["positive"], rev.COUNT["negative"]


In [85]:
# Partition the loaded reviews into the three subsets returned by split_dataset.
train, val, test = rev.split_dataset(data)
# Printing the subsets shows only their object reprs, not their contents.
print(train, val, test)

<torch.utils.data.dataset.Subset object at 0x7f7214be7070> <torch.utils.data.dataset.Subset object at 0x7f72143b0910> <torch.utils.data.dataset.Subset object at 0x7f72143b0df0>


In [86]:
# Report the size of each subset, in train / validation / test order.
print(*map(len, (train, val, test)))

30000 10000 10000


## <span style="color:teal">Preprocess data</span>

In [76]:
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

import re

In [50]:
def preprocess(review,
               remove_stopwords=False,
               remove_html=False,
               remove_punct=False,
               lowercase=False,
               lemmatize=False):
    """Tokenize a raw review and apply the requested clean-up steps.

    Args:
        review: The review text as a single string.
        remove_stopwords: Drop tokens found in NLTK's English stop-word list.
            The comparison ignores case, so "The" is dropped as well as "the".
        remove_html: Replace HTML tags such as ``<br />`` with a space before
            tokenizing.
        remove_punct: Keep only tokens for which ``str.isalnum()`` is true.
            This also drops tokens that mix letters with punctuation.
        lowercase: Lower-case every remaining token.
        lemmatize: Pass every remaining token through ``WordNetLemmatizer``.

    Returns:
        The list of tokens that survive the selected steps.
    """
    if remove_html:
        # Tags must be stripped from the raw string: re.sub cannot operate on the
        # token list. The pattern stops at the first '>' so that text lying
        # between two separate tags is preserved.
        review = re.sub(r'<[^>]+>', ' ', review)

    review = word_tokenize(review)

    if remove_stopwords:
        stop_words = set(stopwords.words("english"))
        # Lower-case only for the lookup; the token itself keeps its casing
        # unless `lowercase` is requested below.
        review = [w for w in review if w.lower() not in stop_words]

    if remove_punct:
        review = [w for w in review if w.isalnum()]

    if lowercase:
        review = [w.lower() for w in review]

    if lemmatize:
        lemmatizer = WordNetLemmatizer()
        review = [lemmatizer.lemmatize(w) for w in review]

    return review
    


## <span style="color:teal">Vectorize the data as a bag-of-words</span>

In [51]:
def make_features_dict(reviews):
    """Build a vocabulary mapping each distinct token to an integer index.

    Indices are handed out in order of first appearance, starting at 0.

    Args:
        reviews: Iterable of tokenized reviews (each an iterable of tokens).

    Returns:
        Dict from token to its index.
    """
    vocabulary = {}
    for tokens in reviews:
        for token in tokens:
            # The next free index always equals the current vocabulary size;
            # setdefault leaves tokens that were already seen untouched.
            vocabulary.setdefault(token, len(vocabulary))
    return vocabulary

In [None]:
def vectorize_review(review, features_dict):
    """Translate a tokenized review into the list of its vocabulary indices.

    Args:
        review: Iterable of tokens.
        features_dict: Mapping from token to integer index.

    Returns:
        A list with one index per token found in ``features_dict``, in the
        order the tokens occur. Tokens missing from the vocabulary are skipped,
        so reviews containing unseen words can still be vectorized.
    """
    review_vector = []
    for w in review:
        idx = features_dict.get(w)
        if idx is None:
            # Out-of-vocabulary token: there is no index to emit for it.
            continue
        # append, not +=: extending a list with a bare int raises TypeError.
        review_vector.append(idx)
    return review_vector
        