In [1]:
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch import nn,optim
import torch.nn.functional as F

## Loading The Dataset

In [2]:
def pretty_print_review_and_label(i):
    print(labels[i] + "\t:\t" + reviews[i][:80] + "...")

g = open('reviews.txt','r') # What we know!
reviews = list(map(lambda x:x[:-1],g.readlines()))
g.close()

g = open('labels.txt','r') # What we WANT to know!
labels = list(map(lambda x:x[:-1].upper(),g.readlines()))
g.close()

In [3]:
len(reviews)

25000

In [4]:
len(labels)

25000

## Creating a Vocabulary

In [5]:
vocab = set()
for review in reviews:
    for word in review.split(" "):
        vocab.add(word)

In [6]:
len(vocab)

74074

### Creating a dictionary for converting the words into numbers

In [7]:
word2index = {}
for i,word in enumerate(vocab):
    word2index[word] = i

In [8]:
word2index['']

0

## Creating the Input vector

In [None]:
def get_input_vector(review):
    x = np.zeros((1,len(vocab)))
    for word in review.split(" "):
        x[0][word2index[word]]+=1
    return x

### Converting Labels into numbers
    - 0 for negative
    - 1 for positive

In [9]:
y = np.zeros((len(labels),1))

In [10]:
for i in range(len(labels)):
    if labels[i].upper() == "POSITIVE":
        y[i][0] = 1

## Defining the Model

In [None]:
class SentimentNet(nn.Module):
    
    def __init__(self, vocab_size):
        super(SentimentNet, self).__init__()
        
        self.fc1 = nn.Linear(vocab_size,1000)
        self.fc2 = nn.Linear(1000,1)
    
    def forward(self,review):
        x = get_input_vector(review)
        x = torch.from_numpy(x)
        x = F.relu(self.fc1(x.float()))
        x = F.sigmoid(self.fc2(x))
        return x

In [None]:
model = SentimentNet(len(vocab))

In [None]:
loss = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr = 0.01)

In [None]:
train_reviews = reviews[:-1000]
test_reviews  = reviews[-1000:]

In [None]:
len(test_reviews)

1000

In [None]:
n_epochs = 1

for epoch in range(n_epochs):
    epoch_loss = 0.0
    correct = 0
    total = 0
    
    for i,review in enumerate(train_reviews[:100]):
        
        model.zero_grad()
        
        prob = model(review)
        
        total+=1
        
        if( ( (prob>0.5) & (y[i]==1) ) | ( (prob<=0.5) & (y[i]==0) ) ):
            correct+=1
            
        loss1 = loss(prob,torch.tensor(y[i]).float())
        
        epoch_loss += loss1.item()
        
        loss1.backward()
        
        optimizer.step()
        
        if i % 10 == 9: 
            accuracy = 100*correct/total
            print('Epoch: {}, Review: {}, Accuracy: {}'.format(epoch + 1, i+1, accuracy))
            
print('Finished Training')

  return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)


Epoch: 1, Review: 10, Accuracy: 40.0
Epoch: 1, Review: 20, Accuracy: 40.0
Epoch: 1, Review: 30, Accuracy: 43.333333333333336
Epoch: 1, Review: 40, Accuracy: 45.0
Epoch: 1, Review: 50, Accuracy: 46.0
Epoch: 1, Review: 60, Accuracy: 46.666666666666664
Epoch: 1, Review: 70, Accuracy: 50.0
Epoch: 1, Review: 80, Accuracy: 53.75
Epoch: 1, Review: 90, Accuracy: 56.666666666666664
Epoch: 1, Review: 100, Accuracy: 61.0
Finished Training


In [None]:
model.eval()
correct = 0
total = 0
    
for i,review in enumerate(test_reviews[:200]):
   
    prob = model(review)
        
    total+=1
        
    if( ( (prob>0.5) & (y[i+24000]==1) ) | ( (prob<=0.5) & (y[i+24000]==0) ) ):
        correct+=1
            
        
    if i % 100 == 99: 
        accuracy = 100*correct/total
        print('Epoch: {}, Review: {}, Test Accuracy: {}'.format(epoch + 1, i+1, accuracy))

Epoch: 1, Review: 100, Test Accuracy: 65.0
Epoch: 1, Review: 200, Test Accuracy: 66.0


## Approach 2:

In [11]:
def get_input_vector(review):
    x = np.zeros((1,len(vocab)))
    for word in review.split(" "):
        x[0][word2index[word]] = 1
    return x

In [None]:
import torch.nn.functional as F

In [None]:
class SentimentNet2(nn.Module):
    
    def __init__(self, vocab_size,embedding_dim):
        super(SentimentNet2, self).__init__()
        
        #self.embd = nn.Embedding(vocab_size,embedding_dim)
        self.fc1 = nn.Linear(vocab_size,256)
        self.fc2 = nn.Linear(256,1)
    
    def forward(self,review):
        
        x = get_input_vector(review)
        x = torch.from_numpy(x)
        #x = self.embd(x.long())
        x = F.relu(self.fc1(x.float()))
        x = self.fc2(x)
        x = F.sigmoid(x)
        
        return x

In [None]:
model2 = SentimentNet2(len(vocab),512)

In [None]:
train_reviews = reviews[:1000]

In [None]:
len(train_reviews)

1000

In [None]:
loss = nn.BCELoss()
optimizer = optim.Adam(model2.parameters(), lr = 0.01)

In [None]:
len(reviews[0])

832

In [None]:
reviews[0]

'bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers  . the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students . when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled . . . . . . . . . at . . . . . . . . . . high . a classic line inspector i  m here to sack one of your teachers . student welcome to bromwell high . i expect that many adults of my age think that bromwell high is far fetched . what a pity that it isn  t   '

In [None]:
n_epochs = 1

for epoch in range(n_epochs):
    epoch_loss = 0.0
    correct = 0
    total = 0
    
    for i,review in enumerate(train_reviews):
        
        model2.zero_grad()
        
        prob = model2(review)
        
        total+=1
        
        if( ( (prob>0.5) & (y[i]==1) ) | ( (prob<=0.5) & (y[i]==0) ) ):
            correct+=1
            
        target = torch.tensor(y[i])
        
        target = target.view(1,1)
            
        loss1 = loss(prob,target.float())
        
        epoch_loss += loss1.item()
        
        loss1.backward()
        
        optimizer.step()
        
        if i % 50 == 49: 
            accuracy = 100*correct/total
            print('Epoch: {}, Review: {}, Accuracy: {}'.format(epoch + 1, i+1, accuracy))
            
print('Finished Training')

Epoch: 1, Review: 50, Accuracy: 86.0
Epoch: 1, Review: 100, Accuracy: 88.0
Epoch: 1, Review: 150, Accuracy: 87.33333333333333
Epoch: 1, Review: 200, Accuracy: 83.5
Epoch: 1, Review: 250, Accuracy: 84.4
Epoch: 1, Review: 300, Accuracy: 82.33333333333333
Epoch: 1, Review: 350, Accuracy: 82.85714285714286
Epoch: 1, Review: 400, Accuracy: 82.5
Epoch: 1, Review: 450, Accuracy: 82.44444444444444
Epoch: 1, Review: 500, Accuracy: 83.0
Epoch: 1, Review: 550, Accuracy: 81.63636363636364
Epoch: 1, Review: 600, Accuracy: 81.83333333333333
Epoch: 1, Review: 650, Accuracy: 82.0
Epoch: 1, Review: 700, Accuracy: 82.28571428571429
Epoch: 1, Review: 750, Accuracy: 82.0
Epoch: 1, Review: 800, Accuracy: 81.375
Epoch: 1, Review: 850, Accuracy: 82.23529411764706
Epoch: 1, Review: 900, Accuracy: 82.0
Epoch: 1, Review: 950, Accuracy: 82.10526315789474
Epoch: 1, Review: 1000, Accuracy: 82.2
Finished Training


In [None]:
test_reviews = reviews[-1000:]

In [None]:
len(test_reviews)

1000

In [None]:
model2.eval()
correct = 0
total = 0
    
for i,review in enumerate(test_reviews):
   
    prob = model2(review)
        
    total+=1
        
    if( ( (prob>0.5) & (y[i+24000]==1) ) | ( (prob<=0.5) & (y[i+24000]==0) ) ):
        correct+=1
            
        
    if i % 100 == 99: 
        accuracy = 100*correct/total
        print('Epoch: {}, Review: {}, Test Accuracy: {}'.format(epoch + 1, i+1, accuracy))

Epoch: 1, Review: 100, Test Accuracy: 75.0
Epoch: 1, Review: 200, Test Accuracy: 74.0
Epoch: 1, Review: 300, Test Accuracy: 75.33333333333333
Epoch: 1, Review: 400, Test Accuracy: 75.25
Epoch: 1, Review: 500, Test Accuracy: 76.6
Epoch: 1, Review: 600, Test Accuracy: 76.0
Epoch: 1, Review: 700, Test Accuracy: 74.85714285714286
Epoch: 1, Review: 800, Test Accuracy: 73.875
Epoch: 1, Review: 900, Test Accuracy: 73.0
Epoch: 1, Review: 1000, Test Accuracy: 73.6


## Approach 3

### Using mini_batches and training on GPU

In [12]:
class SentimentNet3(nn.Module):
    
    def __init__(self, vocab_size):
        super(SentimentNet3, self).__init__()
        
        self.fc1 = nn.Linear(vocab_size,1000)
        self.fc2 = nn.Linear(1000,1)
    
    def forward(self,x):
        
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        x = F.sigmoid(x)
        
        return x

In [13]:
net = SentimentNet3(len(vocab))

In [14]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Assuming that we are on a CUDA machine, this should print a CUDA device:

print(device)

cuda:0


In [15]:
net.to(device)

SentimentNet3(
  (fc1): Linear(in_features=74074, out_features=1000, bias=True)
  (fc2): Linear(in_features=1000, out_features=1, bias=True)
)

In [16]:
train_reviews = reviews[:-1000]

In [17]:
len(train_reviews)

24000

In [18]:
test_reviews = reviews[-1000:]

In [19]:
len(test_reviews)

1000

In [20]:
loss = nn.BCELoss()
optimizer = optim.Adam(net.parameters(), lr = 0.01)

In [22]:
n_epochs = 1

for epoch in range(n_epochs):
    epoch_loss = 0.0
    correct = 0
    total = 0
    
    for i,review in enumerate(train_reviews):
        
        net.zero_grad()
        
        x = get_input_vector(review)
        
        x = torch.from_numpy(x)
        
        x = x.to(device)
        
        prob = net(x.float())
        
        total+=1
        
        if( ( (prob.cpu()>0.5) & (y[i]==1) ) | ( (prob.cpu()<=0.5) & (y[i]==0) ) ):
            correct+=1
            
        target = torch.tensor(y[i])
        
        target = target.view(1,1)
            
        target = target.to(device)
            
        loss1 = loss(prob,target.float())
        
        epoch_loss += loss1.item()
        
        loss1.backward()
        
        optimizer.step()
        
        if i % 100 == 99: 
            accuracy = 100*correct/total
            print('Epoch: {}, Review: {}, Accuracy: {}'.format(epoch + 1, i+1, accuracy))
            
print('Finished Training')



Epoch: 1, Review: 100, Accuracy: 93.0
Epoch: 1, Review: 200, Accuracy: 95.0
Epoch: 1, Review: 300, Accuracy: 89.66666666666667
Epoch: 1, Review: 400, Accuracy: 88.25
Epoch: 1, Review: 500, Accuracy: 86.2
Epoch: 1, Review: 600, Accuracy: 84.33333333333333
Epoch: 1, Review: 700, Accuracy: 84.57142857142857
Epoch: 1, Review: 800, Accuracy: 84.25
Epoch: 1, Review: 900, Accuracy: 84.55555555555556
Epoch: 1, Review: 1000, Accuracy: 84.5
Epoch: 1, Review: 1100, Accuracy: 84.54545454545455
Epoch: 1, Review: 1200, Accuracy: 84.66666666666667
Epoch: 1, Review: 1300, Accuracy: 84.53846153846153
Epoch: 1, Review: 1400, Accuracy: 84.14285714285714
Epoch: 1, Review: 1500, Accuracy: 83.46666666666667
Epoch: 1, Review: 1600, Accuracy: 83.3125
Epoch: 1, Review: 1700, Accuracy: 82.94117647058823
Epoch: 1, Review: 1800, Accuracy: 82.94444444444444
Epoch: 1, Review: 1900, Accuracy: 82.89473684210526
Epoch: 1, Review: 2000, Accuracy: 83.2
Epoch: 1, Review: 2100, Accuracy: 83.42857142857143
Epoch: 1, Review

In [23]:
net.eval()
correct = 0
total = 0
    
for i,review in enumerate(test_reviews):
   
    x = get_input_vector(review)
        
    x = torch.from_numpy(x)
        
    x = x.to(device)
        
    prob = net(x.float())
        
    total+=1
        
    if( ( (prob.cpu()>0.5) & (y[i+24000]==1) ) | ( (prob.cpu()<=0.5) & (y[i+24000]==0) ) ):
        correct+=1
            
        
    if i % 1000 == 999: 
        accuracy = 100*correct/total
        print('Epoch: {}, Review: {}, Test Accuracy: {}'.format(epoch + 1, i+1, accuracy))



Epoch: 1, Review: 1000, Test Accuracy: 84.2


**84.2% Accuracy on test data**