In [1]:
import pandas as pd
from string import punctuation
import numpy as np
import torch
from nltk.tokenize import word_tokenize
from torch.utils.data import TensorDataset, DataLoader
from torch import nn
from torch import optim
import json

In [2]:
# with open("sentiment labelled sentences/sentiment.txt") as f:
#     reviews = f.read()
    
# data = pd.DataFrame([review.split('\t') for review in reviews.split('\n')])

# data.columns = ['Review','Sentiment']

# data = data.sample(frac=1)

In [3]:
data= pd.read_csv('financial.csv', encoding = "ISO-8859-1")

In [4]:
data.head()

Unnamed: 0,Label,Comment
0,1,"According to Gran , the company has no plans t..."
1,1,Technopolis plans to develop in stages an area...
2,0,The international electronic industry company ...
3,2,With the new production plant the company woul...
4,2,According to the company 's updated strategy f...


In [5]:
def split_words_reviews(data):
    text = list(data['Comment'].values)
    clean_text = []
    for t in text:
        clean_text.append(t.translate(str.maketrans('', '', punctuation)).lower().rstrip())
    tokenized = [word_tokenize(x) for x in clean_text]
    all_text = []
    for tokens in tokenized:
        for t in tokens:
            all_text.append(t)
    return tokenized, set(all_text)

reviews, vocab = split_words_reviews(data)

reviews[0]

['according',
 'to',
 'gran',
 'the',
 'company',
 'has',
 'no',
 'plans',
 'to',
 'move',
 'all',
 'production',
 'to',
 'russia',
 'although',
 'that',
 'is',
 'where',
 'the',
 'company',
 'is',
 'growing']

In [6]:
def create_dictionaries(words):
    word_to_int_dict = {w:i+1 for i, w in enumerate(words)}
    int_to_word_dict = {i:w for w, i in word_to_int_dict.items()}
    return word_to_int_dict, int_to_word_dict

word_to_int_dict, int_to_word_dict = create_dictionaries(vocab)

int_to_word_dict

{1: 'combination',
 2: 'use',
 3: 'got',
 4: 'eur111',
 5: 'sofa',
 6: 'drinking',
 7: 'bark',
 8: 'updated',
 9: 'grodno',
 10: 'telpak',
 11: 'highrises',
 12: 'identification',
 13: 'berling',
 14: 'weaknesses',
 15: 'chains',
 16: 'learns',
 17: '3931',
 18: 'definitive',
 19: 'initiatives',
 20: 'giant',
 21: 'reliable',
 22: 'favourably',
 23: 'getinge',
 24: 'reaching',
 25: 'targeted',
 26: 'such',
 27: 'leminen',
 28: 'decisionmaking',
 29: 'assessing',
 30: '42',
 31: 'pedestrian',
 32: '360yearold',
 33: '1437',
 34: 'notified',
 35: '04012006',
 36: 'supplier',
 37: 'sentera',
 38: 'online',
 39: '527',
 40: 'ystok',
 41: 'pure',
 42: 'worldleading',
 43: 'hydrocopper',
 44: 'sustainability',
 45: 'gather',
 46: '2003',
 47: '296',
 48: 'painful',
 49: 'funds',
 50: 'leed',
 51: 'movements',
 52: 'unstable',
 53: 'repayment',
 54: 'stylised',
 55: '48012',
 56: 'associations',
 57: 'shall',
 58: 'process',
 59: 'motorcyclist',
 60: 'cooperation',
 61: 'impacts',
 62: 'later

In [7]:
with open('word_to_int_dict_financial.json', 'w') as fp:
    json.dump(word_to_int_dict, fp)

In [8]:
print(np.max([len(x) for x in reviews]))
print(np.mean([len(x) for x in reviews]))

52
20.49422203879488


In [9]:
def pad_text(tokenized_reviews, seq_length):
    
    reviews = []
    
    for review in tokenized_reviews:
        if len(review) >= seq_length:
            reviews.append(review[:seq_length])
        else:
            reviews.append(['']*(seq_length-len(review)) + review)
        
    return np.array(reviews)

padded_sentences = pad_text(reviews, seq_length = 50)

padded_sentences[0]

array(['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', 'according', 'to',
       'gran', 'the', 'company', 'has', 'no', 'plans', 'to', 'move',
       'all', 'production', 'to', 'russia', 'although', 'that', 'is',
       'where', 'the', 'company', 'is', 'growing'], dtype='<U25')

In [10]:
int_to_word_dict[0] = ''
word_to_int_dict[''] = 0

In [11]:
encoded_sentences = np.array([[word_to_int_dict[word] for word in review] for review in padded_sentences])

encoded_sentences[0]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,  9302,  3832,  6230,  8349,  9130,  1526, 10779,  6582,
        3832,  8453,  9223,  3805,  3832,   894,  5309,  6047,  7924,
        2516,  8349,  9130,  7924,  1731])

In [12]:
class SentimentLSTM(nn.Module):
    
    def __init__(self, n_vocab, n_embed, n_hidden, n_output, n_layers, drop_p = 0.8):
        super().__init__()
        
        self.n_vocab = n_vocab  
        self.n_layers = n_layers 
        self.n_hidden = n_hidden 
        
        self.embedding = nn.Embedding(n_vocab, n_embed)
        self.lstm = nn.LSTM(n_embed, n_hidden, n_layers, batch_first = True, dropout = drop_p)
        self.dropout = nn.Dropout(drop_p)
        self.fc = nn.Linear(n_hidden, n_output)
        self.sigmoid = nn.Sigmoid()
        
        
    def forward (self, input_words):
                          
        embedded_words = self.embedding(input_words)
        lstm_out, h = self.lstm(embedded_words) 
        lstm_out = self.dropout(lstm_out)
        lstm_out = lstm_out.contiguous().view(-1, self.n_hidden)
        fc_out = self.fc(lstm_out)                  
        sigmoid_out = self.sigmoid(fc_out)              
        sigmoid_out = sigmoid_out.view(batch_size, -1)  
        
        sigmoid_last = sigmoid_out[:, -1]
        
        return sigmoid_last, h
    
    
    def init_hidden (self, batch_size):
        
        device = "cpu"
        weights = next(self.parameters()).data
        h = (weights.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device),
             weights.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device))
        
        return h


In [13]:
n_vocab = len(word_to_int_dict)
n_embed = 50
n_hidden = 100
n_output = 1
n_layers = 2

net = SentimentLSTM(n_vocab, n_embed, n_hidden, n_output, n_layers)

In [14]:
# def label_encoding(data):
#     labels= np.empty([data.shape[0]])
    
#     for i in range(data.shape[0]):
#         if data['Label'][i]== 'negative':
#             labels[i]= 0
#         elif data['Label'][i]== 'neutral':
#             labels[i]= 1
#         else:
#             labels[i]= 2
            
#     return labels

# labels= label_encoding(data)

# labels.to_csv('labels.csv')

In [16]:
# labels_arr= np.array(labels)
# type(labels_arr)

In [17]:
labels = np.array([int(x) for x in data['Label'].values])
# labels= label_encoding(data)

train_ratio = 0.8
valid_ratio = (1 - train_ratio)/2

total = len(encoded_sentences)
train_cutoff = int(total * train_ratio)
valid_cutoff = int(total * (1 - valid_ratio))

train_x, train_y = torch.Tensor(encoded_sentences[:train_cutoff]).long(), torch.Tensor(labels[:train_cutoff]).long()
valid_x, valid_y = torch.Tensor(encoded_sentences[train_cutoff : valid_cutoff]).long(), torch.Tensor(labels[train_cutoff : valid_cutoff]).long()
test_x, test_y = torch.Tensor(encoded_sentences[valid_cutoff:]).long(), torch.Tensor(labels[valid_cutoff:])

train_data = TensorDataset(train_x, train_y)
valid_data = TensorDataset(valid_x, valid_y)
test_data = TensorDataset(test_x, test_y)

batch_size = 1

train_loader = DataLoader(train_data, batch_size = batch_size, shuffle = True)
valid_loader = DataLoader(valid_data, batch_size = batch_size, shuffle = True)
test_loader = DataLoader(test_data, batch_size = batch_size, shuffle = True)

In [18]:
print_every = 2400
step = 0
n_epochs = 3
clip = 5  
criterion = nn.BCELoss()
optimizer = optim.Adam(net.parameters(), lr = 0.001)

In [24]:
for epoch in range(n_epochs):
    h = net.init_hidden(batch_size)
    labels= labels.unsqueeze(1)
    
    for inputs, labels in train_loader:
        step += 1  
        net.zero_grad()
        output, h = net(inputs)
        loss = criterion(output.squeeze(), labels.float())
        loss = criterion(output.squeeze(), labels())
        loss.backward()
        nn.utils.clip_grad_norm(net.parameters(), clip)
        optimizer.step()
        
        if (step % print_every) == 0:            
            net.eval()
            valid_losses = []

            for v_inputs, v_labels in valid_loader:
                       
                v_output, v_h = net(v_inputs)
                v_loss = criterion(v_output.squeeze(), v_labels.float())
                valid_losses.append(v_loss.item())

            print("Epoch: {}/{}".format((epoch+1), n_epochs),
                  "Step: {}".format(step),
                  "Training Loss: {:.4f}".format(loss.item()),
                  "Validation Loss: {:.4f}".format(np.mean(valid_losses)))
            net.train()

ValueError: Using a target size (torch.Size([1])) that is different to the input size (torch.Size([])) is deprecated. Please ensure they have the same size.

In [25]:
for epoch in range(n_epochs):
    h = net.init_hidden(batch_size)
    
    for inputs, labels in train_loader:
        step += 1  
        net.zero_grad()
        output, h = net(inputs)
        loss = criterion(output.squeeze(), labels.squeeze().float())
        loss.backward()
        nn.utils.clip_grad_norm(net.parameters(), clip)
        optimizer.step()
        
        if (step % print_every) == 0:            
            net.eval()
            valid_losses = []

            for v_inputs, v_labels in valid_loader:
                       
                v_output, v_h = net(v_inputs)
                v_loss = criterion(v_output.squeeze(), v_labels.squeeze().float())
                valid_losses.append(v_loss.item())

            print("Epoch: {}/{}".format((epoch+1), n_epochs),
                  "Step: {}".format(step),
                  "Training Loss: {:.4f}".format(loss.item()),
                  "Validation Loss: {:.4f}".format(np.mean(valid_losses)))
            net.train()

  nn.utils.clip_grad_norm(net.parameters(), clip)


Epoch: 1/3 Step: 2400 Training Loss: 0.0000 Validation Loss: 26.5979
Epoch: 2/3 Step: 4800 Training Loss: 0.0000 Validation Loss: 26.5979
Epoch: 2/3 Step: 7200 Training Loss: 0.0000 Validation Loss: 26.5979
Epoch: 3/3 Step: 9600 Training Loss: 0.0000 Validation Loss: 26.5979


In [26]:
torch.save(net.state_dict(), 'model_fin.pkl')

In [35]:
type(v_output)


torch.Tensor

In [36]:
net = SentimentLSTM(n_vocab, n_embed, n_hidden, n_output, n_layers)
net.load_state_dict(torch.load('model_fin.pkl'))

<All keys matched successfully>

In [37]:
net.eval()
test_losses = []
num_correct = 0

for inputs, labels in test_loader:

    test_output, test_h = net(inputs)
    loss = criterion(test_output, labels)
    test_losses.append(loss.item())
    
    preds = torch.round(test_output.squeeze())
    correct_tensor = preds.eq(labels.float().view_as(preds))
    correct = np.squeeze(correct_tensor.numpy())
    num_correct += np.sum(correct)
    
print("Test Loss: {:.4f}".format(np.mean(test_losses)))
print("Test Accuracy: {:.2f}".format(num_correct/len(test_loader.dataset)))    

Test Loss: 58.5567
Test Accuracy: 0.20


In [38]:
def preprocess_review(review):
    review = review.translate(str.maketrans('', '', punctuation)).lower().rstrip()
    tokenized = word_tokenize(review)
    if len(tokenized) >= 50:
        review = tokenized[:50]
    else:
        review= ['0']*(50-len(tokenized)) + tokenized
    
    final = []
    
    for token in review:
        try:
            final.append(word_to_int_dict[token])
            
        except:
            final.append(word_to_int_dict[''])
        
    return final

In [39]:
def predict(review):
    net.eval()
    words = np.array([preprocess_review(review)])
    padded_words = torch.from_numpy(words)
    pred_loader = DataLoader(padded_words, batch_size = 1, shuffle = True)
    for x in pred_loader:
        output = net(x)[0].item()
    
    msg = "This is a positive review." if output >= 0.5 else "This is a negative review."
    print(msg)
    print('Prediction = ' + str(output))

In [44]:
predict("Tesla is doing very bad. But elon seems to be worried with the war. Lets see what happens")

This is a positive review.
Prediction = 1.0


In [45]:
predict("It was not good")

This is a positive review.
Prediction = 1.0
