In [1]:
from dataset.legacy.tweets_dataset import TweetsDataset
# Instantiate the legacy tweets dataset with its `transform` flag switched off.
dataset = TweetsDataset(
    transform=False,
)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\naman\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\naman\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Model hyper-parameters shared by the encoder and decoder built below.
embedding_dim = 300      # width passed to Lstm as `embedding_dim`
num_hidden_nodes = 20    # width passed to Lstm as `hidden_dim`

# Vocabulary size is read from the dataset's Tweet field.
size_of_vocab = len(dataset.Tweet.vocab)

In [3]:
from model.lstm import Lstm
from model.encoder_decoder import EncoderDecoder
# Encoder: built over the full vocabulary, with staggered input enabled.
encoder = Lstm(
    vocab_size=size_of_vocab,
    embedding_dim=embedding_dim,
    hidden_dim=num_hidden_nodes,
    staggered_input=True,
)

# Decoder: no vocabulary of its own; its input width is taken from the
# encoder's fc output size.
decoder = Lstm(
    vocab_size=None,
    embedding_dim=encoder.fc.out_features,
    hidden_dim=num_hidden_nodes,
    output_dim=num_hidden_nodes,
    staggered_input=False,
)

In [4]:
# Combine the encoder and decoder into a single three-class model.
model = EncoderDecoder(
    encoder=encoder,
    decoder=decoder,
    num_classes=3,
)

In [5]:
#load weights and tokenizer
import torch
import os, pickle

# Inference runs on the CPU; the checkpoint is mapped onto this device while
# loading so a checkpoint saved from a GPU still loads on a CPU-only machine.
device = "cpu"

# Restore the trained parameters and switch the model to evaluation mode.
path='./saved_weights.pt'
model.load_state_dict(torch.load(path, map_location=device))
model.eval()

# Load the pickled token lookup used by classify_tweet. The context manager
# closes the file handle as soon as the object has been read.
# SECURITY: pickle.load can execute arbitrary code; only load trusted files.
with open('./tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)
#inference 

import spacy
# NOTE(review): the 'en' shortcut name is not available in spaCy v3+; an
# explicit package name (e.g. 'en_core_web_sm') is needed there — confirm
# which spaCy version this notebook targets before changing it.
nlp = spacy.load('en')
misclassified = []
correct = []
# Class index -> human-readable sentiment name.
categories = {0: "Negative", 1:"Positive", 2:"Neutral"}

In [6]:
def classify_tweet(tweet):
    """Predict the sentiment category name for a single tweet string.

    The tweet is split with the spaCy tokenizer (`nlp.tokenizer`), every token
    text is looked up in the module-level `tokenizer` object, and the resulting
    index sequence is passed to `model` as a batch containing one sequence,
    together with its length.

    Args:
        tweet: Tweet text to classify.

    Returns:
        The entry of `categories` for the highest-scoring class.

    NOTE(review): tokens are resolved with plain `tokenizer[...]` indexing;
    whether an out-of-vocabulary token raises depends on the type of the
    pickled object — confirm.
    """
    # Tokenize and convert each token to its integer index.
    token_ids = [tokenizer[tok.text] for tok in nlp.tokenizer(tweet)]

    # Shape (1, num_tokens): a batch holding just this tweet.
    batch = torch.LongTensor(token_ids).to(device).unsqueeze(0)
    lengths = torch.LongTensor([len(token_ids)])

    # Inference only: disable autograd so no graph is built for the forward pass.
    with torch.no_grad():
        prediction = model(batch, lengths)

    _, pred = torch.max(prediction, 1)

    return categories[pred.item()]

In [7]:
import pandas as pd
# Read the tweets CSV and keep only its first 50 rows for inspection.
dataframe = pd.read_csv("data/tweets.csv").iloc[:50]

In [8]:
from utils.augmentations import processTweet
# Map numeric labels to their category names. Values that are not keys of
# `categories` (for example the names written by an earlier run of this cell)
# pass through unchanged, so re-running the cell no longer raises KeyError.
dataframe["labels"] = dataframe["labels"].apply(lambda label: categories.get(label, label))
# Keep the cleaned text next to the raw tweet for display.
dataframe["cleaned tweets"] = dataframe["tweets"].apply(processTweet)
# NOTE(review): predictions are made on the raw "tweets" column, not on
# "cleaned tweets" — confirm this matches the text the model was trained on.
dataframe["prediction"] = dataframe["tweets"].apply(classify_tweet)

In [9]:
# Rows where the predicted category name equals the labelled one.
is_correct = dataframe['prediction'].eq(dataframe['labels'])
correct_classified_dataframe = dataframe.loc[is_correct]

In [10]:
# Rows where the predicted category name differs from the labelled one.
is_wrong = dataframe['prediction'].ne(dataframe['labels'])
misclassified_dataframe = dataframe.loc[is_wrong]

In [11]:
# Print at most 25 correctly classified tweets: raw text, cleaned text,
# predicted category and actual category, separated by a dashed rule.
correct_preview = correct_classified_dataframe.head(25)
for row_label, row in correct_preview.iterrows():
    print("\n\n----")
    print(f"Tweet : {row['tweets']}\n\nCleaned tweet : {row['cleaned tweets']}\n\nPrediction: {row['prediction']}\tActual: {row['labels']}")



----
Tweet : Obama has called the GOP budget social Darwinism. Nice try, but they believe in social creationism.

Cleaned tweet : obama has called the gop budget social darwinism nice try but they believe in social creationism

Prediction: Positive	Actual: Positive


----
Tweet : In his teen years, Obama has been known to use marijuana and cocaine.

Cleaned tweet : in his teen years obama has been known to use marijuana and cocaine

Prediction: Negative	Actual: Negative


----
Tweet : IPA Congratulates President Barack Obama for Leadership Regarding JOBS Act: WASHINGTON, Apr 05, 2012 (BUSINESS W... http://t.co/8le3DC8E

Cleaned tweet : ipa congratulates president barack obama for leadership regarding jobs act washington apr 05 2012 business w URL

Prediction: Negative	Actual: Negative


----
Tweet : RT @Professor_Why: #WhatsRomneyHiding - his connection to supporters of Critical Race Theory.... Oh wait, that was Obama, not Romney...

Cleaned tweet : rt AT USER whatsromneyhiding his c

In [12]:
# Print at most 25 misclassified tweets: raw text, cleaned text,
# predicted category and actual category, separated by a dashed rule.
misclassified_preview = misclassified_dataframe.head(25)
for row_label, row in misclassified_preview.iterrows():
    print("\n\n----")
    print(f"Tweet : {row['tweets']}\n\nCleaned tweet : {row['cleaned tweets']}\n\nPrediction: {row['prediction']}\tActual: {row['labels']}")



----
Tweet : RT @wardollarshome: Obama has approved more targeted assassinations than any modern US prez; READ & RT: http://t.co/bfC4gbBW

Cleaned tweet : rt AT USER obama has approved more targeted assassinations than any modern us prez read rt URL

Prediction: Negative	Actual: Positive


----
Tweet : one Chicago kid who says "Obama is my man" tells Jesse Watters that the gun violence in Chicago is like "World War 17"

Cleaned tweet : one chicago kid who says obama is my man tells jesse watters that the gun violence in chicago is like world war 17

Prediction: Positive	Actual: Negative


----
Tweet : #WhatsRomneyHiding? Obama's dignity and sense of humor? #p2 #tcot

Cleaned tweet : whatsromneyhiding obamas dignity and sense of humor p2 tcot

Prediction: Negative	Actual: Neutral


----
Tweet : Here's How Obama and the Democrats Will Win in 2012: Let's start by going back to the assorted polls questioning... http://t.co/zpg0TVm3

Cleaned tweet : heres how obama and the democrats will 

# The actual assignment

In [13]:
from model.lstm import Lstm
from model.encoder_decoder import EncoderDecoder
# Rebuild the encoder with the same sizes as before, this time with the
# Lstm `debug` flag turned on.
encoder = Lstm(
    vocab_size=size_of_vocab,
    embedding_dim=embedding_dim,
    hidden_dim=num_hidden_nodes,
    staggered_input=True,
    debug=True,
)

# Rebuild the decoder likewise; its input width is taken from the encoder's
# fc output size.
decoder = Lstm(
    vocab_size=None,
    embedding_dim=encoder.fc.out_features,
    hidden_dim=num_hidden_nodes,
    output_dim=num_hidden_nodes,
    staggered_input=False,
    debug=True,
)

In [14]:
# Wrap the debug-enabled encoder and decoder in a fresh three-class model.
model = EncoderDecoder(
    encoder=encoder,
    decoder=decoder,
    num_classes=3,
)

In [15]:
import torch
# Reload the trained weights into the debug-enabled model. map_location keeps
# the load working on a CPU-only machine even if the checkpoint was saved
# from a GPU.
path='./saved_weights.pt'
model.load_state_dict(torch.load(path, map_location="cpu"))
# eval() stays the final expression so the cell still displays the model summary.
model.eval()

EncoderDecoder(
  (encoder): Lstm(
    (embedding): Embedding(3098, 300)
    (lstm_cell): LSTMCell(300, 20, bias=False)
    (fc): Linear(in_features=20, out_features=16, bias=True)
  )
  (decoder): Lstm(
    (lstm_cell): LSTMCell(16, 20, bias=False)
    (fc): Linear(in_features=20, out_features=20, bias=True)
  )
  (linear_layer): Linear(in_features=20, out_features=3, bias=True)
)

In [16]:
# Run one hand-written example through the debug-enabled model; the call is
# the final expression so its return value is displayed as the cell result.
sample_tweet = "Obama is a very very bad person"
classify_tweet(sample_tweet)

Encoder 0
tensor([[-0.3326, -0.0291,  0.0139,  0.1516,  0.0039, -0.0206, -0.4432,  0.0555,
          0.4517,  0.5568, -0.6531,  0.2773,  0.0017, -0.2519,  0.0504,  0.0926,
          0.5178, -0.6054,  0.1508, -0.1905]], grad_fn=<MulBackward0>)
Encoder 1
tensor([[-0.4290, -0.5466,  0.2733, -0.0392,  0.2070,  0.3221,  0.0357, -0.0811,
          0.5935,  0.0762, -0.2338,  0.0524,  0.0108, -0.0077,  0.3277, -0.1900,
          0.0007, -0.4905, -0.0241,  0.1239]], grad_fn=<MulBackward0>)
Encoder 2
tensor([[-0.0246,  0.1054, -0.0060,  0.0719,  0.0039, -0.1117, -0.0065,  0.6118,
          0.0457, -0.0303, -0.1856, -0.1946, -0.2983,  0.4614,  0.0203, -0.0078,
         -0.3902, -0.5649,  0.0283,  0.6676]], grad_fn=<MulBackward0>)
Encoder 3
tensor([[-3.2853e-01, -4.2382e-03, -2.5963e-03,  3.7073e-02,  5.3789e-01,
         -3.1684e-02, -2.0090e-03,  1.6923e-01,  2.6905e-01, -5.0071e-01,
          9.9603e-02, -3.4340e-01, -9.9730e-02,  6.4148e-02,  1.3340e-01,
          2.1384e-01,  4.2437e-05,  3.4

'Negative'