In [1]:
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('seaborn')
import seaborn as sns
import pandas as pd
import re
from string import punctuation
import nltk
from nltk.corpus import stopwords, words
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

In [2]:
STOPWORDS = set(stopwords.words('english'))
ENGLISH_WORDS = set(words.words())
df_train = pd.read_csv(r"data\Corona_NLP_train.csv", encoding='latin1')
df_test = pd.read_csv(r"data\Corona_NLP_test.csv", encoding='latin1')

print("Size of the train dataset: {}".format(df_train.shape))
print("Size of the test dataset: {}".format(df_test.shape))

Size of the train dataset: (41157, 6)
Size of the test dataset: (3798, 6)


I'm defining preprocessing functions from previous notebook:

In [3]:
def recode_sentiment(y):

    if y in ['Extremely Positive', 'Positive']:
        return 'Positive'
    elif y in ['Extremely Negative', 'Negative']:
        return 'Negative'
    else:
        return 'Neutral'

def remove_url(string):
    return re.sub(r'https?://\S+|www\.\S+', '', string)

def remove_html(string):
    return re.sub(r'<.*?>', '', string)

def remove_numbers(string):
    return re.sub(r'\d+', '', string)

def remove_mentions(string):
    return re.sub(r'@\w+', '', string)

def remove_hashtags(string):
    return re.sub(r'#\w+', '', string)

def clean_data(tweet, return_tokenized=True):
    
    # Tokenization
    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(tweet)
    
    cleaned_tweet = []
    
    for token, tag in pos_tag(tokens):
        
        # Cleaning tokens with regular expressions
        token = remove_url(token)
        token = remove_html(token)
        token = remove_numbers(token)
        token = remove_mentions(token)
        token = remove_hashtags(token)
        
        # Lemmatizing tokens with part of speech recognition
        
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        
        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)
        
        token = token.lower()
        
        if token not in punctuation and token not in STOPWORDS and token in ENGLISH_WORDS:
            cleaned_tweet.append(token)
    #TfidfVectorizer accepts strings instead of lists of tokens
    if not return_tokenized:
        cleaned_tweet = ' '.join([token for token in cleaned_tweet])

    return cleaned_tweet

In [4]:
df_train['OriginalTweet'], df_test['OriginalTweet'] = \
    df_train['OriginalTweet'].apply(lambda x: clean_data(x, return_tokenized=True)),\
    df_test['OriginalTweet'].apply(lambda x: clean_data(x, return_tokenized=True))

df_train['Sentiment'], df_test['Sentiment'] = \
    df_train['Sentiment'].apply(recode_sentiment), df_test['Sentiment'].apply(recode_sentiment)

In [5]:
df_train.head(10)

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,[],Neutral
1,3800,48752,UK,16-03-2020,"[advice, talk, family, exchange, phone, number...",Positive
2,3801,48753,Vagabonds,16-03-2020,"[give, elderly, disable, dedicate, shopping, h...",Positive
3,3802,48754,,16-03-2020,"[food, stock, one, empty, please, panic, enoug...",Positive
4,3803,48755,,16-03-2020,"[ready, go, supermarket, outbreak, paranoid, f...",Negative
5,3804,48756,"ÃT: 36.319708,-82.363649",16-03-2020,"[news, first, confirm, covid, case, come, coun...",Positive
6,3805,48757,"35.926541,-78.753267",16-03-2020,"[cashier, grocery, store, share, insight, prov...",Positive
7,3806,48758,Austria,16-03-2020,"[supermarket, today, buy, toilet, paper]",Neutral
8,3807,48759,"Atlanta, GA USA",16-03-2020,"[due, covid, retail, store, classroom, open, b...",Positive
9,3808,48760,"BHAVNAGAR,GUJRAT",16-03-2020,"[corona, prevention, stop, buy, thing, cash, u...",Negative


In [6]:
df_train['TweetLen'], df_test['TweetLen'] = \
    df_train['OriginalTweet'].apply(lambda x: len(x)), df_test['OriginalTweet'].apply(lambda x: len(x))

In [7]:
df_train.head(10)

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment,TweetLen
0,3799,48751,London,16-03-2020,[],Neutral,0
1,3800,48752,UK,16-03-2020,"[advice, talk, family, exchange, phone, number...",Positive,22
2,3801,48753,Vagabonds,16-03-2020,"[give, elderly, disable, dedicate, shopping, h...",Positive,9
3,3802,48754,,16-03-2020,"[food, stock, one, empty, please, panic, enoug...",Positive,15
4,3803,48755,,16-03-2020,"[ready, go, supermarket, outbreak, paranoid, f...",Negative,14
5,3804,48756,"ÃT: 36.319708,-82.363649",16-03-2020,"[news, first, confirm, covid, case, come, coun...",Positive,22
6,3805,48757,"35.926541,-78.753267",16-03-2020,"[cashier, grocery, store, share, insight, prov...",Positive,12
7,3806,48758,Austria,16-03-2020,"[supermarket, today, buy, toilet, paper]",Neutral,5
8,3807,48759,"Atlanta, GA USA",16-03-2020,"[due, covid, retail, store, classroom, open, b...",Positive,20
9,3808,48760,"BHAVNAGAR,GUJRAT",16-03-2020,"[corona, prevention, stop, buy, thing, cash, u...",Negative,19


In [8]:
df_train, df_test = \
    df_train.loc[df_train['TweetLen'] > 0,], df_test.loc[df_test['TweetLen'] > 0,]

In [9]:
df_train.head(10)

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment,TweetLen
1,3800,48752,UK,16-03-2020,"[advice, talk, family, exchange, phone, number...",Positive,22
2,3801,48753,Vagabonds,16-03-2020,"[give, elderly, disable, dedicate, shopping, h...",Positive,9
3,3802,48754,,16-03-2020,"[food, stock, one, empty, please, panic, enoug...",Positive,15
4,3803,48755,,16-03-2020,"[ready, go, supermarket, outbreak, paranoid, f...",Negative,14
5,3804,48756,"ÃT: 36.319708,-82.363649",16-03-2020,"[news, first, confirm, covid, case, come, coun...",Positive,22
6,3805,48757,"35.926541,-78.753267",16-03-2020,"[cashier, grocery, store, share, insight, prov...",Positive,12
7,3806,48758,Austria,16-03-2020,"[supermarket, today, buy, toilet, paper]",Neutral,5
8,3807,48759,"Atlanta, GA USA",16-03-2020,"[due, covid, retail, store, classroom, open, b...",Positive,20
9,3808,48760,"BHAVNAGAR,GUJRAT",16-03-2020,"[corona, prevention, stop, buy, thing, cash, u...",Negative,19
10,3809,48761,"Makati, Manila",16-03-2020,"[month, crowd, supermarket, restaurant, howeve...",Neutral,16


In [10]:
print("Size of the train dataset: {}".format(df_train.shape))
print("Size of the test dataset: {}".format(df_test.shape))

Size of the train dataset: (41052, 7)
Size of the test dataset: (3792, 7)


In [11]:
def create_corpus(data):
    all_words = []
    for x in data:
        for token in x:
            all_words.append(token)
    
    return set(all_words)

In [12]:
vocab = create_corpus(df_train['OriginalTweet'].values)
print('Vocabulary length: {}'.format(len(vocab)))

Vocabulary length: 12962


In [13]:
def create_dictionaries(vocab):

    word_to_int_dict = {w:i+1 for i, w in enumerate(vocab)}
    int_to_word_dict = {i:w for w, i in word_to_int_dict.items()}

    word_to_int_dict[''] = 0
    int_to_word_dict[0] = ''

    return word_to_int_dict, int_to_word_dict

In [14]:
word_to_int_dict, int_to_word_dict = create_dictionaries(vocab)

In [15]:
def pad_sequence(sequence, target_len=25):

    padded_sequence = sequence.copy()

    length = len(padded_sequence)

    if length > target_len:
        padded_sequence = padded_sequence[:target_len]
    elif length < target_len:
        while length < target_len:
            padded_sequence.append('')
            length += 1
    
    return padded_sequence

In [16]:
df_train['TweetLen'].describe()

count    41052.000000
mean        13.420150
std          6.091071
min          1.000000
25%          8.000000
50%         14.000000
75%         18.000000
max         34.000000
Name: TweetLen, dtype: float64

In [17]:
df_train['OriginalTweet'], df_test['OriginalTweet'] = df_train['OriginalTweet'].apply(pad_sequence), df_test['OriginalTweet'].apply(pad_sequence)

In [19]:
df_train['TweetLen'], df_test['TweetLen'] = \
    df_train['OriginalTweet'].apply(lambda x: len(x)), df_test['OriginalTweet'].apply(lambda x: len(x))
df_train['TweetLen'].describe()

count    41052.0
mean        25.0
std          0.0
min         25.0
25%         25.0
50%         25.0
75%         25.0
max         25.0
Name: TweetLen, dtype: float64

In [20]:
def encode_sequence(text):

    encoded_sequence = np.array([word_to_int_dict[word] if word in word_to_int_dict.keys() else word_to_int_dict[''] for word in text])
    return encoded_sequence

In [23]:
print('Original sequence: {}'.format(df_train['OriginalTweet'].iloc[0]))
print()
print('--'*25)
print('Encoded sequence: {}'.format(encode_sequence(df_train['OriginalTweet'].iloc[0])))

Original sequence: ['advice', 'talk', 'family', 'exchange', 'phone', 'number', 'create', 'contact', 'list', 'phone', 'number', 'school', 'employer', 'chemist', 'set', 'shopping', 'account', 'poss', 'adequate', 'supply', 'regular', 'order', '', '', '']

--------------------------------------------------
Encoded sequence: [11504  7963  2636  2163  5358  5926  6490  6069  4950  5358  5926 10242
  8527  3373 12923    46 12543   367  4531  4138  2567  2137     0     0
     0]


In [31]:
y_mapping_dict = {'Negative':0, 'Neutral':1, 'Positive':2}

x_train = []

for x in df_train['OriginalTweet'].values:
    x_train.append(encode_sequence(x))
x_train = np.array(x_train)

y_train = df_train['Sentiment'].map(y_mapping_dict).values

In [37]:
print('x_train shape: {}'.format(x_train.shape))
print('y_train shape: {}'.format(y_train.shape))

x_train shape: (41052, 25)
y_train shape: (41052,)


In [39]:
valid_idx_border = int(df_test.shape[0] / 2)

x_valid = []
x_test = []

for x in df_test['OriginalTweet'].values[:valid_idx_border]:
    x_valid.append(encode_sequence(x))

for x in df_test['OriginalTweet'].values[valid_idx_border:]:
    x_test.append(encode_sequence(x))

x_valid = np.array(x_valid)
x_test = np.array(x_test)

y_valid = df_test['Sentiment'].map(y_mapping_dict).values[:valid_idx_border]
y_test = df_test['Sentiment'].map(y_mapping_dict).values[valid_idx_border:]

In [42]:
print('x_valid shape: {}'.format(x_valid.shape))
print('y_valid shape: {}'.format(y_valid.shape))
print('-'*50)
print('x_test shape: {}'.format(x_test.shape))
print('y_test shape: {}'.format(y_test.shape))

x_valid shape: (1896, 25)
y_valid shape: (1896,)
--------------------------------------------------
x_test shape: (1896, 25)
y_test shape: (1896,)


In [43]:
print(x_valid[-1])
print('-'*50)
print(x_test[0])

[ 1774  9623 10785   995  3233     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0]
--------------------------------------------------
[ 1734  6453  7130 10396  8286 10396  3073 12698  4387 10396  4095  3043
 10396  5570     0     0     0     0     0     0     0     0     0     0
     0]


In [47]:
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
print(device)
print(torch.cuda.get_device_properties('cuda:0'))

cuda:0
_CudaDeviceProperties(name='GeForce RTX 2070 SUPER', major=7, minor=5, total_memory=8192MB, multi_processor_count=40)


In [99]:
train_x, train_y = torch.tensor(x_train, device=device).long(), torch.tensor(y_train, device=device).long()
valid_x, valid_y = torch.tensor(x_valid, device=device).long(), torch.tensor(y_valid, device=device).long()
test_x, test_y = torch.tensor(x_test, device=device).long(), torch.tensor(y_test, device=device).long()

train_data = TensorDataset(train_x, train_y)
valid_data = TensorDataset(valid_x, valid_y)
test_data = TensorDataset(test_x, test_y)

batch_size = 1

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=True)

In [100]:
data_iter = iter(train_loader)
sample_x, sample_y = next(data_iter)

print('Sample input: ')
print(sample_x)
print(sample_x.size())
print('Sample label: ')
print(sample_y)
print(sample_y.size())

Sample input: 
tensor([[12948,  4534,  6402,  1641,  1456, 11835, 11759,  9635,  6195, 12687,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0]], device='cuda:0')
torch.Size([1, 25])
Sample label: 
tensor([1], device='cuda:0')
torch.Size([1])


In [101]:
class SentimentLSTM(nn.Module):

    def __init__(self, n_vocab, n_embed, n_hidden, n_output, n_layers, drop_p = 0.8):

        super(SentimentLSTM, self).__init__()

        self.n_vocab = n_vocab
        self.n_embed = n_embed
        self.n_hidden = n_hidden
        self.n_output = n_output
        self.n_layers = n_layers
        self.drop_p = drop_p

        self.embedding = nn.Embedding(n_vocab, n_embed)
        self.lstm = nn.LSTM(n_embed, n_hidden, n_layers, batch_first = True, dropout = drop_p)
        self.dropout = nn.Dropout(drop_p)
        self.fc = nn.Linear(n_hidden, n_output)
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, x):

        embedded_words = self.embedding(x)
        lstm_out, h = self.lstm(embedded_words)
        lstm_out = self.dropout(lstm_out)
        fc_out = self.fc(lstm_out)
        softmax_out = self.softmax(fc_out)
        softmax_out = softmax_out.view(batch_size, -1)
        softmax_last_three = softmax_out[:, -3:]

        return softmax_last_three, h
    
    def init_hidden(self, batch_size):

        device = torch.device('cuda:0')
        weights = next(self.parameters()).data
        h = (weights.new(self.n_layers, batch_size,\
        self.n_hidden).zero_().to(device),\
        weights.new(self.n_layers, batch_size,\
        self.n_hidden).zero_().to(device))
        
        return h

In [102]:
n_vocab = len(word_to_int_dict)
n_embed = 100
n_hidden = 50
n_output = 3
n_layers = 2

net = SentimentLSTM(n_vocab, n_embed, n_hidden, n_output, n_layers)
net.to(device)

SentimentLSTM(
  (embedding): Embedding(12963, 100)
  (lstm): LSTM(100, 50, num_layers=2, batch_first=True, dropout=0.8)
  (dropout): Dropout(p=0.8, inplace=False)
  (fc): Linear(in_features=50, out_features=3, bias=True)
  (softmax): LogSoftmax(dim=-1)
)

In [103]:
print_every = 500
step = 0
n_epochs = 3
clip = 5 
criterion = nn.NLLLoss()
optimizer = optim.RMSprop(net.parameters(), lr = 0.001)

In [105]:
for epoch in range(n_epochs):
    
    h = net.init_hidden(batch_size)
    
    for inputs, labels in train_loader:
        step += 1  
        net.zero_grad()
        output, h = net(inputs)
        loss = criterion(output, labels)
        loss.backward()
        nn.utils.clip_grad_norm(net.parameters(), clip)
        optimizer.step()
        
        if (step % print_every) == 0:            
            net.eval()
            valid_losses = []

            for v_inputs, v_labels in valid_loader:
                       
                v_output, v_h = net(v_inputs)
                v_loss = criterion(v_output, v_labels)
                valid_losses.append(v_loss.item())

            print("Epoch: {}/{}".format((epoch+1), n_epochs),
                  "Step: {}".format(step),
                  "Training Loss: {:.4f}".format(loss.item()),
                  "Validation Loss: {:.4f}".format(np.mean(valid_losses)))
            net.train()

  nn.utils.clip_grad_norm(net.parameters(), clip)
Epoch: 1/3 Step: 500 Training Loss: 0.7053 Validation Loss: 1.0346
Epoch: 1/3 Step: 1000 Training Loss: 1.0703 Validation Loss: 1.0208
Epoch: 1/3 Step: 1500 Training Loss: 0.9957 Validation Loss: 1.0091
Epoch: 1/3 Step: 2000 Training Loss: 0.3967 Validation Loss: 0.9939
Epoch: 1/3 Step: 2500 Training Loss: 2.2593 Validation Loss: 0.9781
Epoch: 1/3 Step: 3000 Training Loss: 1.5097 Validation Loss: 0.9621
Epoch: 1/3 Step: 3500 Training Loss: 0.7101 Validation Loss: 0.9673
Epoch: 1/3 Step: 4000 Training Loss: 1.0625 Validation Loss: 0.9778
Epoch: 1/3 Step: 4500 Training Loss: 3.9474 Validation Loss: 0.9677
Epoch: 1/3 Step: 5000 Training Loss: 0.9658 Validation Loss: 0.9664
Epoch: 1/3 Step: 5500 Training Loss: 1.1899 Validation Loss: 0.9711
Epoch: 1/3 Step: 6000 Training Loss: 1.0946 Validation Loss: 0.9776
Epoch: 1/3 Step: 6500 Training Loss: 0.9448 Validation Loss: 0.9807
Epoch: 1/3 Step: 7000 Training Loss: 2.0116 Validation Loss: 0.9778

Next steps will include evaluating the model on test data.