### Imports

In [296]:
from __future__ import print_function, division
import itertools
import os

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix

from tensorflow import keras

from keras.preprocessing import text, sequence
from keras import utils

import torch
from torch.utils.data import Dataset, DataLoader
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Create plotting function

In [297]:
plt.style.use('ggplot')

def plot_history(history):
    acc = history.history['acc']
    val_acc = history.history['val_acc']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    x = range(1, len(acc) + 1)

    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(x, acc, 'b', label='Training acc')
    plt.plot(x, val_acc, 'r', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.plot(x, loss, 'b', label='Training loss')
    plt.plot(x, val_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()

## Read data

In [298]:
train = pd.read_csv('../input/train.csv')
print('Training data shape: ', train.shape)
test = pd.read_csv('../input/test.csv')
print('Testing data shape: ', test.shape)
print(train[train.keyword.isnull()])

Training data shape:  (7613, 5)
Testing data shape:  (3263, 4)
         id keyword location  \
0         1     NaN      NaN   
1         4     NaN      NaN   
2         5     NaN      NaN   
3         6     NaN      NaN   
4         7     NaN      NaN   
...     ...     ...      ...   
7608  10869     NaN      NaN   
7609  10870     NaN      NaN   
7610  10871     NaN      NaN   
7611  10872     NaN      NaN   
7612  10873     NaN      NaN   

                                                   text  target  
0     Our Deeds are the Reason of this #earthquake M...       1  
1                Forest fire near La Ronge Sask. Canada       1  
2     All residents asked to 'shelter in place' are ...       1  
3     13,000 people receive #wildfires evacuation or...       1  
4     Just got sent this photo from Ruby #Alaska as ...       1  
...                                                 ...     ...  
7608  Two giant cranes holding a bridge collapse int...       1  
7609  @aria_ahrary @TheT

## Preprocessing text

In [299]:
# TODO - clean text
# Applying a first round of text cleaning techniques
import re, string
def clean_text(text):
    eyes = "[8:=;]"
    nose = "['`\-]?"
    text = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\), ]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',"<URL>", text)    
    text = re.sub("/"," / ", text)
    text = re.sub('@(\w+)', '<USER>', text)
    text = re.sub('#{eyes}#{nose}[)d]+|[)d]+#{nose}#{eyes}', "<SMILE>", text)
    text = re.sub('#{eyes}#{nose}p+', "<LOLFACE>", text)
    text = re.sub('#{eyes}#{nose}\(+|\)+#{nose}#{eyes}', "<SADFACE>", text)
    text = re.sub('#{eyes}#{nose}[\/|l*]', "<NEUTRALFACE>", text)
    text = re.sub('<3',"<HEART>", text)
    text = re.sub('[-+]?[.\d]*[\d]+[:,.\d]*', "<NUMBER>", text)
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = text.lower()
    #text = re.sub('\[.*?\]', '', text)
    #text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation.replace('<', '').replace('>', '')), ' ', text)
    text = re.sub('\n', '', text)
    #text = re.sub('\w*\d\w*', '', text)    
    
    return text

import nltk
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
def text_preprocessing(text):
   
    tokenizer = nltk.tokenize.TweetTokenizer(strip_handles=True, reduce_len=True)
    
    lemmatizer = nltk.stem.WordNetLemmatizer() 
  
    nopunc = clean_text(text)
    
    tokenized_text = tokenizer.tokenize(nopunc)
    
    remove_stopwords = [w for w in tokenized_text if w not in stopwords.words('english')]
    
    lemmatized = [lemmatizer.lemmatize(i,j[0].lower()) if j[0].lower() in ['a','n','v'] else lemmatizer.lemmatize(i) for i,j in pos_tag(remove_stopwords)]
    
    combined_text = ' '.join(lemmatized)
    return combined_text

print(train.iloc[5680])
print(train)
train['text'] = train['text'].apply(lambda x: text_preprocessing(x))
test['text'] = test['text'].apply(lambda x: text_preprocessing(x))
print(train)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Marci\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Marci\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Marci\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Marci\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


id                                                   8105
keyword                                           rescued
location                                   The Multiverse
text        But now #Skyrim awaits to be rescued...again.
target                                                  0
Name: 5680, dtype: object
         id keyword location  \
0         1     NaN      NaN   
1         4     NaN      NaN   
2         5     NaN      NaN   
3         6     NaN      NaN   
4         7     NaN      NaN   
...     ...     ...      ...   
7608  10869     NaN      NaN   
7609  10870     NaN      NaN   
7610  10871     NaN      NaN   
7611  10872     NaN      NaN   
7612  10873     NaN      NaN   

                                                   text  target  
0     Our Deeds are the Reason of this #earthquake M...       1  
1                Forest fire near La Ronge Sask. Canada       1  
2     All residents asked to 'shelter in place' are ...       1  
3     13,000 people receive #wildfi

### Create embedding matrix

In [300]:
import numpy as np
print(train.iloc[5680])
print(test.iloc[13])
def create_embedding_matrix(filepath, word_index, embedding_dim):
    vocab_size = len(word_index) + 1  # Adding again 1 because of reserved 0 index
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    with open(filepath, encoding="utf8") as f:
        for line in f:
            word, *vector = line.split()
            if word in word_index:
                idx = word_index[word] 
                embedding_matrix[idx] = np.array(
                    vector, dtype=np.float32)[:embedding_dim]

    return embedding_matrix

id                          8105
keyword                  rescued
location          The Multiverse
text        skyrim awaits rescue
target                         0
Name: 5680, dtype: object
id           43
keyword     NaN
location    NaN
text           
Name: 13, dtype: object


### Retrieve embedding matrix

In [301]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train["text"])
vocab_size = len(tokenizer.word_index) + 1
#print(tokenizer.word_index)
embedding_dim = 50
embedding_matrix = create_embedding_matrix(
        '../input/glove.twitter.27B.50d.txt',
    #'../input/glove.6B.50d.txt',
    tokenizer.word_index, embedding_dim)
print(embedding_matrix)

[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.92158002 -0.054151   -1.00610006 ... -0.6692     -0.49597999
   0.18621001]
 [ 0.48737001  0.16796    -0.41657999 ... -0.65139002 -0.064736
   0.75953001]
 ...
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.41821    -0.76490003  0.49147999 ... -0.48903999  0.33109999
   0.74254   ]
 [-0.48291001 -0.0029234  -1.59609997 ...  0.44295999 -0.29177001
   0.47444001]]


In [302]:
nonzero_elements = np.count_nonzero(np.count_nonzero(embedding_matrix, axis=1))
nonzero_elements / vocab_size

0.8542814221331998

In [303]:
print(embedding_matrix.shape)

(11982, 50)


### Tokenize

In [304]:
print(train.iloc[5680])
print(test.iloc[13])

train["text"] = tokenizer.texts_to_sequences(train["text"].values)
test["text"] = tokenizer.texts_to_sequences(test["text"].values)

vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index

print(vocab_size)

id                          8105
keyword                  rescued
location          The Multiverse
text        skyrim awaits rescue
target                         0
Name: 5680, dtype: object
id           43
keyword     NaN
location    NaN
text           
Name: 13, dtype: object
11982


In [305]:
print(type(train.iloc[0].text))

<class 'list'>


In [306]:
test['text'] = test['text'].apply(lambda x: [0] if x == [] else x)


In [307]:
print(test[train[train.astype(str)['text'] == '[]']])

      id keyword location text
0    NaN     NaN      NaN  NaN
1    NaN     NaN      NaN  NaN
2    NaN     NaN      NaN  NaN
3    NaN     NaN      NaN  NaN
4    NaN     NaN      NaN  NaN
...   ..     ...      ...  ...
3258 NaN     NaN      NaN  NaN
3259 NaN     NaN      NaN  NaN
3260 NaN     NaN      NaN  NaN
3261 NaN     NaN      NaN  NaN
3262 NaN     NaN      NaN  NaN

[3263 rows x 4 columns]


### Convert word ids to vectors

In [308]:
'''
def tokens_to_averaged_vectors(tokens, embedding_matrix):
    vectors = np.asarray([np.asarray(embedding_matrix[token]) for token in tokens])
    return vectors.mean(axis=0)
'''

'\ndef tokens_to_averaged_vectors(tokens, embedding_matrix):\n    vectors = np.asarray([np.asarray(embedding_matrix[token]) for token in tokens])\n    return vectors.mean(axis=0)\n'

In [309]:
'''
train_text = train["text"].apply(lambda x: tokens_to_averaged_vectors(x, embedding_matrix))
test_text = test["text"].apply(lambda x: tokens_to_averaged_vectors(x, embedding_matrix))
'''

train_text = train["text"]
test_text = test["text"]

### Split training and validation data

In [310]:
from sklearn.model_selection import train_test_split

target = train["target"]
train_data, validation_data, train_target, validation_target = train_test_split(
   train_text, target, test_size=0.2, random_state=1000)
test_data = test_text

In [311]:
validation_target.shape, train_target.shape


((1523,), (6090,))

### Define Model

In [312]:
import torch.nn as nn
import torch.nn.functional as F
def create_emb_layer(weights_matrix):
    num_embeddings, embedding_dim = weights_matrix.shape
    #emb_layer = nn.EmbeddingBag(num_embeddings, embedding_dim, sparse=True)
    emb_layer = nn.EmbeddingBag.from_pretrained(weights_matrix, freeze=True) 
    
    #if non_trainable:
        #emb_layer.weight.requires_grad = False

    return emb_layer, num_embeddings, embedding_dim

class TwitterClassifier(nn.Module):
    def __init__(self, weights_matrix):
        super().__init__()
        self.embedding, num_embeddings, embedding_dim = create_emb_layer(weights_matrix)
        #self.embedding, num_embeddings, embedding_dim = create_emb_layer(weights_matrix, True)
        self.fc = nn.Linear(embedding_dim, 2)
        self.init_weights()
        
    def forward(self, text):
        embedded = self.embedding(text)
        return self.fc(embedded)

    def init_weights(self):
        initrange = 0.5
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

   

### Create custom Dataset

In [313]:
# custom dataset
class TwitterDataset(Dataset):
    def __init__(self, texts, labels=None, transforms=None):
        self.X = texts
        if labels is not None:
            self.y = np.asarray(labels)
        else:
            self.y = None
        self.transforms = transforms
         
    def __len__(self):
        return (len(self.X))
    
    def __getitem__(self, i):
        data = self.X.iloc[i]
        data = torch.tensor(data)
        
        if self.transforms:
            data = self.transforms(data)
            
        if self.y is not None:
            return (data, self.y[i])
        else:
            return data
        
print(train_data)
train_data = TwitterDataset(train_data, train_target)
validation_data = TwitterDataset(validation_data, validation_target)
test_data = TwitterDataset(test_data)
print(train_data[0])
print(test_data[0])

6101                              [479, 10681, 1765, 332]
3298    [8321, 617, 2618, 835, 289, 796, 2188, 94, 103...
6817    [3, 306, 134, 257, 110, 134, 1120, 2309, 934, ...
3801                   [8782, 205, 1847, 574, 8, 2327, 1]
7088    [5194, 1415, 1306, 1062, 5195, 1633, 1476, 361...
                              ...                        
7419                  [35, 328, 99, 268, 70, 770, 319, 1]
3776    [8, 108, 8759, 2040, 8760, 407, 8, 1149, 924, ...
6215    [204, 10816, 144, 891, 794, 1052, 2, 10817, 20...
4695    [3548, 14, 196, 1325, 97, 1147, 644, 3548, 326...
1459                              [1061, 141, 84, 872, 1]
Name: text, Length: 6090, dtype: object
(tensor([  479, 10681,  1765,   332]), 0)
tensor([ 222, 1614,   54,   29])


### Create model

In [314]:
print(embedding_matrix.shape)

(11982, 50)


In [315]:
model = TwitterClassifier(torch.tensor(embedding_matrix, dtype=torch.float)).to(device)
#model = TwitterClassifier(torch.tensor(embedding_matrix, dtype=torch.float)).to(device)
model


TwitterClassifier(
  (embedding): EmbeddingBag(11982, 50, mode=mean)
  (fc): Linear(in_features=50, out_features=2, bias=True)
)

In [316]:
def generate_batch(batch):
    label = torch.tensor([entry[1] for entry in batch])
    text = [entry[0] for entry in batch]
    offsets = [0] + [len(entry) for entry in text]
    # torch.Tensor.cumsum returns the cumulative sum
    # of elements in the dimension dim.
    # torch.Tensor([1.0, 2.0, 3.0]).cumsum(dim=0)

    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text = torch.cat(text)
    return text, offsets, label

In [317]:
from torch.utils.data import DataLoader

def train_func(sub_train_):

    # Train the model
    train_loss = 0
    train_acc = 0
    # dataloaders
    data = DataLoader(sub_train_, shuffle=True)
    for i, (text, cls) in enumerate(data):
        optimizer.zero_grad()
        text, cls = text.to(device), cls.to(device)
        output = model(text)
        loss = criterion(output, cls)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        train_acc += (output.argmax(1) == cls).sum().item()

    # Adjust the learning rate
    scheduler.step()

    return train_loss / len(sub_train_), train_acc / len(sub_train_)

def validate_func(data_):
    loss = 0
    acc = 0
    data = DataLoader(data_)
    for text, cls in data:
        text, cls = text.to(device), cls.to(device)
        with torch.no_grad():
            output = model(text)
            loss = criterion(output, cls)
            loss += loss.item()
            acc += (output.argmax(1) == cls).sum().item()

    return loss / len(data_), acc / len(data_)

### Train the model

In [318]:
import time
from torch.utils.data.dataset import random_split
N_EPOCHS = 10
min_valid_loss = float('inf')

criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

sub_train_, sub_valid_ = train_data, validation_data

for epoch in range(N_EPOCHS):

    start_time = time.time()
    train_loss, train_acc = train_func(sub_train_)
    valid_loss, valid_acc = validate_func(sub_valid_)

    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')

Epoch: 1  | time in 0 minutes, 6 seconds
	Loss: 0.5550(train)	|	Acc: 72.5%(train)
	Loss: 0.0009(valid)	|	Acc: 77.8%(valid)
Epoch: 2  | time in 0 minutes, 6 seconds
	Loss: 0.5052(train)	|	Acc: 76.5%(train)
	Loss: 0.0008(valid)	|	Acc: 79.1%(valid)
Epoch: 3  | time in 0 minutes, 6 seconds
	Loss: 0.4985(train)	|	Acc: 76.8%(train)
	Loss: 0.0008(valid)	|	Acc: 78.9%(valid)
Epoch: 4  | time in 0 minutes, 6 seconds
	Loss: 0.4960(train)	|	Acc: 77.1%(train)
	Loss: 0.0006(valid)	|	Acc: 79.6%(valid)
Epoch: 5  | time in 0 minutes, 6 seconds
	Loss: 0.4937(train)	|	Acc: 77.3%(train)
	Loss: 0.0005(valid)	|	Acc: 79.4%(valid)
Epoch: 6  | time in 0 minutes, 6 seconds
	Loss: 0.4931(train)	|	Acc: 77.5%(train)
	Loss: 0.0004(valid)	|	Acc: 78.1%(valid)
Epoch: 7  | time in 0 minutes, 6 seconds
	Loss: 0.4926(train)	|	Acc: 77.5%(train)
	Loss: 0.0008(valid)	|	Acc: 78.0%(valid)
Epoch: 8  | time in 0 minutes, 6 seconds
	Loss: 0.4921(train)	|	Acc: 77.2%(train)
	Loss: 0.0007(valid)	|	Acc: 78.6%(valid)
Epoch: 9  | time

### Predict

In [319]:
def predict_func(test_data_):
    loss = 0
    acc = 0
    predictions = []
    data = DataLoader(test_data_)
    for text in data:
        text = text.to(device)
        with torch.no_grad():
            output = model(text)
            predictions.append(output.argmax(1))
            #acc += (output.argmax(1) == cls).sum().item()

    return predictions

predictions = predict_func(test_data)
print(len(predictions))


tensor([[ 222, 1614,   54,   29]])
tensor([[ 514,  255, 1001,  134,  415, 1709,  189]])
tensor([[ 148,    8,  620, 2854, 2172,  704,  442, 1169,  114]])
tensor([[ 465,  252, 5568,  103]])
tensor([[ 172,  582,   21,    2,  301, 1008]])
tensor([[1776,  255]])
tensor([[ 645,   41,  168,   56, 2210, 1263, 2925, 2925]])
tensor([[593]])
tensor([[1041,  493]])
tensor([[100]])
tensor([[  7, 983]])
tensor([[3161]])
tensor([[238]])
tensor([[0]])
tensor([[873]])
tensor([[3295, 2380,  330,  541,  769,   14,    8,  107, 3295, 2380,  330,    1]])
tensor([[   3, 1244, 1102, 1858,  541]])
tensor([[ 6449,     4,    86, 11312,   687,   149,   867,   394,   541,     1]])
tensor([[ 223,    1, 2382]])
tensor([[4378,    4, 1979, 2872,  295,    3, 9387,  295,    3]])
tensor([[3510,   52,  541, 9488,    9]])
tensor([[ 24,  50, 541,   1]])
tensor([[1686,  395,   15,   36,    2,  118,   94, 1876,  293, 9887,  642,    2,
          149,  212,  541,    3]])
tensor([[2088,  567,  112,  149,  541,    2,   19,   77, 

### Create submission

In [320]:
print(predictions[0].numpy())

[1]


In [321]:
def submission(submission_file_path,submission_data):
    sample_submission = pd.read_csv(submission_file_path)
    sample_submission["target"] = [tensor.numpy()[0] for tensor in submission_data]
    print(sample_submission["target"])
    sample_submission.to_csv("submission.csv", index=False)

In [322]:
submission_file_path = "../input/sample_submission.csv"
submission(submission_file_path,predictions)

0       1
1       1
2       1
3       1
4       1
       ..
3258    1
3259    1
3260    1
3261    1
3262    1
Name: target, Length: 3263, dtype: int64
