### Imports

In [26]:
from __future__ import print_function, division
import itertools
import os

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix

from tensorflow import keras

from keras.preprocessing import text, sequence
from keras import utils

import torch
from torch.utils.data import Dataset, DataLoader
BATCH_SIZE = 16
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Create plotting function

In [27]:
plt.style.use('ggplot')

def plot_history(history):
    acc = history.history['acc']
    val_acc = history.history['val_acc']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    x = range(1, len(acc) + 1)

    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(x, acc, 'b', label='Training acc')
    plt.plot(x, val_acc, 'r', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.plot(x, loss, 'b', label='Training loss')
    plt.plot(x, val_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()

## Read data

In [28]:
train = pd.read_csv('../input/train.csv')
print('Training data shape: ', train.shape)
test = pd.read_csv('../input/test.csv')
print('Testing data shape: ', test.shape)
print(train[train.keyword.isnull()])

Training data shape:  (7613, 5)
Testing data shape:  (3263, 4)
         id keyword location  \
0         1     NaN      NaN   
1         4     NaN      NaN   
2         5     NaN      NaN   
3         6     NaN      NaN   
4         7     NaN      NaN   
...     ...     ...      ...   
7608  10869     NaN      NaN   
7609  10870     NaN      NaN   
7610  10871     NaN      NaN   
7611  10872     NaN      NaN   
7612  10873     NaN      NaN   

                                                   text  target  
0     Our Deeds are the Reason of this #earthquake M...       1  
1                Forest fire near La Ronge Sask. Canada       1  
2     All residents asked to 'shelter in place' are ...       1  
3     13,000 people receive #wildfires evacuation or...       1  
4     Just got sent this photo from Ruby #Alaska as ...       1  
...                                                 ...     ...  
7608  Two giant cranes holding a bridge collapse int...       1  
7609  @aria_ahrary @TheT

## Preprocessing text

In [29]:
# TODO - clean text
# Applying a first round of text cleaning techniques
import re, string
def clean_text(text):
    eyes = "[8:=;]"
    nose = "['`\-]?"
    text = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\), ]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',"<URL>", text)    
    text = re.sub("/"," / ", text)
    text = re.sub('@(\w+)', '<USER>', text)
    text = re.sub('#{eyes}#{nose}[)d]+|[)d]+#{nose}#{eyes}', "<SMILE>", text)
    text = re.sub('#{eyes}#{nose}p+', "<LOLFACE>", text)
    text = re.sub('#{eyes}#{nose}\(+|\)+#{nose}#{eyes}', "<SADFACE>", text)
    text = re.sub('#{eyes}#{nose}[\/|l*]', "<NEUTRALFACE>", text)
    text = re.sub('<3',"<HEART>", text)
    text = re.sub('[-+]?[.\d]*[\d]+[:,.\d]*', "<NUMBER>", text)
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = text.lower()
    #text = re.sub('\[.*?\]', '', text)
    #text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation.replace('<', '').replace('>', '')), '', text)
    text = re.sub('\n', '', text)
    #text = re.sub('\w*\d\w*', '', text)    
    
    return text

import nltk
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
def text_preprocessing(text):
   
    tokenizer = nltk.tokenize.TweetTokenizer(strip_handles=True, reduce_len=True)
    
    lemmatizer = nltk.stem.WordNetLemmatizer() 
  
    nopunc = clean_text(text)
    
    tokenized_text = tokenizer.tokenize(nopunc)
    
    remove_stopwords = [w for w in tokenized_text if w not in stopwords.words('english')]
    
    lemmatized = [lemmatizer.lemmatize(i,j[0].lower()) if j[0].lower() in ['a','n','v'] else lemmatizer.lemmatize(i) for i,j in pos_tag(remove_stopwords)]
    
    combined_text = ' '.join(lemmatized)
    return combined_text

print(train)
train['text'] = train['text'].apply(lambda x: text_preprocessing(x))
test['text'] = test['text'].apply(lambda x: text_preprocessing(x))
print(train)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Marci\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Marci\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Marci\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Marci\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


         id keyword location  \
0         1     NaN      NaN   
1         4     NaN      NaN   
2         5     NaN      NaN   
3         6     NaN      NaN   
4         7     NaN      NaN   
...     ...     ...      ...   
7608  10869     NaN      NaN   
7609  10870     NaN      NaN   
7610  10871     NaN      NaN   
7611  10872     NaN      NaN   
7612  10873     NaN      NaN   

                                                   text  target  
0     Our Deeds are the Reason of this #earthquake M...       1  
1                Forest fire near La Ronge Sask. Canada       1  
2     All residents asked to 'shelter in place' are ...       1  
3     13,000 people receive #wildfires evacuation or...       1  
4     Just got sent this photo from Ruby #Alaska as ...       1  
...                                                 ...     ...  
7608  Two giant cranes holding a bridge collapse int...       1  
7609  @aria_ahrary @TheTawniest The out of control w...       1  
7610  M1.94 [01:04 UT

### Create embedding matrix

In [30]:
import numpy as np

def create_embedding_matrix(filepath, word_index, embedding_dim):
    vocab_size = len(word_index) + 1  # Adding again 1 because of reserved 0 index
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    with open(filepath, encoding="utf8") as f:
        for line in f:
            word, *vector = line.split()
            if word in word_index:
                idx = word_index[word] 
                embedding_matrix[idx] = np.array(
                    vector, dtype=np.float32)[:embedding_dim]

    return embedding_matrix

### Retrieve embedding matrix

In [31]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train["text"])
vocab_size = len(tokenizer.word_index) + 1
#print(tokenizer.word_index)
embedding_dim = 50
embedding_matrix = create_embedding_matrix(
        '../input/glove.twitter.27B.50d.txt',
    #'../input/glove.6B.50d.txt',
    tokenizer.word_index, embedding_dim)
print(embedding_matrix)

[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.92158002 -0.054151   -1.00610006 ... -0.6692     -0.49597999
   0.18621001]
 [ 0.48737001  0.16796    -0.41657999 ... -0.65139002 -0.064736
   0.75953001]
 ...
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.41821    -0.76490003  0.49147999 ... -0.48903999  0.33109999
   0.74254   ]
 [-0.48291001 -0.0029234  -1.59609997 ...  0.44295999 -0.29177001
   0.47444001]]


In [32]:
nonzero_elements = np.count_nonzero(np.count_nonzero(embedding_matrix, axis=1))
nonzero_elements / vocab_size

0.8111826930724568

In [33]:
print(embedding_matrix.shape)

(12573, 50)


### Tokenize

In [34]:
train["text"] = tokenizer.texts_to_sequences(train["text"].values)
test["text"] = tokenizer.texts_to_sequences(test["text"].values)

vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index

print(train.head())
print(vocab_size)
#print(tokenizer.word_index)

   id keyword location                                               text  \
0   1     NaN      NaN               [3622, 462, 249, 82, 1328, 2842, 12]   
1   4     NaN      NaN                           [145, 8, 184, 463, 1093]   
2   5     NaN      NaN  [1460, 513, 1591, 402, 328, 202, 1591, 402, 30...   
3   6     NaN      NaN                   [2, 16, 2843, 101, 202, 307, 47]   
4   7     NaN      NaN         [6, 2361, 139, 1592, 198, 3623, 2362, 125]   

   target  
0       1  
1       1  
2       1  
3       1  
4       1  
12573


### Pad sequences

In [35]:
from keras.preprocessing.sequence import pad_sequences

maxlen = 100
#train_text = pad_sequences(train["text"].values, padding='post', maxlen=maxlen)
#test_text = pad_sequences(test["text"].values, padding='post', maxlen=maxlen)
target = train["target"].values

In [36]:
#print(train_text)

### Split training and validation data

In [37]:
from sklearn.model_selection import train_test_split
train_text = train["text"]
test_text = test["text"]
target = train["target"]
train_data, validation_data, train_target, validation_target = train_test_split(
   train_text, target, test_size=0.2, random_state=1000)


In [38]:
validation_target.shape


(1523,)

In [39]:
import bcolz, pickle
glove_path = "../input"
words = []
idx = 0
word2idx = {}
vectors = bcolz.carray(np.zeros(1), rootdir=f'{glove_path}/6B.50.dat', mode='w')

with open(f'{glove_path}/glove.6B.50d.txt', 'rb') as f:
    for l in f:
        line = l.decode().split()
        word = line[0]
        words.append(word)
        word2idx[word] = idx
        idx += 1
        vect = np.array(line[1:]).astype(np.float)
        vectors.append(vect)
    
vectors = bcolz.carray(vectors[1:].reshape((-1, 50)), rootdir=f'{glove_path}/6B.50.dat', mode='w')
vectors.flush()
pickle.dump(words, open(f'{glove_path}/6B.50_words.pkl', 'wb'))
pickle.dump(word2idx, open(f'{glove_path}/6B.50_idx.pkl', 'wb'))

In [40]:
vectors = bcolz.open(f'{glove_path}/6B.50.dat')[:]
words = pickle.load(open(f'{glove_path}/6B.50_words.pkl', 'rb'))
word2idx = pickle.load(open(f'{glove_path}/6B.50_idx.pkl', 'rb'))

glove = {w: vectors[word2idx[w]] for w in words}

In [41]:
print(glove['fire']-glove['burn'])

[ 0.40047  -0.270561 -0.56429   0.79243   0.170498 -0.091421 -0.1837
  0.41076  -0.47856  -1.35079  -0.349376 -0.5061   -0.62094   0.01979
 -0.62191  -0.712412 -0.650148  0.522501 -0.90197  -0.02485   0.223041
  0.86639  -0.63049  -0.15719  -0.32965  -0.8433    0.50153   0.24625
  0.0133    0.45564   1.3535   -0.595486 -0.48492  -0.25933   0.342529
  0.0483   -0.04607  -0.72569  -0.64634   0.50573  -0.1793    0.089243
  0.22474  -0.38764  -0.18515  -0.40129  -0.389448  0.67824   0.417432
 -0.15132 ]


In [42]:
target_vocab = tokenizer.word_index
matrix_len = len(target_vocab)
emb_dim = 50
weights_matrix = np.zeros((matrix_len, 50))
words_found = 0

for i, word in enumerate(target_vocab):
    try: 
        weights_matrix[i] = glove[word]
        words_found += 1
    except KeyError:
        weights_matrix[i] = np.random.normal(scale=0.6, size=(emb_dim, ))
print(words_found/matrix_len)
print(matrix_len)

0.7714762965319758
12572


### Define Model

In [43]:
import torch.nn as nn
import torch.nn.functional as F
def create_emb_layer(weights_matrix, non_trainable=False):
    num_embeddings, embedding_dim = weights_matrix.shape
    emb_layer = nn.EmbeddingBag(num_embeddings, embedding_dim, sparse=True)
    #emb_layer = nn.EmbeddingBag.from_pretrained(weights_matrix) 
    
    if non_trainable:
        emb_layer.weight.requires_grad = False

    return emb_layer, num_embeddings, embedding_dim

class TwitterClassifier(nn.Module):
    def __init__(self, weights_matrix):
        super().__init__()
        self.embedding, num_embeddings, embedding_dim = create_emb_layer(weights_matrix)
        #self.embedding, num_embeddings, embedding_dim = create_emb_layer(weights_matrix, True)
        self.fc = nn.Linear(embedding_dim, 2)
        self.init_weights()
        
    def forward(self, text, offsets):
        embedded = self.embedding(text,offsets)
        return self.fc(embedded)

    def init_weights(self):
        initrange = 0.5
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

   

### Create custom Dataset

In [44]:
# custom dataset
class TwitterDataset(Dataset):
    def __init__(self, texts, labels=None, transforms=None):
        self.X = texts
        self.y = np.asarray(labels)
        self.transforms = transforms
         
    def __len__(self):
        return (len(self.X))
    
    def __getitem__(self, i):
        data = self.X.iloc[i]
        data = torch.tensor(data, dtype=torch.long)
        
        if self.transforms:
            data = self.transforms(data)
            
        if self.y is not None:
            return (data, self.y[i])
        else:
            return data
        
print(train_data)
train_data = TwitterDataset(train_data, train_target)
validation_data = TwitterDataset(validation_data, validation_target)

print(train_data[0])

6101                                     [530, 1751, 320]
3298    [631, 2598, 832, 275, 788, 2181, 105, 1038, 7,...
6817    [3, 295, 136, 243, 64, 113, 136, 1120, 2298, 9...
3801                         [203, 1834, 571, 8, 2317, 1]
7088    [1415, 1308, 1067, 1630, 1476, 3617, 1084, 121...
                              ...                        
7419                 [36, 328, 100, 258, 73, 767, 319, 1]
3776    [8, 108, 2033, 404, 8, 1146, 966, 940, 56, 2, ...
6215         [198, 140, 884, 786, 1057, 2, 21, 1037, 131]
4695    [3552, 17, 191, 1321, 96, 1227, 668, 3552, 314...
1459                              [1066, 151, 89, 866, 1]
Name: text, Length: 6090, dtype: object
(tensor([ 530, 1751,  320]), 0)


### Create model

In [45]:
print(weights_matrix.shape)

(12572, 50)


In [46]:
model = TwitterClassifier(torch.tensor(weights_matrix, dtype=torch.float)).to(device)
#model = TwitterClassifier(torch.tensor(embedding_matrix, dtype=torch.float)).to(device)
model


TwitterClassifier(
  (embedding): EmbeddingBag(12572, 50, mode=mean)
  (fc): Linear(in_features=50, out_features=2, bias=True)
)

In [47]:
def generate_batch(batch):
    label = torch.tensor([entry[1] for entry in batch])
    text = [entry[0] for entry in batch]
    offsets = [0] + [len(entry) for entry in text]
    # torch.Tensor.cumsum returns the cumulative sum
    # of elements in the dimension dim.
    # torch.Tensor([1.0, 2.0, 3.0]).cumsum(dim=0)

    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text = torch.cat(text)
    return text, offsets, label

In [48]:
from torch.utils.data import DataLoader

def train_func(sub_train_):

    # Train the model
    train_loss = 0
    train_acc = 0
    # dataloaders
    data = DataLoader(sub_train_, batch_size=BATCH_SIZE, shuffle=True,
                      collate_fn=generate_batch)
    for i, (text, offsets, cls) in enumerate(data):
        optimizer.zero_grad()
        text, offsets, cls = text.to(device), offsets.to(device), cls.to(device)
        output = model(text, offsets)
        loss = criterion(output, cls)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        train_acc += (output.argmax(1) == cls).sum().item()

    # Adjust the learning rate
    scheduler.step()

    return train_loss / len(sub_train_), train_acc / len(sub_train_)

def test(data_):
    loss = 0
    acc = 0
    data = DataLoader(data_, batch_size=BATCH_SIZE, collate_fn=generate_batch)
    for text, offsets, cls in data:
        text, offsets, cls = text.to(device), offsets.to(device), cls.to(device)
        with torch.no_grad():
            output = model(text, offsets)
            loss = criterion(output, cls)
            loss += loss.item()
            acc += (output.argmax(1) == cls).sum().item()

    return loss / len(data_), acc / len(data_)

### Train the model

In [49]:
import time
from torch.utils.data.dataset import random_split
N_EPOCHS = 10
min_valid_loss = float('inf')

criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

sub_train_, sub_valid_ = train_data, validation_data

for epoch in range(N_EPOCHS):

    start_time = time.time()
    train_loss, train_acc = train_func(sub_train_)
    valid_loss, valid_acc = test(sub_valid_)

    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')

Epoch: 1  | time in 0 minutes, 1 seconds
	Loss: 0.0507(train)	|	Acc: 48.5%(train)
	Loss: 0.0011(valid)	|	Acc: 49.4%(valid)
Epoch: 2  | time in 0 minutes, 1 seconds
	Loss: 0.0457(train)	|	Acc: 52.4%(train)
	Loss: 0.0011(valid)	|	Acc: 52.4%(valid)
Epoch: 3  | time in 0 minutes, 1 seconds
	Loss: 0.0438(train)	|	Acc: 54.7%(train)
	Loss: 0.0010(valid)	|	Acc: 55.5%(valid)
Epoch: 4  | time in 0 minutes, 1 seconds
	Loss: 0.0428(train)	|	Acc: 56.2%(train)
	Loss: 0.0010(valid)	|	Acc: 57.3%(valid)
Epoch: 5  | time in 0 minutes, 1 seconds
	Loss: 0.0423(train)	|	Acc: 57.7%(train)
	Loss: 0.0009(valid)	|	Acc: 58.6%(valid)
Epoch: 6  | time in 0 minutes, 1 seconds
	Loss: 0.0419(train)	|	Acc: 58.4%(train)
	Loss: 0.0009(valid)	|	Acc: 58.5%(valid)
Epoch: 7  | time in 0 minutes, 1 seconds
	Loss: 0.0417(train)	|	Acc: 58.8%(train)
	Loss: 0.0009(valid)	|	Acc: 58.9%(valid)
Epoch: 8  | time in 0 minutes, 1 seconds
	Loss: 0.0415(train)	|	Acc: 59.3%(train)
	Loss: 0.0009(valid)	|	Acc: 58.9%(valid)
Epoch: 9  | time

In [50]:
import re
from torchtext.data.utils import ngrams_iterator
from torchtext.data.utils import get_tokenizer

def predict(text, model, vocab, ngrams):
    ##tokenizer = get_tokenizer("basic_english")
    print(text)
    text = text_preprocessing(text)
    print(text)
    text = tokenizer.texts_to_sequences([text])
    print(text)
    with torch.no_grad():
        text = torch.tensor(text[0])
        output = model(text, torch.tensor([0]))
        return output.argmax(1).item()

ex_text_str = "xyz"
vocab = target_vocab #train_dataset.get_vocab()
model = model.to("cpu")

print("This is a %s news" % predict(ex_text_str, model, vocab, 2))

xyz
xyz
[[]]


RuntimeError: Expected tensor for argument #1 'indices' to have scalar type Long; but got torch.FloatTensor instead (while checking arguments for embedding_bag)

### Predict and create submission

In [None]:
submission_data = [1 if p > 0.5 else 0 for p in prediction]
print(len(submission_data))

In [None]:
def submission(submission_file_path,submission_data):
    sample_submission = pd.read_csv(submission_file_path)
    prediction = model.predict(test_text)
    sample_submission["target"] = submission_data
    sample_submission.to_csv("submission.csv", index=False)

In [None]:
submission_file_path = "../input/sample_submission.csv"
submission(submission_file_path,submission_data)

### TODO Impute missing keywords

In [None]:
from sklearn.impute import SimpleImputer

print(X_valid[X_valid["keyword"].isnull()])
print(X_valid["keyword"].shape)

# replace missing keyword with "missing_value"
imputer = SimpleImputer(missing_values=np.NaN, strategy='constant')
X_train["keyword"] = imputer.fit_transform(X_train["keyword"].to_numpy().reshape(-1, 1))
X_valid["keyword"] = imputer.transform(X_valid["keyword"].to_numpy().reshape(-1, 1))

print(X_valid[X_valid["keyword"].isnull()])
print(X_valid[X_valid["id"]  == 10864])



### One-hot encode keyword

In [None]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse=False)

asd = encoder.fit_transform(X_train["keyword"].to_numpy().reshape(-1, 1))
#encoder.transform(X_valid["keyword"].to_numpy().reshape(-1, 1))

print(asd)


In [None]:
print(asd[0])