In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns

import missingno as msno

from nltk.corpus import stopwords
from nltk.corpus import words
from nltk.tokenize import word_tokenize  
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer()
import re
import string

import spacy
spacy_en_model = spacy.load('en_core_web_lg')

from sklearn import preprocessing

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory recursively and print the full path
# of every file found, so the available dataset files show up in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import unidecode

In [None]:
# Load the competition's training and test tweets.
training_set = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test_set = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

# Stack both sets so every preprocessing step is applied to them identically.
# ignore_index=True gives the combined frame a unique 0..n-1 index; without it
# the row labels of the two files overlap, which makes label-based lookups and
# index-aligned assignments on all_data ambiguous.
all_data = pd.concat([training_set, test_set], ignore_index=True)

# Preprocessing and feature engineering:

## Drop columns with missing values:

In [None]:
# Drop the two columns that are not used by the rest of the notebook.
# errors='ignore' keeps this cell re-runnable: on a second run the columns are
# already gone and a plain drop() would raise KeyError.
all_data = all_data.drop(columns=['keyword', 'location'], errors='ignore')

## Clean text:

In [None]:
# Widen the column display so whole tweets are readable, then preview 20 rows.
pd.set_option('display.max_colwidth', 150)
all_data.head(20)

In [None]:
# Contraction rules, applied in order on lower-cased text. The irregular forms
# come first: once the generic "n't" rule has run, "won't" and "can't" no longer
# exist in the text and their dedicated rules could never match.
# NOTE: "'s" is always expanded to " is", which also rewrites possessives.
_CONTRACTION_RULES = (
    (r"won\'t", "will not"),
    (r"can\'t", "cannot"),
    (r"n\'t", " not"),
    (r"\'re", " are"),
    (r"\'s", " is"),
    (r"\'d", " would"),
    (r"\'ll", " will"),
    (r"\'t", " not"),
    (r"\'ve", " have"),
    (r"\'m", " am"),
)

# Three or more repetitions of the same word character, e.g. the "ooo" in "sooo".
_ELONGATION = re.compile(r'(\w)\1{2,}')

# NLTK English word list as a set; filled on first use so it is built only once
# instead of once per elongated word.
_english_vocab_cache = None


def _english_vocab():
    """Return the NLTK `words` corpus as a set, building it on first use."""
    global _english_vocab_cache
    if _english_vocab_cache is None:
        _english_vocab_cache = set(words.words())
    return _english_vocab_cache


def _shorten_elongated(word):
    """Squeeze runs of 3+ identical characters if that yields a known word.

    The run is first reduced to two characters, then to one; the first variant
    found in the English word list is returned. If neither is a known word (or
    the word has no such run) the word is returned unchanged.
    """
    if not _ELONGATION.search(word):
        return word
    vocab = _english_vocab()
    for replacement in (r'\1\1', r'\1'):
        candidate = _ELONGATION.sub(replacement, word)
        if candidate in vocab:
            return candidate
    return word


def custom_preprocessor(text):
    """Clean one raw tweet and return it as lower-case, space-separated words.

    Steps, in order: strip URLs and @mentions, split camelCase, decode '&amp;',
    lower-case, expand English contractions, transliterate accented characters
    to ASCII, replace every non-word character with a space, collapse
    whitespace, and shorten elongated words (e.g. "sooo") when the shortened
    form is an English word.

    Parameters
    ----------
    text : str
        The raw tweet.

    Returns
    -------
    str
        The cleaned text, with single spaces between words and no leading or
        trailing whitespace.
    """
    # Remove urls
    text = re.sub(r'https?:\S+', '', text)

    # Remove '@name'. This runs before the camelCase split so that a handle such
    # as '@RockyFire' is removed as a whole, and it does not require trailing
    # whitespace so a mention at the very end of the tweet is removed as well.
    text = re.sub(r'@\S+', ' ', text)

    # eg. RockyFire --> Rocky Fire
    text = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', text)

    # Replace '&amp;' with '&'
    text = re.sub(r'&amp;', '&', text)

    # Make text lowercase
    text = text.lower()

    # Expand contractions
    for pattern, replacement in _CONTRACTION_RULES:
        text = re.sub(pattern, replacement, text)

    # Expand abbreviations
    # TO DO

    # Remove accents
    text = unidecode.unidecode(text)

    # Remove non-word characters. Line breaks, punctuation and any remaining
    # symbols such as emojis are all non-word characters, so no separate
    # handling is needed for them.
    text = re.sub(r'\W', ' ', text)

    # split() collapses repeated whitespace and drops leading/trailing blanks;
    # each remaining word is then checked for elongated characters.
    return ' '.join(_shorten_elongated(word) for word in text.split())

In [None]:
# Scratch input for a manual check of the preprocessor: a single accented character.
tweet = 'û'

In [None]:
# Run the preprocessor on the scratch input to inspect the cleaned result.
custom_preprocessor(tweet)

In [None]:
# Clean every tweet; the function is passed directly, no wrapper lambda needed.
all_data['text_cleaned'] = all_data['text'].apply(custom_preprocessor)

In [None]:
# Preview the first 20 rows, now including the cleaned text column.
all_data.head(20)

## Optional extra preprocessing:

In [None]:
# NLTK's English stop-word list. 'not' and 'can' are also in scope of the filter
# below but are deliberately kept in the text.
stop_words = set(stopwords.words('english'))


def remove_stop_words(text):
    """Tokenize `text` and drop English stop words, keeping 'not' and 'can'.

    Returns the surviving tokens joined by single spaces.
    """
    kept = []
    for w in word_tokenize(text):
        if w in ['not', 'can'] or w not in stop_words:
            kept.append(w)
    return ' '.join(kept)


all_data['text_no_sw'] = all_data['text_cleaned'].apply(remove_stop_words)

In [None]:
# Lemmatize text
def lem(text):
    """Return `text` with every token replaced by its spaCy lemma.

    The text is run through `spacy_en_model`; tokens whose lemma is the literal
    string '-PRON-' are dropped (presumably the pronoun placeholder emitted by
    the installed spaCy version -- TODO confirm). The remaining lemmas are
    joined with single spaces.
    """
    doc = spacy_en_model(text)
    return ' '.join(tok.lemma_ for tok in doc if tok.lemma_ != '-PRON-')

In [None]:
# Spot-check the lemmatizer on one already-cleaned sentence.
lem('our deeds are the reason of this earthquake may allah forgive us all')

In [None]:
# Lemmatized variants of both the cleaned text and the stop-word-free text.
all_data['text_lemmatized'] = all_data['text_cleaned'].apply(lem)
all_data['text_no_sw_lemmatized'] = all_data['text_no_sw'].apply(lem)

In [None]:
# Preview the first 20 rows with all derived text columns.
all_data.head(20)

## Training-test split:

In [None]:
# Select type of preprocessed text to use
text_type = 'text_no_sw'

# Rows with a target are the labelled ones; rows without a target are the ones
# predictions are made for later on.
has_target = all_data['target'].notnull()

xtrain = all_data.loc[has_target, text_type]
ytrain = all_data.loc[has_target, 'target']
# NOTE: despite its name, x_val holds the unlabelled rows to predict on.
x_val = all_data.loc[~has_target, text_type]

In [None]:
from sklearn.model_selection import train_test_split

# Split from the full labelled data rather than from the current xtrain/ytrain:
# the outputs reuse those names, so splitting them in place would shrink the
# training set by another 10% every time this cell is re-run.
labelled = all_data[all_data['target'].notnull()]

# Hold out 10% of the labelled tweets for validation during training.
# random_state pins the shuffle so the split is the same on every run.
xtrain, xtest, ytrain, ytest = train_test_split(
    labelled[text_type],
    labelled['target'],
    test_size=0.1,
    random_state=42,
)

## Vectorize text:

In [None]:
# Create word index: map every word of the labelled tweets to an integer id and
# turn each tweet into a fixed-length sequence of those ids.
from keras.preprocessing import sequence, text

# num_words=None: no cap on the vocabulary size
token = text.Tokenizer(num_words=None)
# every sequence is padded/truncated to this many ids
max_len = 70

# NOTE(review): the vocabulary is built from the training AND the held-out split
# (xtest) -- confirm this is intended.
token.fit_on_texts(list(xtrain) + list(xtest))

xtrain_seq = token.texts_to_sequences(xtrain)
xtest_seq = token.texts_to_sequences(xtest)

# zero pad the sequences
xtrain_pad = sequence.pad_sequences(xtrain_seq, maxlen=max_len)
xtest_pad = sequence.pad_sequences(xtest_seq, maxlen=max_len)

# word -> integer id mapping learned by the tokenizer
word_index = token.word_index

In [None]:
# Preview the first 20 entries of the word -> integer id mapping instead of
# dumping the entire vocabulary into the cell output.
dict(list(word_index.items())[:20])

In [None]:
# Create an embedding matrix for the words we have in the dataset.
# Row i holds the 300-dimensional spaCy vector of the word whose tokenizer id is
# i; the "+ 1" leaves room for id 0, which no word in word_index is expected to
# use (presumably the padding id -- TODO confirm).
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    # Look the vector up directly in the model's vocabulary. This avoids running
    # the whole spaCy pipeline once per word, and it uses the vector of the word
    # itself rather than that of its first sub-token when spaCy would split the
    # word into several tokens. Words without a vector get an all-zero row.
    embedding_matrix[i] = spacy_en_model.vocab.get_vector(word)

np.savetxt("embedding_matrix.csv", embedding_matrix, delimiter=",")

In [None]:
# Read the embedding matrix back from the CSV file written with np.savetxt in the
# previous cell.
embedding_matrix = np.loadtxt('embedding_matrix.csv', delimiter=",")

# Model selection and training:

In [None]:
from keras.utils import np_utils

# One-hot encode the labels for the neural network, so that each label becomes a
# row with one column per class, matching the two-unit softmax output layer.
ytrain_enc = np_utils.to_categorical(ytrain)
ytest_enc = np_utils.to_categorical(ytest)

In [None]:
from keras.models import Sequential

from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization

from keras.callbacks import EarlyStopping

In [None]:
# A simple LSTM classifier on top of frozen pre-trained word vectors:
# embedding -> LSTM -> dropout -> one dense layer -> softmax over the 2 classes.
model = Sequential([
    # Embedding weights come from embedding_matrix and are not updated in training.
    Embedding(
        len(word_index) + 1,
        300,
        weights=[embedding_matrix],
        input_length=max_len,
        trainable=False,
    ),
    LSTM(300),
    Dropout(0.2),
    Dense(2),
    Activation('softmax'),
])

In [None]:
import keras.optimizers
from keras.optimizers import Adam

# Adam optimizer with a small learning rate.
opt = Adam(learning_rate=0.0002)

# The network ends in a two-unit softmax and is trained on one-hot labels, so
# categorical cross-entropy is the loss that matches the output layer. (For
# exactly two classes it yields the same loss value as binary cross-entropy,
# but it stays correct if the number of classes ever changes.)
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

In [None]:
# Early stopping callback used when fitting the model: it watches the validation
# loss ('val_loss') with a patience of 3 epochs.
# NOTE(review): the fit call trains for only 4 epochs, so with patience=3 this
# callback has almost no room to stop training early -- confirm the intended
# epochs/patience combination.
earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')

In [None]:
# Train on the padded training sequences and evaluate on the held-out split
# after every epoch; the returned object keeps the per-epoch metrics.
history = model.fit(
    xtrain_pad,
    ytrain_enc,
    validation_data=(xtest_pad, ytest_enc),
    epochs=4,
    batch_size=64,
    callbacks=[earlystop],
    verbose=1,
)

In [None]:
# Per-epoch metrics recorded during training, one row per epoch.
history_df = pd.DataFrame(history.history)

# Draw the three curves on one labelled axes. Plotting the columns together
# gives each line a legend entry, so the curves can be told apart.
fig, ax = plt.subplots()
history_df[['loss', 'val_loss', 'val_accuracy']].plot(ax=ax)
ax.set(title='Training history', xlabel='epoch', ylabel='value')
plt.show()

## Make predictions:

In [None]:
# Encode the rows to predict on with the already-fitted tokenizer and pad them to
# the same max_len as the training sequences.
x_val_seq = token.texts_to_sequences(x_val)
x_val_pad = sequence.pad_sequences(x_val_seq, maxlen=max_len)

In [None]:
# Sequential.predict_classes is not available in newer Keras/TensorFlow
# releases. Taking the argmax over the softmax outputs yields the same class
# indices and works across versions.
prediction = np.argmax(model.predict(x_val_pad), axis=-1)

In [None]:
# Assemble the submission (one predicted target per test-set id) and write it
# to the working directory.
output = pd.DataFrame({'id': test_set['id'], 'target': prediction})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")