You can write up to 20GB to the current directory (/kaggle/working/), which is preserved as output when you create a version using "Save & Run All".

You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session.

In [1]:
import numpy as np
import pandas as pd
import os
import sys
from tqdm.notebook import tqdm_notebook
import re

# Hook tqdm into pandas so the `progress_apply` calls used on Series further
# down in this notebook are available.
tqdm_notebook.pandas()

In [2]:
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

/kaggle/input/glove-twitter-27b-fast/GloVe-Twitter-27B_Fast/glove.twitter.27B.100d.txt
/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv
/kaggle/input/preprocessed-tweets/preprocessed.csv


In [3]:
%%capture
# Clone the project repository; it presumably provides the `preprocessing`,
# `classification` and `word_embedding` modules imported later — verify.
# %%capture hides the output of every command in this cell.
!git clone https://github.com/lorenzo-mora/NLP-Challenge-Disaster-Tweets.git
# NOTE: %cd changes the kernel's working directory, so relative paths used in
# later cells resolve inside the cloned repository.
%cd /kaggle/working/NLP-Challenge-Disaster-Tweets
# Update the checkout (relevant when the clone above already existed).
!git pull

In [4]:
# Put the cloned repository on the import path so its modules can be imported
# by name in the cells below.
sys.path.insert(1, '/kaggle/working/NLP-Challenge-Disaster-Tweets')

In [42]:
# Load the previously preprocessed tweets; the first row of the file is the header.
data = pd.read_csv(
    "/kaggle/input/preprocessed-tweets/preprocessed.csv",
    header=0,
)

---
## Preprocessing

In [None]:
import preprocessing
from preprocessing import clean_text, stemming, lemmatization, extract_hashtags, extract_tags

In [None]:
# Load the raw competition training set; the first row of the file is the header.
# NOTE: this rebinds `data`, replacing whatever was loaded into it earlier.
data = pd.read_csv(
    "/kaggle/input/nlp-getting-started/train.csv",
    header=0,
)

In [None]:
# Show the raw text of row 150, for comparison with its cleaned version.
data.text[150]

In [None]:
# Clean every tweet with the project's `clean_text`, passing False for both of
# its extra positional arguments, and store the result in a new column.
data["cleaned_text"] = data["text"].progress_apply(
    lambda tweet: clean_text(tweet, False, False)
)

In [None]:
# Show the cleaned text of the same sample row (150).
data.cleaned_text[150]

In [None]:
# Remove rows with empty text after preprocessing.
# `drop=True` discards the old index instead of inserting it as an extra
# "index" column; without it the frame gains a spurious column and re-running
# this cell fails because that column already exists.
data = data.drop(data[data.cleaned_text==''].index).reset_index(drop=True)

In [None]:
%%capture
# Download the NLTK WordNet corpus (output hidden by %%capture).
!python3 -m nltk.downloader wordnet
# Extract the downloaded archive in place; presumably the lemmatizer needs the
# unzipped corpus in this environment — TODO confirm.
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

In [None]:
# Derive stemmed and lemmatized variants of the cleaned text, one column each.
data["stemmed_text"] = data["cleaned_text"].progress_apply(stemming)
data["lemmatized_text"] = data["cleaned_text"].progress_apply(lemmatization)

In [None]:
# Print the stemmed and lemmatized versions of the sample row (150).
for column in ("stemmed_text", "lemmatized_text"):
    print(data[column][150])

In [None]:
# Extract hashtags and tags from the raw (uncleaned) tweet text.
data["hashtags"] = data["text"].progress_apply(extract_hashtags)
data["tags"] = data["text"].progress_apply(extract_tags)

In [None]:
# Print the extracted hashtags and tags of the sample row (150).
for column in ("hashtags", "tags"):
    print(data[column][150])

In [None]:
# data['tokenized_text'] = data.cleaned_text.progress_apply(lambda t: t.split())

In [None]:
# print(data.tokenized_text[150])

In [None]:
# Preview the first 10 rows, including the columns added during preprocessing.
data.head(10)

In [None]:
# Persist the preprocessed frame without writing the index as a column.
# NOTE: the path is relative, so the file is written to the kernel's current
# working directory (changed by the earlier `%cd` into the cloned repository).
data.to_csv('preprocessed.csv',index=False)

---
## Data Splitting

In [None]:
from classification import DataGenerator as dg

In [None]:
# Build the project's DataGenerator from the cleaned text and the labels,
# split it with a fixed random_state, and tokenize with a maximum sequence
# length of 20.
dataObj = dg(data.cleaned_text, data.target)
dataObj.split_data(random_state=42)
dataObj.tokenize_data(max_sequence_length=20)
# Display the keys of the container produced by the steps above.
dataObj.data.keys()

---
# Vectorization

In [6]:
from word_embedding import GloVe



In [7]:
# Preview the first 5 rows of the frame that feeds the vectorization step.
data.head(5)

Unnamed: 0,index,id,keyword,location,text,target,cleaned_text,stemmed_text,lemmatized_text,hashtags,tags
0,0,1,,,Our Deeds are the Reason of this #earthquake M...,1,deeds reason earthquake may allah forgive us,deed reason earthquak mai allah forgiv us,deed reason earthquake may allah forgive u,['earthquake'],[]
1,1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada,forest fire near la rong sask canada,forest fire near la ronge sask canada,[],[]
2,2,5,,,All residents asked to 'shelter in place' are ...,1,residents asked shelter place notified officer...,resid ask shelter place notifi offic evacu she...,resident asked shelter place notified officer ...,[],[]
3,3,6,,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfires evacuation orders cal...,peopl receiv wildfir evacu order california,people receive wildfire evacuation order calif...,['wildfires'],[]
4,4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,got sent photo ruby alaska smoke wildfires pou...,got sent photo rubi alaska smoke wildfir pour ...,got sent photo ruby alaska smoke wildfire pour...,"['alaska', 'wildfires']",[]


In [61]:
# Create the project's GloVe helper from the 100-dimensional Twitter vectors.
# The path is split across adjacent literals, which concatenate to the same string.
gl = GloVe(
    '/kaggle/input/glove-twitter-27b-fast/'
    'GloVe-Twitter-27B_Fast/glove.twitter.27B.100d.txt',
    "glove_1",
)

In [62]:
# Split the cleaned tweets and labels (15% held out) and vectorize them; the
# results are stored in `gl.data`.
# NOTE(review): `random_state=True` passes a boolean where an integer seed is
# the usual convention — confirm how `prepare_data` interprets it.
gl.prepare_data(data.cleaned_text, data.target,
                test_size=0.15, random_state=True, standardize=None)

In [63]:
# List the splits produced by `prepare_data` (raw text, labels and vectorized text).
gl.data.keys()

dict_keys(['x_train', 'x_test', 'y_train', 'y_test', 'vect_x_train', 'vect_x_test'])

In [64]:
# Sanity check: print the first training tweet, its vectorized form, and the
# vocabulary entries for two of the indices to compare against the tweet.
print(gl.data['x_train'][0])
print(gl.data['vect_x_train'][0])
print(gl.vocabulary[5831], gl.vocabulary[745])

deeds reason earthquake may allah forgive us
[5831, 745, 169, 65, 1603, 5578, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
deeds reason


In [65]:
# Model inputs must be the integer-encoded, padded sequences stored under the
# `vect_x_*` keys; the plain `x_*` entries hold the raw tweet strings and
# cannot be fed to an Embedding layer.
X_train_vect = gl.data['vect_x_train']
Y_train = gl.data['y_train']
X_test_vect = gl.data['vect_x_test']
Y_test = gl.data['y_test']

In [67]:
# Shape of the matrix that is later passed as the Embedding layer's weights.
gl.embedding_matrix.shape

(14300, 100)

In [68]:
# Embedding dimensionality, used as `output_dim` of the Embedding layer.
gl.EMBEDDING_DIM

100

---
# Neural Network

In [None]:
import tensorflow as tf
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Embedding, Flatten

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import scikitplot as skplt
import matplotlib.pyplot as plt

# Experiment counter embedded in model and layer names; it is incremented each
# time an existing model is discarded before a rebuild.
n_test = 0

In [None]:
# Discard a previously built model (if any) before building a new one.
# Checking `globals()` instead of evaluating `model` directly keeps this cell
# from raising NameError on a fresh kernel, where no model exists yet.
if "model" in globals():
    n_test += 1
    del model
    tf.keras.backend.clear_session()

In [None]:
# Feed-forward binary classifier on top of frozen, pre-trained GloVe embeddings.
# Layer names carry `n_test` so successive experiments get distinct names.
inputs = Input(shape=(gl.MAX_SEQUENCE_LENGTH, ))
embeddings = Embedding(input_dim=len(gl.vocabulary)+1,
                       output_dim=gl.EMBEDDING_DIM,
                       input_length=gl.MAX_SEQUENCE_LENGTH,
                       trainable=False,  # keep the pre-trained vectors fixed
                       weights=[gl.embedding_matrix],
                       name=f"Embedding_{n_test}1")
flatten = Flatten(name=f"Flatten_{n_test}1")
dense_1 = Dense(1024, activation="relu", name=f"Dense_{n_test}1")
dense_2 = Dense(256, activation="relu", name=f"Dense_{n_test}2")
dense_3 = Dense(16, activation="relu", name=f"Dense_{n_test}3")
dense_4 = Dense(1, activation="sigmoid", name=f"Dense_{n_test}4")

# Wire the layers: embedding -> flatten -> three hidden layers -> sigmoid output.
x = embeddings(inputs)
x = flatten(x)
x = dense_1(x)
x = dense_2(x)
x = dense_3(x)
outputs = dense_4(x)

# Name the model through the public `name` argument rather than assigning the
# private `_name` attribute, which is an implementation detail of Keras.
model = Model(inputs=inputs, outputs=outputs, name=f"Test_Model_{n_test}")
model.summary()

In [None]:
# model = Sequential()
# model.add(Embedding(input_dim=len(gl.vocabulary)+1,
#                     output_dim=gl.EMBEDDING_DIM,
#                     input_length=gl.MAX_SEQUENCE_LENGTH,
#                     trainable=False,
#                     weights=[gl.embedding_matrix],
#                     name=f"Embedding_{n_test}1"))
# model.add(Flatten(name=f"Flatten_{n_test}1"))
# model.add(Dense(1024, activation="relu", name=f"Dense_{n_test}1"))
# model.add(Dense(256, activation="relu", name=f"Dense_{n_test}2"))
# model.add(Dense(16, activation="relu", name=f"Dense_{n_test}3"))
# model.add(Dense(1, activation="sigmoid", name=f"Dense_{n_test}4")) # softmax

# model._name=f"Test_Model_{n_test}"
# model.summary()

In [None]:
# Configure training: Adam optimizer, squared hinge loss, accuracy as the metric.
# NOTE(review): the output layer uses a sigmoid activation, while squared hinge
# is usually paired with outputs in [-1, 1] — confirm this pairing is intended.
model.compile(optimizer="adam",
              loss="squared_hinge",
              metrics=["accuracy"])

In [None]:
# Human-readable class names, paired with the sorted unique label values found
# in the training targets.
target_classes = ["Neutral", "Disaster"]
classes = np.unique(Y_train)
mapping = {label: name for label, name in zip(classes, target_classes)}

In [None]:
# Early stopping on validation loss: it starts at epoch 15, tolerates 5 epochs
# with less than 0.005 improvement, then restores the best weights seen.
es = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0.005,
    patience=5,
    start_from_epoch=15,
    restore_best_weights=True,
)

# NOTE: the held-out (X_test_vect, Y_test) split serves as validation data here
# and is the same split evaluated after training.
model.fit(
    X_train_vect,
    Y_train,
    validation_data=(X_test_vect, Y_test),
    epochs=300,
    batch_size=128,
    callbacks=[es],
)

In [None]:
# Y_preds = model.predict(X_test_vect).argmax(axis=-1) # SOFTMAX
# SIGMOID: the model emits one value per sample; round it to a 0/1 label.
Y_preds = [round(row[0]) for row in model.predict(X_test_vect)]

test_accuracy = accuracy_score(Y_test, Y_preds)
print("Test Accuracy : {}".format(test_accuracy))

print("\nClassification Report : ")
print(classification_report(Y_test, Y_preds, target_names=target_classes))

print("\nConfusion Matrix : ")
print(confusion_matrix(Y_test, Y_preds))

In [None]:
# Translate numeric labels into class names so the plot axes are readable.
true_names = [target_classes[label] for label in Y_test]
predicted_names = [target_classes[label] for label in Y_preds]

# Row-normalized confusion matrix with zero cells hidden.
skplt.metrics.plot_confusion_matrix(
    true_names,
    predicted_names,
    normalize=True,
    title="Confusion Matrix",
    cmap="Blues",
    hide_zeros=True,
    figsize=(5, 5),
)
plt.xticks(rotation=90)

---
## Test

In [None]:
# Add escape character in emoticons
import emot
from emot.emo_unicode import EMOTICONS_EMO

def insert_escape(text: str) -> str:
    """Prefix every regex-special character in ``text`` with a backslash.

    The characters escaped are: ``+ [ \\ ^ { ( * | } . ] ? $ ) /``.
    All other characters are left untouched.

    Args:
        text: The string (e.g. an emoticon) to escape.

    Returns:
        A copy of ``text`` with a backslash inserted before each special
        character. An input without special characters is returned unchanged.
    """
    # Inside a character class `|` is a literal, so no separators are needed
    # between the members.
    chars = r'[+\[\\^{(*|}.\]?$)/]'
    escape = "\\"

    # One pass with re.sub replaces the manual index bookkeeping and the
    # repeated string splicing.
    return re.sub(chars, lambda match: escape + match.group(0), text)

# Escape every emoticon key of EMOTICONS_EMO, then print the escaped forms in
# iteration order.
new_emoticons = [insert_escape(emoticon) for emoticon in EMOTICONS_EMO]
for escaped_emoticon in new_emoticons:
    print(escaped_emoticon)

In [None]:
# Text in Unicode: render every character as a Python escape sequence.
text = "😂 ❤ ☮ 🙂 ❤ ©"
# Code points above U+FFFF (e.g. the emoji) need the 8-digit \UXXXXXXXX form;
# the 4-digit \uXXXX form only covers the Basic Multilingual Plane, so a
# 5-digit "\u1F602" would not be a valid escape.
escaped = ''.join(
    r'\U{:08X}'.format(ord(ch)) if ord(ch) > 0xFFFF else r'\u{:04X}'.format(ord(ch))
    for ch in text
)
escaped