In [0]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
import nltk
from keras.models import Sequential 
from keras.layers import Embedding, LSTM, Dense, SpatialDropout1D, BatchNormalization
from keras import regularizers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [0]:
# Locations of the input CSV files, relative to the notebook's working directory.
# The two training files are the defaults of load_train_data.
train_data_path = "../train_data.csv"
train_label_path = "../train_label.csv"
# The test file is read by make_predictions.
test_data_path = "../test_data.csv"

In [0]:
def load_train_data(train_data_path=train_data_path, train_label_path=train_label_path):
    """Read the training rows and their labels and join them into one frame.

    Args:
        train_data_path: CSV file with the training rows; needs an "id" column.
        train_label_path: CSV file with the labels; needs an "id" column.

    Returns:
        DataFrame with the inner join of both files on "id", where every
        missing value has been replaced by the string "No text".
    """
    rows = pd.read_csv(train_data_path)
    labels = pd.read_csv(train_label_path)
    merged = rows.merge(labels, on="id")
    return merged.fillna("No text")

In [0]:
# Build the merged training frame from the default CSV paths.
df = load_train_data()

### Data is loaded, now off to cleaning..

In [0]:
# Compiled once instead of on every call. Raw strings keep the backslash
# escapes intact for the regex engine without Python's invalid-escape warning.
_REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
_BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop-word set, reading the corpus on first use only."""
    if 'english' not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORDS_CACHE['english']


def clean_text(text):
    """Normalise one document for tokenization.

    Steps, in order: lower-case the text, turn the punctuation matched by
    ``_REPLACE_BY_SPACE_RE`` into spaces, delete every character that is not
    a digit, a lower-case ASCII letter, a space, ``#``, ``+`` or ``_``,
    delete every ``x`` character, and drop English stop words.

    Args:
        text: The raw document as a ``str``.

    Returns:
        The cleaned document, with the remaining words joined by single spaces.
    """
    text = text.lower()
    text = _REPLACE_BY_SPACE_RE.sub(' ', text)
    text = _BAD_SYMBOLS_RE.sub('', text)
    # NOTE(review): this removes *every* "x", including those inside ordinary
    # words ("text" -> "tet"). Kept to preserve existing behaviour; confirm
    # that it is actually wanted for this dataset.
    text = text.replace('x', '')
    stop_words = _english_stopwords()
    return ' '.join(word for word in text.split() if word not in stop_words)

In [0]:
# Clean every training text, then strip all runs of digits.
# regex=True must be explicit: from pandas 2.0 on, Series.str.replace treats
# the pattern as a literal string by default, which would leave digits in place.
df["Corrected_text"] = df.Corrected_text.apply(clean_text).str.replace(r'\d+', '', regex=True)

### Now we're ready to tokenize the text and prepare the data for training



In [0]:
def tokenize_text(MAX_NB_WORDS = 50000, EMBEDDING_DIM = 100, MAX_SEQUENCE_LENGTH = 250):
    """Fit a tokenizer on the training texts and build the model inputs.

    Reads the module-level ``df`` (columns ``Corrected_text`` and ``Humour``).

    Args:
        MAX_NB_WORDS: Passed to the tokenizer as ``num_words``.
        EMBEDDING_DIM: Not used by this function; kept so existing calls
            that pass it keep working.
        MAX_SEQUENCE_LENGTH: Length every sequence is padded or truncated to.

    Returns:
        ``(X, Y)`` where ``X`` is the padded integer sequence matrix and ``Y``
        is the one-hot encoding of ``df.Humour`` produced by ``pd.get_dummies``.

    Side effects:
        Stores the fitted tokenizer in the module-level name ``tokenizer`` and
        prints the vocabulary size.
    """
    # The fitted tokenizer is published at module level because prediction
    # must encode the test texts with the same vocabulary; the return value
    # stays (X, Y) so existing callers are unaffected.
    global tokenizer
    # "\\]" spells the same three characters ([, \, ]) as before, without
    # relying on an invalid escape sequence.
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~', lower=True)
    texts = df.Corrected_text.values
    tokenizer.fit_on_texts(texts)

    print('Total of %s unique tokens found.' % len(tokenizer.word_index))
    X = pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=MAX_SEQUENCE_LENGTH)
    Y = pd.get_dummies(df.Humour).values
    return X, Y

In [0]:
# Encode the cleaned training texts and labels with the default settings.
X, Y = tokenize_text()

### and we're ready to start the splitting and training

In [0]:
def create_split(X=X, Y=Y,  test_size = 0.10, random_state = 42):
    """Split features and labels into a training part and a held-out part.

    Args:
        X: Feature matrix.
        Y: Label matrix aligned row-by-row with ``X``.
        test_size: Forwarded to ``train_test_split``.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        The tuple ``(X_train, X_test, Y_train, Y_test)``.
    """
    parts = train_test_split(X, Y, test_size=test_size, random_state=random_state)
    return tuple(parts)


# Perform the split with the default hold-out size and seed.
X_train, X_test, Y_train, Y_test = create_split()

In [0]:
def initiate_model_traning(X_train=X_train, Y_train=Y_train, epochs=1, batch_size=64, input_length=X.shape[1],
                           max_nb_words=50000, embedding_dim=100):
    """Build, compile and fit the LSTM classifier.

    Args:
        X_train: Padded integer sequences used for fitting.
        Y_train: One-hot label matrix; its column count sets the output width.
        epochs: Number of training epochs.
        batch_size: Mini-batch size.
        input_length: Sequence length expected by the embedding layer.
        max_nb_words: Vocabulary size of the embedding layer. The default
            matches the ``MAX_NB_WORDS`` default of ``tokenize_text``.
        embedding_dim: Size of each embedding vector. The default matches the
            ``EMBEDDING_DIM`` default of ``tokenize_text``.

    Returns:
        The fitted ``Sequential`` model.
    """
    model = Sequential()
    # The vocabulary size and embedding width used to be read from names that
    # exist only as parameters of tokenize_text, which raised NameError here.
    # input_length is now honoured instead of being silently replaced by
    # X.shape[1].
    model.add(Embedding(max_nb_words, embedding_dim, input_length=input_length))
    model.add(SpatialDropout1D(0.4))
    model.add(BatchNormalization())
    model.add(LSTM(100, dropout=0.4, recurrent_dropout=0.4, kernel_regularizer=regularizers.l2(0.01), activity_regularizer=regularizers.l1(0.01)))
    # One output unit per label column rather than a hard-coded 4.
    model.add(Dense(Y_train.shape[1], activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

    # 10% of the training rows are held back by Keras for validation.
    model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, validation_split=0.1)
    return model

In [0]:
# Train with the default hyper-parameters. The training function defined in
# this notebook is initiate_model_traning; create_and_train does not exist.
model = initiate_model_traning()

### Done with the training, now let's make our predictions. PS. don't forget to save the model!

In [0]:
def make_predictions(model=model, test=None, MAX_SEQUENCE_LENGTH=250, tokenizer=None):
    """Predict a humour label for every test row and write the submission file.

    Args:
        model: Fitted classifier whose ``predict`` returns one score per class.
        test: DataFrame with a ``Corrected_text`` column. When ``None`` the
            file at ``test_data_path`` is read at call time.
        MAX_SEQUENCE_LENGTH: Length every sequence is padded or truncated to.
        tokenizer: Fitted tokenizer used to encode the texts. When ``None``
            the module-level ``tokenizer`` is used.

    Returns:
        The prediction frame that was written to ``predictions_LSTM.csv``:
        the input without ``Corrected_text`` plus a ``Humour`` column.

    Raises:
        NameError: If no tokenizer is passed and none exists at module level.
    """
    # Read lazily: a DataFrame default would perform file I/O the moment the
    # function is defined and would be frozen for every later call.
    if test is None:
        test = pd.read_csv(test_data_path)
    if tokenizer is None:
        tokenizer = globals().get("tokenizer")
        if tokenizer is None:
            raise NameError("no fitted tokenizer available: pass tokenizer=... to make_predictions")

    test = test.fillna("")
    # Apply the same cleaning as the training texts (clean_text, then digit
    # removal); otherwise the tokenizer sees words it was never fitted on.
    cleaned = test.Corrected_text.apply(clean_text).str.replace(r'\d+', '', regex=True)
    seq = tokenizer.texts_to_sequences(cleaned)
    padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)
    labels = {0:'funny', 1:'hilarious', 2:'not_funny', 3:'very_funny'}
    # argmax over the class scores replaces Sequential.predict_classes, which
    # is no longer available in recent Keras releases.
    class_ids = np.argmax(model.predict(padded), axis=-1)
    test["Humour"] = np.array([labels[int(i)] for i in class_ids])
    test = test.drop(["Corrected_text"], axis=1)
    test.to_csv("predictions_LSTM.csv", index=False)
    print("Predictions are made! Ready to upload.")
    return test


pred = make_predictions()
pred.head()

In [0]:
# model.save("lstm.h5")