### Let's load data to see what we are dealing with ###

In [1]:
import numpy as np
import pandas as pd

In [2]:
train_data=pd.read_csv('train.csv',encoding='utf-8')
test_data=pd.read_csv('test.csv',encoding='utf-8')

In [3]:
train_data.iloc[629]

id                                                  id16017
text      "C'est à vous à faire," said his Majesty, cutt...
author                                                  EAP
Name: 629, dtype: object

In [4]:
test_data.head(3)

Unnamed: 0,id,text
0,id02310,"Still, as I urged our leaving Ireland with suc..."
1,id24541,"If a fire wanted fanning, it could readily be ..."
2,id00134,And when they had broken down the frail door t...


### Lets see the size, and if we have missing values in the datasets ###

In [5]:
print(train_data.shape,test_data.shape)
print(train_data.isnull().sum())
print(test_data.isnull().sum())

((19579, 3), (8392, 2))
id        0
text      0
author    0
dtype: int64
id      0
text    0
dtype: int64


In [6]:
train_data.author.value_counts()

EAP    7900
MWS    6044
HPL    5635
Name: author, dtype: int64

### Ok, seems clean ###


### Lets make categories of authors ###

In [7]:
def make_labels(train_data):
    authors=train_data.author.unique()
    dic={}
    for i,author in enumerate(authors):
        dic[author]=i
    labels=train_data.author.apply(lambda x:dic[x])
    return labels, dic

labels,dic=make_labels(train_data)

### Before tokenizing lets get rid of punctuations and stop words ###

In [8]:
import nltk
from nltk.stem import porter
from nltk.tokenize import RegexpTokenizer

from nltk.corpus import stopwords

In [9]:
def lemmatize_text(train_data,test_data):

    lemma = nltk.wordnet.WordNetLemmatizer()

    tokenizer = RegexpTokenizer(r'\w+')
    stops=set(stopwords.words('english'))
    def sentence_preproc(sentence):
        return " ".join([lemma.lemmatize(word) for word in tokenizer.tokenize(sentence) 
                         if word not in stops]).encode('ascii', errors='ignore')

    train_data['processed_text']=train_data['text'].apply(lambda x:sentence_preproc(x))
    test_data['processed_text']=test_data['text'].apply(lambda x:sentence_preproc(x))

In [10]:
lemmatize_text(train_data,test_data)

### Tokenize text of the training data with keras text preprocessing functions ###

In [11]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

Using TensorFlow backend.


In [12]:
EMBEDDING_DIM=300
NUM_WORDS=50000

In [13]:
def tokenize_text(train_data,test_data):
    texts=np.append(train_data.processed_text,test_data.processed_text)
    tokenizer = Tokenizer(num_words=NUM_WORDS,filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\'',
                          lower=True)
    tokenizer.fit_on_texts(texts)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    return tokenizer, word_index

tokenizer, word_index=tokenize_text(train_data,test_data)

Found 25339 unique tokens.


### word embedding ###

### lets load the pretrain Word2Vec model from Google https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit ###
### It might take time since it contains contains 300-dimensional vectors for 3 million words and phrases ###

In [14]:
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

from gensim.models.keyedvectors import KeyedVectors
from keras.layers import Embedding

import pickle



def make_embedding(word_index,bin_file='../GoogleNews-vectors-negative300.bin'):
    word_vectors = KeyedVectors.load_word2vec_format(bin_file, binary=True)

    vocabulary_size=min(len(word_index)+1,NUM_WORDS)
    embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))
    for word, i in word_index.items():
        if i>=NUM_WORDS:
            continue
        try:
            embedding_vector = word_vectors[word]
            embedding_matrix[i] = embedding_vector
        except KeyError:
            embedding_matrix[i]=np.random.normal(0,np.sqrt(0.25),EMBEDDING_DIM)

    with open('embed_matrix.pickle', 'wb') as handle:
        pickle.dump(embedding_matrix, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
    return embedding_matrix

def make_embedding_layer():
    vocabulary_size=min(len(word_index)+1,NUM_WORDS)
    try:
        with open('embed_matrix.pickle', 'rb') as handle:
            embedding_matrix = pickle.load(handle)
        if embedding_matrix.shape!=(vocabulary_size,300):
            embedding_matrix=make_embedding(word_index)
    except:
        embedding_matrix=make_embedding(word_index)
    embedding_layer = Embedding(vocabulary_size,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                trainable=True)
    return embedding_layer

In [19]:

def make_data(train_data,tokenizer,random_seed):
    val_data=train_data.sample(frac=0.10,random_state=random_seed)
    train_data=train_data.drop(val_data.index)
    sequences_train = tokenizer.texts_to_sequences(train_data.processed_text)
    sequences_valid=tokenizer.texts_to_sequences(val_data.processed_text)
    X_train = pad_sequences(sequences_train)
    X_val = pad_sequences(sequences_valid,maxlen=X_train.shape[1])
    y_train = to_categorical(np.asarray(labels[train_data.index]))
    y_val = to_categorical(np.asarray(labels[val_data.index]))
       
    sequences_test=tokenizer.texts_to_sequences(test_data.processed_text)
    X_test = pad_sequences(sequences_test,maxlen=X_train.shape[1])
    return X_train,y_train,X_val,y_val, X_test

from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.layers import Input, Dense, Embedding, Conv2D, MaxPooling2D, Dropout,concatenate
from keras.layers.core import Reshape, Flatten
from keras.optimizers import Adam
from keras.models import Model
from keras import regularizers

def build_model(X_train,y_train):
    sequence_length = X_train.shape[1]
    filter_sizes = [3,4,5]
    num_filters = 100
    drop = 0.7

    embedding_layer=make_embedding_layer()
    inputs = Input(shape=(sequence_length,))
    embedding = embedding_layer(inputs)
    reshape = Reshape((sequence_length,EMBEDDING_DIM,1))(embedding)

    conv_0 = Conv2D(num_filters, (filter_sizes[0], EMBEDDING_DIM),activation='relu',kernel_regularizer=regularizers.l2(0.01))(reshape)
    conv_1 = Conv2D(num_filters, (filter_sizes[1], EMBEDDING_DIM),activation='relu',kernel_regularizer=regularizers.l2(0.01))(reshape)
    conv_2 = Conv2D(num_filters, (filter_sizes[2], EMBEDDING_DIM),activation='relu',kernel_regularizer=regularizers.l2(0.01))(reshape)

    maxpool_0 = MaxPooling2D(pool_size=(sequence_length - filter_sizes[0] + 1, 1), strides=(1,1))(conv_0)
    maxpool_1 = MaxPooling2D(pool_size=(sequence_length - filter_sizes[1] + 1, 1), strides=(1,1))(conv_1)
    maxpool_2 = MaxPooling2D(pool_size=(sequence_length - filter_sizes[2] + 1, 1), strides=(1,1))(conv_2)

    merged_tensor = concatenate([maxpool_0, maxpool_1, maxpool_2], axis=1)
    flatten = Flatten()(merged_tensor)
    reshape = Reshape((3*num_filters,))(flatten)
    dropout = Dropout(drop)(flatten)
    output = Dense(units=3, activation='softmax',kernel_regularizer=regularizers.l2(0.01))(dropout)
    model = Model(inputs, output)
    return model

In [20]:
from keras import backend as K
def custom_loss(y_true,y_pred):
    eps=1e-15
    predictions = K.clip(y_pred, eps, 1. - eps)
    
    sum1=K.sum(y_true*K.log(predictions),axis=1)
    return -1.0*K.mean(sum1)
    

In [51]:
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model
tests=5
y_pred=np.zeros((tests,X_test.shape[0],y_train.shape[1]))
for i in range(0,tests):
    X_train,y_train,X_val,y_val,X_test=make_data(train_data,tokenizer,random_seed=10*i)
    model=build_model(X_train,y_train)
    adam = Adam(lr=1e-3)
    
    model.compile(loss=custom_loss,#'categorical_crossentropy',
                  optimizer=adam,
                  metrics=[custom_loss,'acc'])
    callbacks = [EarlyStopping(monitor='val_loss', patience=0),
                 ModelCheckpoint('./model{}_checkpoint.hdf5'.format(i), monitor='val_loss', verbose=0, save_best_only=True)]
    model.fit(X_train, y_train, batch_size=300, epochs=20, verbose=1, validation_data=(X_val, y_val),
             callbacks=callbacks)  
    model.load_weights('./model{}_checkpoint.hdf5'.format(i))
    y_pred[i]=model.predict(X_test)
    



Train on 17621 samples, validate on 1958 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20

KeyboardInterrupt: 

### now lets use our model to predict test data ###

In [None]:
y_predict=y_pred.mean(axis=0)

In [None]:
to_submit=pd.DataFrame(index=test_data.id,data={'EAP':y_predict[:,dic['EAP']],
                                                'HPL':y_predict[:,dic['HPL']],
                                                'MWS':y_predict[:,dic['MWS']]})

In [None]:
to_submit.to_csv('submit.csv')