<h1><center> Classification using Keras </center></h1>

<div style="text-align: right"> Amen Memmi</div>
<div style="text-align: right"> amen.memmi@mail.mcgill.ca</div>
<div style="text-align: right">  ID: 260755070</div>

In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split

from keras.models import Sequential,Model
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import GRU
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense
from keras.layers import GlobalAveragePooling1D,Input,Conv1D,MaxPooling1D,Flatten
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from tqdm import tqdm_notebook

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


#### Import the data

In [2]:
data = pd.read_csv('train_input.csv')
data['category'] =  pd.read_csv('train_output.csv')['category']
data = data.drop(columns={'id'})
data.tail()

Unnamed: 0,conversation,category
164995,"<speaker_1> 2015 nfl draft "" i told you so "" t...",nfl
164996,<speaker_1> pk subban on lundqvist 's <number>...,hockey
164997,<speaker_1> kyrie irving and kevin love had a ...,nba
164998,<speaker_1> miroslav klose has the broken the ...,soccer
164999,<speaker_1> attorney charged with having sex w...,news


In [3]:
def clean_str(s):
    """Clean sentence"""
    for expr in [r"</d>", r"</s>",r"<speaker_1>",r"<speaker_2>",r"[^A-Za-z0-9(),!?\'\`]"]:
        s = re.sub(expr, " ", s)
    for expr in [r"\'s",r"\'ve",r"\'t",r"\'re",r"\'d",r"\'ll",]:
        s = re.sub(expr, " "+expr[1:], s)
    for expr in [r",",r"!",r"\(",r"\)"r"\?"]:
        s = re.sub(expr, " "+expr[1:]+" ", s)
    s = re.sub(r"\s{2,}", " ", s)
    s = re.sub(r'\S*(x{2,}|X{2,})\S*',"xxx", s)
    s = re.sub(r'[^\x00-\x7F]+', "", s)
    return s.strip().lower()

#### Clean the conversations

In [4]:
data["conversation"] = data["conversation"].apply(lambda x: clean_str(x))

#### Transform the output into multiple classes vectors

In [7]:
for c in data['category'].unique():
    data[c] = np.zeros(len(data)).astype(int)
u = data['category'].map({'news':2, 'nfl':3, 'soccer':4, 'movies':5, 'politics':6, 'hockey':7, 'nba':8,'worldnews':9})
for i in tqdm_notebook(range(len(data))):
    data.iloc[i,u[i]] = 1

HBox(children=(IntProgress(value=0, max=165000), HTML(value='')))




In [56]:
data.head()

Unnamed: 0,conversation,category,news,nfl,soccer,movies,politics,hockey,nba,worldnews
0,seaworld ceo steps down amid tanking revenues ...,news,1,0,0,0,0,0,0,0
1,strickland chargers owner dean spanos and gold...,nfl,0,1,0,0,0,0,0,0
2,iniesta plays keepy uppy with one leg man yout...,soccer,0,0,1,0,0,0,0,0
3,chappie trailer 1 number hugh jackman sci fi c...,movies,0,0,0,1,0,0,0,0
4,why the church of satan may get to open your c...,politics,0,0,0,0,1,0,0,0


In [11]:
# data.to_csv('data_nn_ready.csv',index=False)

In [None]:
# data = pd.read_csv('data_nn_ready.csv')

#### Split the data into training and test sets

In [23]:
X_train, X_test, y_train, y_test = train_test_split(data.conversation, data[data['category'].unique()], test_size=0.25, random_state=42)

#### Show input max sequence length
If the max input sequence length is too long, we can put a limit to it in order to reduce the training time.

In [24]:
xLengths = [len(word_tokenize(x)) for x in X_train]
h = sorted(xLengths)  #sorted lengths
maxLength =h[len(h)-1]
print("max input length is: ",maxLength)


max input length is:  1002


#### Choose a smaller max length for input sequence to reduce the training time


In [25]:
maxLength = h[int(len(h) * 0.8)]
print("80% covers input sequence length up to",maxLength)

70% cover input sequence length up to 114


#### Tokenize the conversations and padding the sequences

In [26]:
max_vocab_size = 200000
input_tokenizer = Tokenizer(max_vocab_size)
input_tokenizer.fit_on_texts(X_train)
input_vocab_size = len(input_tokenizer.word_index) + 1
print("input_vocab_size:",input_vocab_size)
totalX = np.array(pad_sequences(input_tokenizer.texts_to_sequences(X_train), maxlen=maxLength))

input_vocab_size: 102837


In [27]:
num_categories = data['category'].nunique()

#### Create the model: stacking layers 

In [28]:
EMBEDDING_DIM = 100
model = Sequential()
model.add(Embedding(input_vocab_size, EMBEDDING_DIM,input_length = maxLength))
model.add(GRU(256, dropout=0.1, return_sequences=True))
model.add(GRU(256, dropout=0.1))
model.add(Dense(32, activation='relu'))
model.add(Dense(num_categories, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

### Train model

In [30]:
history = model.fit(totalX,  y_train.values, validation_split=0.3, batch_size=128, epochs=3)

Train on 86625 samples, validate on 37125 samples
Epoch 1/3


Epoch 2/3




Epoch 3/3




#### Tokenize test set

In [39]:
max_vocab_size = 200000
test_tokenizer = Tokenizer(max_vocab_size)
test_tokenizer.fit_on_texts(X_test)
test_vocab_size = len(test_tokenizer.word_index) + 1
print("test_vocab_size:",test_vocab_size)
totalX_test = np.array(pad_sequences(input_tokenizer.texts_to_sequences(X_test), maxlen=maxLength))

test_vocab_size: 63956


#### Make class prediction on the test set

In [41]:
pred = model.predict_classes(totalX_test)

In [86]:
map_class = {'news':0, 'nfl':1, 'soccer':2, 'movies':3, 'politics':4, 'hockey':5, 'nba':6,'worldnews':7}
inv_map = {v: k for k, v in map_class.items()}

In [92]:
# Performance 
np.mean(pred==data.loc[y_test.index].category.map(map_class).values)

0.9100848484848485