In [1]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import f1_score
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers.embeddings import Embedding
from keras.layers import Dense, Dropout, Activation
from keras.preprocessing import sequence
from sklearn.feature_extraction.text import CountVectorizer
from keras.layers import LSTM
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re

seed = 7
np.random.seed(seed)

Using TensorFlow backend.


In [2]:
nltk.download('stopwords')
stop = set(stopwords.words('english'))
porter = PorterStemmer()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/olewis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
def clean_sentence(sentence):
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(D|P)', sentence)
    sentence = re.sub('[\W]+', ' ', sentence.lower()) + ''.join(emoticons).replace('-', '')
    sentence = re.sub(r'(\d+)([a-z+]+)', r'n_age \2', sentence)
    sentence = re.sub(r'([a-z+]+)(\d+)', r'\1 n_age', sentence)
    sentence = re.sub(r'(\d+)', r'n_age', sentence)
    slang_words = {' u ': ' you ', ' ur ': ' your ', ' ru ': ' are you ',
                   ' r ': ' are ', ' k ': ' okay ', ' ok ': ' okay ', ' ya ': ' yes ',
                   ' wd ': ' with ', ' hv ': ' have ', ' gv ': ' give ', ' bf ': ' boyfriend ', ' gf ': ' girlfriend ',
                   ' lez ': ' lesbian ', ' les ': ' lesbian ', ' m ': ' male ', ' f ': ' female ',
                   ' wanna ': ' want to ', ' gonna ': ' going to ',
                   ' lesbo ': ' lesbian ', ' bc ': ' because ', ' plz ': ' please ', ' don t ': ' dont ',
                   ' can t ': ' cant ', ' won t ': ' wont '}
    for k, v in slang_words.items():
        sentence = sentence.replace(k, v)

    # Stemming words and removing stopwords:
    sentence = " ".join([word for word in sentence.split() if word not in stop])
    return sentence

In [4]:
dataframe_train = pd.read_csv("data/train.csv")
dataset_train = dataframe_train.values
X_train = dataset_train[:, 1].astype(str)
X_train = map(clean_sentence, X_train)
Y_train = dataset_train[:, 0]

In [5]:
dataframe_test = pd.read_csv("data/test.csv")
dataset_test = dataframe_test.values
X_test = dataset_test[:, 1].astype(str)
X_test = map(clean_sentence, X_test)
Y_test = dataset_test[:, 0]

In [6]:
encoder = LabelEncoder()
encoder.fit(Y_train)
encoded_Y_train = encoder.transform(Y_train)
encoded_Y_test = encoder.transform(Y_test)

In [7]:
y_train_cat = np_utils.to_categorical(encoded_Y_train)
y_test_cat = np_utils.to_categorical(encoded_Y_test)
print (y_train_cat.shape, y_test_cat.shape)

((14048, 17), (3599, 17))


In [8]:
nb_classes = np.max(encoded_Y_train)+1
print(nb_classes, 'classes')

(17, 'classes')


In [9]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_num = tokenizer.texts_to_sequences(X_train)
X_test_num = tokenizer.texts_to_sequences(X_test)
X_train_mat = tokenizer.sequences_to_matrix(X_train_num)
X_test_mat = tokenizer.sequences_to_matrix(X_test_num)

In [10]:
print('X_train shape:', X_train_mat.shape)
print('X_test shape:', X_test_mat.shape)

('X_train shape:', (14048, 10249))
('X_test shape:', (3599, 10249))


In [36]:
batch_size = 100
nb_epoch = 50

In [42]:
	# create model
def baseline_model():
    model = Sequential()
    model.add(Dense(100, input_shape=(10249,)))
    model.add(Activation('relu'))
    model.add(Dropout(0.7))   
    model.add(Dense(nb_classes))
    model.add(Activation('softmax'))
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [43]:
estimator = KerasClassifier(build_fn=baseline_model, nb_epoch=nb_epoch, batch_size=batch_size, verbose=1)
estimator.fit(X_train_mat, y_train_cat)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x184775f90>

In [44]:
predictions = estimator.predict(X_test_mat)
print(set(predictions))
print(encoder.inverse_transform(predictions))

set([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16])
['misc' 'misc' 'personal' ..., 'meetup' 'personal' 'personal']


In [45]:
print 'macro f1:', f1_score(encoded_Y_test, predictions, average='macro')

macro f1: 0.486282499478
