In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
import multiprocessing

<gensim.interfaces.TransformedCorpus at 0x2435cad9358>

In [2]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [3]:
X_train = df_train.text
y_train_r = df_train.rating
y_train_p = df_train.positive
X_test = df_test.text
y_test_r = df_test.rating
y_test_p = df_test.positive

In [4]:
y_train_p = y_train_p.to_numpy()
y_test_p = y_test_p.to_numpy()
y_train_r = y_train_r.to_numpy()
y_test_r = y_test_r.to_numpy()
y_test_r_ker = keras.utils.to_categorical(y_test_r)
y_train_r_ker = keras.utils.to_categorical(y_train_r)

(50000,)

In [5]:
tfv = TfidfVectorizer(min_df=5, max_df=0.95, ngram_range=(1, 2))
tfv.fit(X_train)
X_train_tfv = tfv.transform(X_train)
X_test_tfv = tfv.transform(X_test)
X_train_tfv, X_val_tfv, y_train_r_ker, y_val_r_ker = train_test_split(X_train_tfv, y_train_r_ker, random_state=42, test_size = 0.1, shuffle=True)

In [6]:
from keras import backend as K

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

Using TensorFlow backend.


In [7]:
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi class version of Logarithmic Loss metric.
    :param actual: Array containing the actual target classes
    :param predicted: Matrix with class predictions, one probability per class
    """
    # Convert 'actual' to a binary array if it's not already:
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota

In [8]:
def build_model(layers):
    klayers = [keras.layers.Dense(layers[0], activation=tf.nn.tanh, input_shape=(X_train_tfv.shape[1],)), keras.layers.Dropout(0.3), 
keras.layers.BatchNormalization()]
    for l in range(1, len(layers)):
        klayers.append(keras.layers.Dense(layers[l], activation=tf.nn.tanh))
        klayers.append(keras.layers.Dropout(0.3))
        klayers.append(keras.layers.BatchNormalization())

    klayers.append(keras.layers.Dense(8, activation='softmax'))
    model = keras.Sequential(klayers)
    optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer,  metrics=[f1_m, 'accuracy'])
    return model

In [9]:
model = build_model([64, 128, 64, 32])
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 64)                5183040   
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
batch_normalization_v1 (Batc (None, 64)                256       
_________________________________________________________________
dense_1 (Dense)              (None, 128)               8320      
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
batch_normalization_v

In [10]:
es_callback = keras.callbacks.EarlyStopping(monitor='loss', patience=3)
model.fit(X_train_tfv, y_train_r_ker, epochs=10, batch_size=32, callbacks=[es_callback], validation_data=(X_val_tfv, y_val_r_ker))


Train on 22500 samples, validate on 2500 samples
Instructions for updating:
Use tf.cast instead.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
 2592/22500 [==>...........................] - ETA: 24s - loss: 1.1492 - f1_m: 0.4568 - acc: 0.5428

KeyboardInterrupt: 

In [15]:
prediction = model.predict(X_test_tfv)

In [16]:
accuracy_score(y_test_r_ker, np.round(prediction))

0.2404

In [17]:
f1_score(y_test_r_ker, np.round(prediction), average='micro')

0.32003834069971776

In [16]:
prediction = model.predict(X_train_tfv)

In [17]:
accuracy_score(y_train_r_ker, np.round(prediction))

0.98612

In [19]:
f1_score(y_train_r_ker, np.round(prediction), average='macro')

0.9855282289554013

In [103]:
df_test.rating.unique()

array([1, 3, 0, 2, 7, 4, 6, 5], dtype=int64)

In [104]:
df_train.rating.unique()

array([2, 3, 0, 1, 6, 5, 7, 4], dtype=int64)

In [5]:
X_train = X_train.str.split(' ').to_numpy()
X_test = X_test.str.split(' ').to_numpy()
docs = np.concatenate((X_train, X_test))

model = Word2Vec(docs, min_count=3, window=5, size=300, sample=6e-5, alpha=0.03, min_alpha=0.0007, workers=3)
w2v = dict(zip(model.wv.index2word, model.wv.syn0))

  


In [7]:
class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = len(word2vec.itervalues().next())

    def fit(self, X, y):
        return self

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

(1, 100000)