In [1]:
import numpy as np
import pandas as pd
import re
from V import V, glove

glove_dim = 25

31534it [00:00, 55982.40it/s]


In [2]:
def sanitize(t):
    """Normalize a raw tweet into lowercase, space-separated tokens.

    The steps run in this order: strip non-ASCII characters, replace URLs
    with ``<url>``, replace @-mentions with ``<handle>``, blank out
    punctuation, replace digit runs with ``<number>``, collapse whitespace
    and lowercase.

    Args:
        t: The raw text (``str``).

    Returns:
        The cleaned text with no leading space and, when non-empty, exactly
        one trailing space, so that results can be concatenated without
        gluing tokens together. Empty or whitespace-only input yields ``''``.
    """
    # Non-ASCII characters are removed first, so the non-ASCII alternatives in
    # the punctuation pattern further down can no longer match by then.
    t = re.sub(r'[^\x00-\x7F]+','', t)
    t = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', ' <url> ', t)
    t = re.sub(r'(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9-_]+)', ' <handle> ', t)
    t = re.sub(r'(#|\.|…|-|\?|!|:|;|%|_|\"|\'|&|“|’|”|\(|\)|\\|,|/|\*)', ' ', t)
    t = re.sub(r'\d+', ' <number> ', t)
    t = re.sub(r'\s+', ' ', t)
    # Whitespace is already collapsed, so at most one leading space remains.
    t = t.lower().lstrip(' ')
    # Always terminate non-empty output with a single space. Previously a
    # one-character result (e.g. 'a') was returned without it.
    if t and not t.endswith(' '):
        t += ' '
    return t

In [3]:
# Read the training data, drop the 'handle' column and use the CSV's unnamed
# first column as the index (renamed to 'id').
with open('train.csv') as train_file:
    df = pd.read_csv(train_file)
df = (
    df
    .drop('handle', axis=1)
    .set_index('Unnamed: 0')
    .rename_axis('id')
)
# Replace the raw tweet text with its sanitized form.
df = (
    df
    .assign(sanitized_text=df.text.map(sanitize))
    .drop('text', axis=1)
)
# df.head()

In [5]:
# with open('test.csv') as f:
#     test_df = pd.read_csv(f)
# test_df['sanitized_text'] = test_df.text.map(sanitize)
# pad = "<pad> "*4
# with open('corpus.txt', 'w') as f:
#     for tweet in df.sanitized_text:
#         f.write(tweet+pad)
#     for tweet in test_df.sanitized_text:
#         f.write(tweet+pad)

In [6]:
# Lookup statistics accumulated by get_vector() across every call.
num_all = 0      # total number of lookups attempted
num_bad = 0      # lookups for which no vector could be returned
bad_words = []   # the words behind those failed lookups, in call order

def get_vector(word):
    """Return the embedding for ``word`` from ``glove``, or ``None`` on failure.

    Every call increments ``num_all``. A failed lookup also increments
    ``num_bad`` and appends ``word`` to ``bad_words``.

    Args:
        word: The token to look up.

    Returns:
        Whatever ``glove[word]`` yields, or ``None`` when the lookup raises.
    """
    global num_all, num_bad, bad_words
    num_all += 1
    try:
        return glove[word]
    # `Exception` rather than a bare `except`, so KeyboardInterrupt and
    # SystemExit still propagate. The exact error `glove` raises for an
    # unknown word is not visible here (presumably KeyError — TODO confirm
    # and narrow further).
    except Exception:
        num_bad += 1
        bad_words.append(word)
        return None

def get_vectors(text, max_len=20):
    """Turn whitespace-separated text into a fixed-size matrix of word vectors.

    Words for which ``get_vector`` returns ``None`` are skipped, so the
    remaining vectors sit in consecutive rows. Previously the skip counter was
    only advanced when an assignment raised, so unknown words left zero rows in
    the middle of the sequence. Vectors beyond ``max_len`` are dropped and
    unused rows stay zero.

    Args:
        text: Text whose tokens are separated by whitespace.
        max_len: Number of rows in the result (defaults to the original 20).

    Returns:
        ``np.ndarray`` of shape ``(max_len, glove_dim)``.

    Raises:
        ValueError: If a looked-up vector cannot be stored in a row of width
            ``glove_dim``. This was previously swallowed silently.
    """
    vectors = np.zeros((max_len, glove_dim))
    row = 0
    for word in text.split():
        # Look up every word, including those past max_len, so the global
        # lookup counters kept by get_vector() cover the whole text as before.
        vector = get_vector(word)
        if vector is None or row >= max_len:
            continue
        vectors[row] = vector
        row += 1
    return vectors

# Class names; a name's position in this list is its class index.
name_from_number = ['biz&tech', 'celebrity', 'internetplatform', 'politician']

# Class name -> one-hot label vector, in the same order as name_from_number.
target_dict = {
    name: np.array([int(column == position) for column in range(len(name_from_number))])
    for position, name in enumerate(name_from_number)
}

# Per-class weights keyed by class index.
class_weights = dict(enumerate([1, 0.14, 0.533, 0.42]))

def one_hot(target):
    """Return the one-hot label vector stored in ``target_dict`` for ``target``.

    Raises ``KeyError`` when ``target`` is not one of the known class names.
    """
    return target_dict[target]


# Add the label vector and the embedded token matrix for every row.
df = df.assign(
    one_hot=df.target.map(one_hot),
    vectors=df.sanitized_text.map(get_vectors),
)
df.head()

Unnamed: 0_level_0,target,sanitized_text,one_hot,vectors
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,biz&tech,the case for learned index structures replacin...,"[1, 0, 0, 0]","[[0.367821, 0.304186, -0.609452, 1.882143, 0.9..."
1,biz&tech,<handle> lock you in a black box with a window...,"[1, 0, 0, 0]","[[1.287763, 0.304906, -0.47771, 1.626356, 1.01..."
2,biz&tech,what policy outcomes are you aiming to achieve...,"[1, 0, 0, 0]","[[0.96376, -0.303895, -0.180347, 1.166647, 0.1..."
3,biz&tech,machine learning for systems and systems for m...,"[1, 0, 0, 0]","[[-0.039726, -0.581507, -0.171636, 1.631949, 0..."
4,biz&tech,from the number of talks <handle> given this w...,"[1, 0, 0, 0]","[[0.94299, 0.327011, 0.79898, 0.678095, 1.0002..."


In [7]:
# Fraction of get_vector() lookups that failed: num_bad failures out of num_all calls.
num_bad/num_all

0.0

In [8]:
# Fixed seed so the split, and therefore every metric below, is reproducible
# after Restart & Run All. Both sample() calls were previously unseeded.
SPLIT_SEED = 42

# 90% of the rows for training; the remainder is the hold-out set.
train = df.sample(frac=0.9, random_state=SPLIT_SEED)
test = df.drop(train.index)

# Resample each hold-out class, with replacement, to class_weights[num] * 10
# times its original size. `num` follows the key order of target_dict.
tests = {}
for num, target in enumerate(target_dict):
    class_rows = test[test.target == target]
    tests[target] = class_rows.sample(
        frac=class_weights[num] * 10, replace=True, random_state=SPLIT_SEED
    )
test = pd.concat(list(tests.values()))

# Stack the per-row arrays into dense batches: X is (rows, 20, glove_dim)
# given get_vectors' default length, Y is (rows, 4).
trainX = np.stack(train.vectors.values)
trainY = np.stack(train.one_hot.values)
testX = np.stack(test.vectors.values)
testY = np.stack(test.one_hot.values)

In [9]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import LSTM, Dense, Input, Dropout

In [14]:
# Sequence classifier: variable-length sequences of glove_dim-wide vectors go
# through one LSTM and one hidden dense layer (each followed by dropout) into a
# 4-way softmax.
layer_stack = (
    Input(shape=(None, glove_dim)),
    LSTM(48, activation='relu'),
    Dropout(0.3),
    Dense(28, activation='relu'),
    Dropout(0.3),
    Dense(4, activation='softmax'),
)
model = Sequential()
for layer in layer_stack:
    model.add(layer)
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [15]:
# Train for 10 epochs with per-class loss weights, validating on the resampled hold-out set.
fit_options = {
    'epochs': 10,
    'verbose': 2,
    'validation_data': (testX, testY),
    'class_weight': class_weights,
}
model.fit(trainX, trainY, **fit_options)

Train on 28823 samples, validate on 10333 samples
Epoch 1/10
28823/28823 - 27s - loss: 0.3618 - acc: 0.5361 - val_loss: 0.9291 - val_acc: 0.6365
Epoch 2/10
28823/28823 - 24s - loss: 0.2690 - acc: 0.6920 - val_loss: 0.7258 - val_acc: 0.7179
Epoch 3/10
28823/28823 - 24s - loss: 0.2302 - acc: 0.7408 - val_loss: 0.7381 - val_acc: 0.7256
Epoch 4/10
28823/28823 - 24s - loss: 0.2154 - acc: 0.7591 - val_loss: 0.6114 - val_acc: 0.7727
Epoch 5/10
28823/28823 - 24s - loss: 0.2002 - acc: 0.7684 - val_loss: 0.5852 - val_acc: 0.7738
Epoch 6/10
28823/28823 - 24s - loss: 0.1876 - acc: 0.7878 - val_loss: 0.5526 - val_acc: 0.7932
Epoch 7/10
28823/28823 - 26s - loss: 0.1784 - acc: 0.7931 - val_loss: 0.5417 - val_acc: 0.7932
Epoch 8/10
28823/28823 - 26s - loss: 0.1716 - acc: 0.7998 - val_loss: 0.5628 - val_acc: 0.7863
Epoch 9/10
28823/28823 - 24s - loss: 0.1655 - acc: 0.8065 - val_loss: 0.5494 - val_acc: 0.8003
Epoch 10/10
28823/28823 - 24s - loss: 0.1608 - acc: 0.8080 - val_loss: 0.6228 - val_acc: 0.7940

<tensorflow.python.keras.callbacks.History at 0x7fbbf45fb6a0>

In [23]:
# Write the trained model to 'model.h5' in the working directory.
model.save('model.h5')

In [39]:
# Replace the current weights with those stored in 'model-added-test(90).h5'.
# NOTE(review): this file is not written anywhere in this notebook (the save
# cell writes 'model.h5') — confirm where it comes from, otherwise a fresh
# Restart & Run All cannot reproduce the results below.
model.load_weights('model-added-test(90).h5')

In [41]:
# Loss and accuracy of the currently loaded weights on the resampled hold-out set.
# NOTE(review): the weights were loaded from an external file; if they were
# trained on rows that are part of this hold-out set, the score is optimistic — confirm.
model.evaluate(testX, testY)



[0.25254086967309125, 0.9075333]

In [42]:
# Read the competition test set and name its positional index 'id'.
with open('test.csv') as test_file:
    test_df = pd.read_csv(test_file).rename_axis('id')
# Sanitize and embed the text the same way as the training frame, then drop
# the raw 'text' column and the CSV's own 'id' column (the index carries the
# 'id' name from here on).
test_df = (
    test_df
    .assign(sanitized_text=lambda frame: frame.text.map(sanitize))
    .drop('text', axis=1)
    .assign(vectors=lambda frame: frame.sanitized_text.map(get_vectors))
    .drop('id', axis=1)
)
# test_df.head()

In [43]:
# Stack the per-row embedding matrices into one (rows, 20, glove_dim) batch.
test_eval = np.stack(test_df.vectors.values)
# Sequential.predict_classes is deprecated and was removed in later TensorFlow
# releases. For a softmax output it is the argmax over the last axis of
# predict(), which is available in every version.
preds = np.argmax(model.predict(test_eval), axis=-1)

In [44]:
# Store the predicted class indices (one per test row) in the 'target' column.
test_df['target'] = preds
def q(t):
    """Wrap the string ``t`` in literal double quotes."""
    return ''.join(('"', t, '"'))
# Turn each class index into its double-quoted class name.
test_df['target'] = test_df.target.map(
    lambda class_index: q(name_from_number[class_index])
)
# Only the index and 'target' are left for the output file.
test_df = test_df.drop(['sanitized_text', 'vectors'], axis=1)
# test_df.head()

In [45]:
# quoting=3 is csv.QUOTE_NONE: the writer adds no quoting of its own, so the
# double quotes already embedded in 'target' by q() are written out verbatim.
test_df.to_csv('predictions.csv', quoting=3)
