In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the data files below can be read by path.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import nltk
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from nltk.stem.snowball import SnowballStemmer
import os

# Root folder of the challenge data on the mounted Google Drive.
DATA_DIR = '/content/drive/MyDrive/codeml/challenge-4/data'

# The paths stay available as names because later cells re-open the same files.
train_file, test_file = (
    os.path.join(DATA_DIR, csv_name) for csv_name in ('train.csv', 'test_text.csv')
)

# Full train / test tables, loaded once.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_file, test_file))

In [None]:
import tensorflow as tf
import keras
from keras import layers
from keras.layers import Input, Dense, Dropout

In [None]:
import gensim

In [None]:
class RedditCorpus:
    """Re-iterable stream of tokenised texts read from a CSV file.

    The CSV is loaded once in the constructor. Every pass over the object
    yields one token list per row of the ``text`` column, in row order, as
    produced by ``gensim.utils.simple_preprocess``.
    """

    def __init__(self, filename):
        """Load ``filename`` (a CSV with a ``text`` column) into ``self.df``."""
        self.df = pd.read_csv(filename)

    def __iter__(self):
        """Yield the token list for each row's text.

        A row whose text is not a string (for example a missing value parsed
        as NaN) yields an empty list, so the output keeps one entry per row.
        """
        # Walk the column directly: iterrows() would build a whole Series per
        # row just to read a single field.
        for text in self.df['text']:
            yield gensim.utils.simple_preprocess(text if isinstance(text, str) else '')

In [None]:
# Dimensionality of the word vectors: passed to Word2Vec, used as the Embedding
# layer's output size, and as the length of the all-zero fallback vectors.
VEC_SIZE = 100

In [None]:
# gensim 4 renamed Word2Vec's `size` argument to `vector_size`; pick whichever
# name the installed version accepts so this cell runs on both 3.x and 4.x.
_w2v_size_kwarg = 'vector_size' if int(gensim.__version__.split('.')[0]) >= 4 else 'size'

# Train word vectors on the tokenised training texts. RedditCorpus is
# re-iterable, which Word2Vec needs because it makes several passes.
word2vec = gensim.models.Word2Vec(
    sentences = RedditCorpus(train_file),
    workers = 4,
    window = 10,
    min_count = 2,
    **{_w2v_size_kwarg: VEC_SIZE}
)

In [None]:
# gensim >= 4 exposes the vocabulary as `wv.key_to_index`; gensim 3.x only has
# `wv.vocab`. Both are dicts keyed by word, so iterating either gives the words.
_w2v_vocab = getattr(word2vec.wv, 'key_to_index', None)
if _w2v_vocab is None:
    _w2v_vocab = word2vec.wv.vocab

# Ids start at 1 so that 0 stays free for padding.
token_id_map = {word: (i + 1) for i, word in enumerate(_w2v_vocab)}

In [None]:
def doc2vec(text):
    """Average the word2vec vectors of the known tokens in ``text``.

    Args:
        text: iterable of tokens (already tokenised).

    Returns:
        The element-wise mean of the vectors of all tokens the model knows,
        or a zero vector of length ``VEC_SIZE`` when none of them is known.
    """
    keyed_vectors = word2vec.wv
    # `token in keyed_vectors` works on gensim 3.x and 4.x alike, whereas the
    # `wv.vocab` attribute only exists on 3.x.
    vectors = [keyed_vectors[token] for token in text if token in keyed_vectors]

    if not vectors:
        return np.zeros(VEC_SIZE)
    return np.mean(vectors, axis=0)

def text_to_token_ids(text):
    """Translate a token sequence into an array of vocabulary ids.

    Tokens that are not in ``token_id_map`` are dropped, so the result can be
    shorter than ``text``.
    """
    known_ids = []
    for token in text:
        if token in token_id_map:
            known_ids.append(token_id_map[token])
    return np.array(known_ids)

In [None]:
# Map every token id to its word2vec vector. The ids are taken from
# token_id_map rather than re-enumerating the vocabulary, so the two tables
# cannot disagree and no version-specific gensim attribute is needed.
embeddings = {token_id: word2vec.wv[word] for word, token_id in token_id_map.items()}
# Id 0 is the padding id and gets an all-zero vector.
embeddings[0] = np.zeros(VEC_SIZE)

In [None]:
train_corpus = RedditCorpus(train_file)

# Keep X as a plain list of per-document id arrays. The documents have different
# lengths, so np.array(X) would build a ragged array: a deprecation warning on
# older NumPy and a ValueError from NumPy 1.24 on. pad_sequences takes the list as is.
X = [text_to_token_ids(text) for text in train_corpus]

  after removing the cwd from sys.path.


In [None]:
# Fixed number of token ids per document: the `maxlen` given to pad_sequences
# and the `input_length` of the Embedding layer.
SEQUENCE_LEN = 120

In [None]:
# Bring every training id sequence to exactly SEQUENCE_LEN entries, filling the
# tail of short ones with the padding id 0, to get one rectangular array.
X_padded = tf.keras.preprocessing.sequence.pad_sequences(
    sequences=X,
    maxlen=SEQUENCE_LEN,
    padding='post',
    value=0,
)

In [None]:
# Distinct subreddit labels, in order of first appearance in the training data.
subreddit_names = train_df.subreddit.unique()

# Label -> integer class id, and the inverse lookup for decoding predictions.
value_map = {name: class_id for class_id, name in enumerate(subreddit_names)}
value_map_inv = dict(enumerate(subreddit_names))

In [None]:
# Integer class id of every training row, aligned with train_df's row order.
y = train_df['subreddit'].apply(value_map.get).values

In [None]:
# Hold out 10% of the rows for validation. The seed is fixed so the same rows
# are held out on every run and validation scores are comparable between runs.
SPLIT_SEED = 42
X_train, X_valid, y_train, y_valid = train_test_split(
    X_padded, y, test_size=.1, random_state=SPLIT_SEED)

In [None]:
# Token id 0 is the padding value, so `mask_zero=True` lets the LSTM skip padded
# timesteps instead of reading the trailing zeros appended after each text.
# NOTE(review): the layer starts from random weights; the word2vec vectors held in
# `embeddings` are only used for the vocabulary size here — confirm that is intended.
model = tf.keras.Sequential([
    layers.Embedding(
        len(embeddings),
        VEC_SIZE,
        input_length=SEQUENCE_LEN,
        mask_zero=True,
        trainable=True),
    # No input_shape here: it is only honoured on the first layer of a Sequential.
    layers.Bidirectional(layers.LSTM(64)),
    # Non-linear hidden layer; without an activation it would merge with the
    # output layer into a single linear map.
    layers.Dense(32, activation='relu'),
    # One probability per subreddit class.
    layers.Dense(len(value_map), activation='softmax')
])
# Labels are integer class ids, hence the sparse cross-entropy. `metrics` is
# passed as a list, the form Keras documents.
model.compile(loss=keras.losses.SparseCategoricalCrossentropy(), optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_14"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_13 (Embedding)    (None, 120, 100)          2726600   
                                                                 
 bidirectional_13 (Bidirecti  (None, 128)              84480     
 onal)                                                           
                                                                 
 dense_25 (Dense)            (None, 32)                4128      
                                                                 
 dense_26 (Dense)            (None, 5)                 165       
                                                                 
Total params: 2,815,373
Trainable params: 2,815,373
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Re-compile with an explicit Adam learning rate. `metrics` is given as a list,
# the form documented by Keras; newer Keras versions reject a bare string.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy'],
)

In [None]:
BATCH_SIZE = 16

# Train on the training split only. Feeding the full X_padded / y would include
# the rows in X_valid, and any score computed on them afterwards would be
# measured on data the model has already seen.
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
# A buffer as large as the dataset gives a full shuffle every epoch.
train_dataset = train_dataset.shuffle(buffer_size=len(X_train)).batch(BATCH_SIZE)

# Prepare the validation dataset
val_dataset = tf.data.Dataset.from_tensor_slices((X_valid, y_valid))
val_dataset = val_dataset.batch(BATCH_SIZE)

In [None]:
# Report loss and accuracy on the held-out split after every epoch, so
# over-fitting is visible during training and not only after the run ends.
model.fit(train_dataset, epochs=10, validation_data=(X_valid, y_valid))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


KeyboardInterrupt: ignored

In [None]:
# Class probabilities for the validation rows, one row per document.
y_pred = model.predict(X_valid)
# The predicted class is the column with the highest probability.
y_valid_hat = y_pred.argmax(axis=1)
# Micro-averaged F1 on the validation rows (shown as the cell output).
f1_score(y_true=y_valid, y_pred=y_valid_hat, average='micro')

0.9819605173587475

In [None]:
# Token stream over the test CSV, built the same way as the training corpus.
test_corpus = RedditCorpus(test_file)

In [None]:
# Keep X_test as a plain list of per-document id arrays. The documents differ in
# length, so np.array(X_test) would build a ragged array: a deprecation warning
# on older NumPy and a ValueError from NumPy 1.24 on. pad_sequences takes the list as is.
X_test = [text_to_token_ids(text) for text in test_corpus]

  


In [None]:
# Bring every test id sequence to exactly SEQUENCE_LEN entries, filling the tail
# of short ones with the padding id 0 — the same layout as the training input.
X_test_padded = tf.keras.preprocessing.sequence.pad_sequences(
    sequences=X_test,
    maxlen=SEQUENCE_LEN,
    padding='post',
    value=0,
)

In [None]:
# Class probabilities for every test document, then the id of the most likely class.
test_probabilities = model.predict(X_test_padded)
y_pred_test = test_probabilities.argmax(axis=1)

In [None]:
# Decode predicted class ids back to subreddit names.
predicted_subreddits = [value_map_inv.get(class_id) for class_id in y_pred_test]

# NOTE(review): `id` is taken from test_df's positional index, not from a column
# of the test file — confirm this is the id the submission format expects.
submission = pd.DataFrame({
    'id': test_df.index.tolist(),
    'subreddit': predicted_subreddits,
})
submission.to_csv('Submission-2.csv', index = False)