# In [None]:

import numpy as np
import pandas as pd
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.preprocessing.sequence import pad_sequences


# Silence all warnings for the rest of the run.
warnings.filterwarnings('ignore')

# Load the train split first, then the test split; every missing cell is
# replaced by a single space.
train, test = (
    pd.read_csv(csv_path).fillna(' ')
    for csv_path in ('data/train.csv', 'data/test.csv')
)

# Only the comment text column is used below.
train_text, test_text = train['comment_text'], test['comment_text']

def concatenate_features(inps):
    """
    Join several per-sample feature blocks side by side.

    Each input is indexed by sample and yields that sample's features, i.e. it
    is laid out as (nb_samples, nb_features).

    :param inps: a list of inputs used for a multi-input model; all of them
        must have the same number of samples
    :return: a list of rows, where row i holds the features of sample i taken
        from every input in order, i.e. a layout of
        (nb_samples, nb_feature1 + nb_feature2 + ...); an empty list when
        ``inps`` is empty
    """
    if not inps:
        return []
    sample_count = len(inps[0])
    # Every input has to describe the same number of samples.
    assert all(len(block) == sample_count for block in inps)
    return [
        [value for block in inps for value in block[sample]]
        for sample in range(sample_count)
    ]

class SeqVectorizer(TfidfVectorizer):
    """TfidfVectorizer that can also encode documents as sequences of vocabulary indices."""

    def transform2seq(self, raw_docs):
        """
        Transform docs composed of words or n-grams into sequences of indices.

        Each document is split into features by the vectorizer's analyzer, and
        every feature found in the fitted vocabulary is replaced by its
        vocabulary index. Features missing from the vocabulary are skipped.

        :param raw_docs: an iterable of raw documents
        :return: a list with one list of vocabulary indices per document, in
            the order the analyzer produced the features
        """
        analyzer = self.build_analyzer()
        index_docs = []
        for doc in raw_docs:
            indices = []
            for feature in analyzer(doc):
                index = self.vocabulary_.get(feature)
                # Only unknown features are dropped. Index 0 is a regular
                # vocabulary entry and must be kept, so compare against None
                # instead of testing "index > 0".
                if index is not None:
                    indices.append(index)
            index_docs.append(indices)
        return index_docs


def pad_sequence(seq, repl, maxlen):
    """
    Force a sequence to exactly "maxlen" items.

    Items beyond the first "maxlen" are dropped; a shorter sequence is filled
    at the end with "repl" until it reaches "maxlen". A new list is returned.
    """
    kept = []
    for position, item in enumerate(seq):
        if position >= maxlen:
            # Already full: everything from here on is cut off.
            return kept
        kept.append(item)
    # The input ran out before maxlen was reached: pad the tail.
    while len(kept) < maxlen:
        kept.append(repl)
    return kept

# Word-level vectorizer: keep the 20000 most frequent words.
w_v = SeqVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    max_features=20000)

# Character-level vectorizer: keep the 20000 most frequent 3-grams.
c_v3 = SeqVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(3, 3),
    max_features=20000)

# Each entry pairs a vectorizer with the number of leading features kept per
# comment: the first 250 words and the first 400 character 3-grams.
svs = [
    (w_v, 250),
    (c_v3, 400),
]

# The vocabularies are learned from the training comments only.
for sv, _ in svs:
    sv.fit(train_text)

print("vocabulery lengths:")
for sv, _ in svs:
    print(len(sv.vocabulary_))


def _padded_index_blocks(texts):
    """Return one padded index block per vectorizer in ``svs`` for ``texts``.

    Sequences longer than the vectorizer's limit are cut at the end; shorter
    ones are filled at the end with ``len(vocabulary_)``, which is one past
    the largest vocabulary index.
    """
    blocks = []
    for vectorizer, maxlen in svs:
        sequences = vectorizer.transform2seq(texts)
        blocks.append(
            pad_sequences(sequences, maxlen=maxlen,
                          value=len(vectorizer.vocabulary_),
                          truncating="post", padding="post"))
    return blocks


# Word indices come first in every row, followed by the character 3-gram indices.
X_train = concatenate_features(_padded_index_blocks(train_text))
X_test = concatenate_features(_padded_index_blocks(test_text))

np.save("temp_res/train_mulinp250400", X_train)
np.save("temp_res/test_mulinp250400", X_test)

