In [None]:

import os
import pickle

import pandas as pd
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_union

# Label names; presumably the target columns of train.csv (not used in this
# part of the script) -- TODO confirm against the dataset.
class_names = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# Read both splits, replacing every missing cell with a single space.
train, test = (
    pd.read_csv(csv_path).fillna(' ')
    for csv_path in ('data/train.csv', 'data/test.csv')
)

# Raw comment column of each split; this is what gets vectorized.
train_text, test_text = train['comment_text'], test['comment_text']

# Options both TF-IDF vectorizers have in common: sublinear term-frequency
# scaling, unicode accent stripping and a cap of 30000 features each.
_shared_tfidf_options = dict(
    sublinear_tf=True,
    strip_accents='unicode',
    max_features=30000,
)

# Word-level unigrams; the token pattern accepts runs of one or more word
# characters.
word_vectorizer = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    **_shared_tfidf_options
)

# Character-level n-grams of length 1 through 4.
char_vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 4),
    **_shared_tfidf_options
)

if __name__ == '__main__':
    # Everything that fits models or writes artifacts runs only when the file
    # is executed as a script.

    # All outputs go to temp_res/; create it up front so a missing directory
    # cannot fail the run after the expensive fit has already finished.
    os.makedirs('temp_res', exist_ok=True)

    vectorizer = make_union(word_vectorizer, char_vectorizer, n_jobs=2)

    # fit_transform learns the vocabularies and vectorizes the training text
    # in one pass instead of analysing the training text twice.
    train_features = vectorizer.fit_transform(train_text)
    test_features = vectorizer.transform(test_text)

    # Pickle the vectorizers only AFTER fitting: an unfitted TfidfVectorizer
    # carries no vocabulary or IDF weights and cannot transform text once it
    # is loaded back. The fitted estimators are taken from the union rather
    # than from the module-level objects because, with n_jobs > 1, fitting may
    # happen on copies in worker processes; the union stores the fitted
    # instances it receives back, in the order they were passed to make_union.
    (_, fitted_word), (_, fitted_char) = vectorizer.transformer_list
    with open('temp_res/word_vectorizer.pk', 'wb') as fout:
        pickle.dump(fitted_word, fout)
    with open('temp_res/char_vectorizer.pk', 'wb') as fout:
        pickle.dump(fitted_char, fout)

    # Persist the sparse feature matrices for the downstream steps.
    sparse.save_npz("temp_res/train_features.npz", train_features)
    sparse.save_npz("temp_res/test_features.npz", test_features)