In [None]:
import pickle
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [None]:
import os
def create_directory(path):
    """Ensure that the directory ``path`` exists, creating parents as needed.

    Calling this for a directory that already exists is a no-op.

    Args:
        path: File-system path of the directory to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
        OSError: If the directory cannot be created (e.g. permissions).
    """
    # exist_ok=True replaces the former "check, then create" sequence, which
    # could fail if the directory appeared between the check and the creation.
    os.makedirs(path, exist_ok=True)

## Load data frames (created by preprocess.py)

In [None]:
# Read the pickled train/test frames and report their shapes.
# NOTE: unpickling executes code embedded in the file; only load pickles
# produced by a trusted source.
df_train = pd.read_pickle('./text/newsgroups/train.pkl')
df_test = pd.read_pickle('./text/newsgroups/test.pkl')
print(df_train.shape, df_test.shape, sep='\n')

# Stack train on top of test (row-wise) so later cells see one combined frame.
df = pd.concat([df_train, df_test], axis='index')
print(df.shape)

## Create a corpus from the entire dataset (train + test)

In [None]:
# One document string per row: each entry of the 'tokens' column is joined
# with single spaces.
corpus = df['tokens'].map(' '.join)

## Create Transformers with different vocabulary sizes

In [None]:
# Output folder for the pickled transformers written by the cells below.
create_directory('./models')

# Values passed as CountVectorizer(max_features=...), one transformer per
# entry. None is saved under the 'full' suffix.
VOCABULARY_SIZES = [35000, 70000, 100000, None]

In [None]:
def create_tfid_transformer(countVectorizer, corpus):
    """Build a count -> TF-IDF pipeline and fit it on ``corpus``.

    Args:
        countVectorizer: Vectorizer used as the pipeline's ``'count'`` step.
        corpus: Documents the pipeline is fitted on.

    Returns:
        The fitted ``Pipeline`` with steps ``'count'`` and ``'tfid'``.
    """
    steps = [
        ('count', countVectorizer),
        ('tfid', TfidfTransformer()),
    ]
    # Pipeline.fit returns the pipeline itself, so fitting and returning
    # can be done in one expression.
    return Pipeline(steps).fit(corpus)

In [None]:
# Fit and pickle one count -> TF-IDF pipeline per configured vocabulary size.
for vocab_size in VOCABULARY_SIZES:
    # Raw string: '\S' in a plain literal is an invalid escape sequence and
    # triggers a warning; the pattern value itself is unchanged.
    countVectorizer = CountVectorizer(lowercase=False, token_pattern=r'\S+', max_features=vocab_size)

    # None is written out under the 'full' suffix.
    suffix = 'full' if vocab_size is None else str(vocab_size)

    # The pipeline fit inside create_tfid_transformer already fits the
    # vectorizer, so no separate countVectorizer.fit(corpus) pass is needed.
    transformer = create_tfid_transformer(countVectorizer, corpus)

    file_name = './models/tfidf_transformer_{}.pkl'.format(suffix)
    with open(file_name, 'wb') as f:
        pickle.dump(transformer, f, protocol=pickle.HIGHEST_PROTOCOL)
    # Report the path only once the file has been written and closed.
    print(file_name)

## Create Transformers with trimmed vocabulary

In [None]:
# Trimmed vocabulary: terms are filtered with max_df=0.9 and min_df=0.0005
# instead of a max_features cap.
# Raw string: '\S' in a plain literal is an invalid escape sequence and
# triggers a warning; the pattern value itself is unchanged.
countVectorizer = CountVectorizer(max_df=0.9, min_df=0.0005, lowercase=False, token_pattern=r'\S+', max_features=None)

suffix = 'trim'

# The pipeline fit inside create_tfid_transformer already fits the vectorizer,
# so no separate countVectorizer.fit(corpus) pass is needed.
transformer = create_tfid_transformer(countVectorizer, corpus)

# Kept as a module-level name in case later cells read it; taken from the
# fitted 'count' step of the pipeline.
vocabulary = transformer.named_steps['count'].vocabulary_

file_name = './models/tfidf_transformer_{}.pkl'.format(suffix)
with open(file_name, 'wb') as f:
    pickle.dump(transformer, f, protocol=pickle.HIGHEST_PROTOCOL)
# Report the path only once the file has been written and closed.
print(file_name)