In [91]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from file_utils import create_directory, save_to_pickle

## Change log
- The vocabulary size was reduced from 12209 to 10293 after removing hashtag words.
- The vocabulary size was reduced from 10293 to 4729 after adding min_df to exclude words that appear in fewer than two tweets.
- After applying max_df = 0.3 (document frequency > 30%), only 1 word was removed (which seems to be "vaccine" - see the eda notebook).

## Load data frames (created by preprocess.py)

In [92]:
# Read the pickled tweets data frame and report its (rows, columns) shape.
TWEETS_PICKLE = './text/vaccination_tweets.pkl'
df = pd.read_pickle(TWEETS_PICKLE)
print(df.shape)

(10920, 17)


## Create a corpus from the entire dataset

In [93]:
# Each row's token sequence becomes one space-separated document string.
corpus = df['tokens'].map(' '.join)

## Create Transformers

In [94]:
def create_tfid_transformer(countVectorizer, corpus):
    """Build a count -> TF-IDF pipeline and fit it on ``corpus``.

    Parameters
    ----------
    countVectorizer
        Vectorizer used as the first pipeline step (named 'count'). The
        instance is handed to the pipeline as-is; it is not copied here.
    corpus
        Documents passed to the pipeline's ``fit``.

    Returns
    -------
    Pipeline
        The pipeline with steps 'count' and 'tfid', after ``fit`` was called.
    """
    steps = [('count', countVectorizer), ('tfid', TfidfTransformer())]
    tfidf_pipeline = Pipeline(steps)
    tfidf_pipeline.fit(corpus)
    return tfidf_pipeline

In [95]:
# Keep only terms that appear in at least two documents. An integer min_df is
# an absolute document count, so no float proportion (2 / n_docs) has to be
# converted back to a count by the vectorizer.
min_df = 2
# Drop terms whose document frequency is strictly above 30% of the documents.
max_df = 0.3

# Documents are already space-joined tokens: keep their case and treat every
# run of non-whitespace characters as a token. The pattern is a raw string so
# that "\S" is not parsed as an (invalid) string escape.
countVectorizer = CountVectorizer(lowercase=False, token_pattern=r'\S+', max_features=None, min_df=min_df, max_df=max_df)

# Fitting the pipeline also fits its 'count' step, so the vectorizer is fitted
# once here instead of once on its own and again inside the pipeline.
transformer = create_tfid_transformer(countVectorizer, corpus)
vocabulary = transformer.named_steps['count'].vocabulary_
print('vocabulary size:', len(vocabulary))
transformer


vocabulary size: 4726


In [96]:
# Directory that receives the fitted transformer and the feature matrices.
MODEL_DIR = './models/vaccination_tweets'
create_directory(MODEL_DIR)

# Persist the fitted pipeline and echo where it was written.
file_name = MODEL_DIR + '/tfidf_transformer.pkl'
save_to_pickle(file_name, transformer)
print(file_name)

./models/vaccination_tweets/tfidf_transformer.pkl


In [97]:
# Transform the whole corpus, convert the result to a dense array via
# .toarray(), and pickle it as the "full" training set.
file_name = MODEL_DIR + '/train-full.pkl'
X = transformer.transform(corpus).toarray()
save_to_pickle(file_name, X)
print(file_name)

./models/vaccination_tweets/train-full.pkl


In [98]:
# Fixed seed so both splits are reproducible between runs.
RANDOM_STATE = 42

# NOTE(review): X comes from a transformer fitted on the full corpus, so the
# vocabulary and IDF weights were computed before this split - confirm that
# this is intended.
for train_size in [60, 80]:
    # train_size is a percentage; train_test_split receives it as a fraction.
    X_train, X_test = train_test_split(
        X,
        train_size=train_size/100,
        random_state=RANDOM_STATE,
    )

    suffix = '{}.pkl'.format(train_size)

    # Write the train part first, then the test part, echoing each path.
    for part_name, part_matrix in (('train', X_train), ('test', X_test)):
        file_name = '{}/{}-{}'.format(MODEL_DIR, part_name, suffix)
        save_to_pickle(file_name, part_matrix)
        print(file_name)

./models/vaccination_tweets/train-60.pkl
./models/vaccination_tweets/test-60.pkl
./models/vaccination_tweets/train-80.pkl
./models/vaccination_tweets/test-80.pkl
