# generate training data set

- assume training data in folder 
    + `data/raw/trainSet.csv`
    + first column is query 
    + second column is category (integer class id)
- assume input strings are:
    + in lower case
    + punctuation removed (stop words are kept)
    + words separated by spaces (no padding)
- output:
    + `data/processed/train.csv` 
    + `data/processed/test.csv` 

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Input: the complete labelled query set (only ever read by this notebook).
full_training_data_filepath = '../data/raw/trainSet.csv'

# Outputs: the two parts of the train/test split written by this notebook.
split_training_data_filepath = '../data/processed/train.csv'
split_testing_data_filepath = '../data/processed/test.csv'

In [2]:
# Load the original data. The file has no header row, so the two columns
# are named explicitly here.
known_queries = pd.read_csv(
    full_training_data_filepath,
    header=None,
    names=['query', 'category'],
)

# Show five randomly chosen rows, then let the cell echo the column dtypes.
display(known_queries.sample(5))
known_queries.dtypes

Unnamed: 0,query,category
429735,large format tile cutter,1077
532305,flues for wood burning stoves,533
450837,cheese hamper waitrose,598
270383,glitter canvas,1379
334867,second hand concrete garages,1278


query       object
category     int64
dtype: object

In [3]:
# The class label is an identifier, not a quantity: it should be treated as
# neither numerical nor ordinal, so store it as a pandas Categorical.
category_labels = known_queries['category']
known_queries['category'] = pd.Categorical(category_labels)

In [4]:

# Split the full data into training and testing sets, stratified on the
# category so that every class keeps the same share in both parts.
#
# The seed is fixed so that Restart & Run All reproduces exactly the same
# partition. Without it each run produces a different split, the files saved
# by the next cell are silently replaced with different rows, and results
# obtained from earlier runs are no longer comparable.
SPLIT_RANDOM_STATE = 42
df_train, df_test = train_test_split(
    known_queries,
    test_size=0.2,
    stratify=known_queries['category'],
    random_state=SPLIT_RANDOM_STATE,
)

# The two parts must partition the input: nothing dropped, nothing duplicated.
assert len(df_train) + len(df_test) == len(known_queries)

print('split', len(known_queries), 'total records 100 %')
print('into ',      len(df_train),      'train records', round(100*len(df_train)/len(known_queries),0), '%')
print(' and ',       len(df_test),      'test  records',  round(100*len(df_test)/len(known_queries), 0), '%')

split 606823 total records 100 %
into  485458 train records 80.0 %
 and  121365 test  records 20.0 %


In [5]:
# Save the train/test split to files; the following cells read these files
# back, and they are the ones to use going forward.
#
# The output folder is created first because to_csv does not create missing
# parent directories and would otherwise fail when the folder is absent.
for split_frame, split_path in (
    (df_train, split_training_data_filepath),
    (df_test, split_testing_data_filepath),
):
    Path(split_path).parent.mkdir(parents=True, exist_ok=True)
    # The row index is written as the first (unnamed) column.
    split_frame.to_csv(split_path)

In [6]:
# Reload the training split from disk, replacing the in-memory frame.
# The first CSV column holds the saved row index, hence index_col=0.
# Note: `category` comes back as int64 (see the dtypes printed below); the
# Categorical conversion applied before the split does not survive the CSV
# round trip.
df_train = pd.read_csv(
    split_training_data_filepath,
    index_col=0,
)
display(df_train.head(n=3))
df_train.dtypes

Unnamed: 0,query,category
286418,ibooks free books,1308
141292,baglamukhi yantra,728
41733,red lion stockbridge,1091


query       object
category     int64
dtype: object

In [7]:
# Reload the testing split from disk, replacing the in-memory frame.
# The first CSV column holds the saved row index, hence index_col=0.
# Note: `category` comes back as int64 (see the dtypes printed below); the
# Categorical conversion applied before the split does not survive the CSV
# round trip.
df_test = pd.read_csv(
    split_testing_data_filepath,
    index_col=0,
)
display(df_test.head(n=3))
df_test.dtypes

Unnamed: 0,query,category
53412,most accurate pregnancy test,519
381911,dog sculptures,183
489319,steam room uk,169


query       object
category     int64
dtype: object

In [8]:
# verify evenness of split categories:
print('there are', len(df_train), 'records in training data, vs', len(df_test), 'in test data')
print('the training data for the smallest classes have', sorted(known_queries['category'].value_counts())[0:5], 'records')
print('check class balances:')
class_frequency_comparison = \
pd.DataFrame(
    {
        'full': known_queries['category'].value_counts()/len(known_queries),
        'training': df_train['category'].value_counts()/len(df_train),
        'testing': df_test['category'].value_counts()/len(df_test),
    })
display(class_frequency_comparison.head(3))
display(class_frequency_comparison.sample(3))
display(class_frequency_comparison.tail(3))

there are 485458 records in training data, vs 121365 in test data
the training data for the smallest classes have [4, 10, 14, 14, 16] records
check class balances:


Unnamed: 0,full,training,testing
0,0.000747,0.000746,0.00075
1,0.000666,0.000665,0.000667
2,0.000959,0.00096,0.000956


Unnamed: 0,full,training,testing
1342,0.00072,0.000721,0.000717
1357,0.000936,0.000935,0.000939
702,0.000381,0.000381,0.000379


Unnamed: 0,full,training,testing
1416,0.000826,0.000826,0.000824
1417,0.000859,0.000859,0.000857
1418,0.000747,0.000746,0.00075
