In [4]:
#!/usr/bin/env python3
from __future__ import absolute_import
import pandas as pd
import time

import logging
from utils.logger import logger
from constants import DATA_PATH, DF_DUMP_NAME, CLF_DUMP_NAME, build_constants
import train_and_test
import model_pipeline
import utils.clean_data
import utils.persistence
import utils.normalize_data
import utils.get_data_subset

In [5]:
# Column names shared by the data-preparation and modelling cells.
# TARGET_COL is the column passed to the genre filter/normalizer and to
# get_x_y as the target; SAMPLE_COL is passed to get_x_y and get_pipeline
# as the sample column.
TARGET_COL = "genre"
SAMPLE_COL = "lyrics"

In [6]:
# Record the wall-clock start; the final cell subtracts this from time.time()
# to log the total run duration.
start_time = time.time()
# Set the project logger's threshold to DEBUG so debug-level messages are emitted.
logger.setLevel(logging.DEBUG)
# Imported from the constants module. NOTE(review): the captured output shows
# "[corpora build]" debug messages before "Started", so this call appears to
# build the word corpora -- confirm in constants.py.
build_constants()
logger.info("Started")

[2017-08-02 18:49:36,563] - [DEBUG] - [corpora build] - Executing function add_custom_corpus
[2017-08-02 18:49:38,875] - [DEBUG] - [corpora build] - Added 1614144 more words. Total is 1614144
[2017-08-02 18:49:38,875] - [DEBUG] - [corpora build] - Executing function add_nltk_words
[2017-08-02 18:49:39,438] - [DEBUG] - [corpora build] - Added 209726 more words. Total is 1823870
[2017-08-02 18:49:39,720] - [INFO] - [<ipython-input-6-1aaa3d1d4eef> <module>] - Started


In [7]:
# Read the raw records from DATA_PATH and report how many rows were loaded.
df = pd.read_json(path_or_buf=DATA_PATH, orient='records', encoding="UTF8")
logger.debug("Loaded {} rows into df".format(len(df)))

# Run the preparation helpers as a single chain. DataFrame.pipe passes the
# frame produced by the previous step as the first positional argument, so
# each helper is invoked exactly as helper(df, *extra_args), in this order.
df = (
    df
    .pipe(utils.get_data_subset.crop, None, None)
    .pipe(utils.get_data_subset.filter_rows_by_string,
          [TARGET_COL],
          ['Rock', 'Hip Hop'])
    .pipe(utils.clean_data.execute_cleaners)
    .pipe(utils.normalize_data.normalize_genres, TARGET_COL)
)

# get_x_y receives the prepared frame plus the sample and target column names;
# its two return values are the inputs handed to the training cell.
X, y = utils.get_data_subset.get_x_y(df, SAMPLE_COL, TARGET_COL)

[2017-08-02 18:49:45,954] - [DEBUG] - [<ipython-input-7-942f9752e7af> <module>] - Loaded 66745 rows into df
[2017-08-02 18:49:45,954] - [DEBUG] - [get_data_subset filter_rows_by_string] - Row count is 66745
[2017-08-02 18:49:45,970] - [DEBUG] - [get_data_subset filter_rows_by_string] - Row count is 13425 (Filtered out 53320 rows)
[2017-08-02 18:49:46,049] - [DEBUG] - [clean_data execute_cleaners] - Row count is 13425
[2017-08-02 18:50:04,910] - [DEBUG] - [clean_data execute_cleaners] - Row count is 13425 (drop_by_language Cleaned 0 rows)
[2017-08-02 18:50:04,910] - [DEBUG] - [clean_data execute_cleaners] - Row count is 13425
[2017-08-02 18:50:04,926] - [DEBUG] - [clean_data execute_cleaners] - Row count is 13425 (drop_by_nas Cleaned 0 rows)
[2017-08-02 18:50:04,926] - [DEBUG] - [clean_data execute_cleaners] - Row count is 13425
[2017-08-02 18:50:05,051] - [DEBUG] - [clean_data execute_cleaners] - Row count is 13425 (drop_by_strings Cleaned 0 rows)
[2017-08-02 18:50:05,051] - [DEBUG] - 

In [8]:
# Obtain the model pipeline, passing the name of the sample column.
clf = model_pipeline.get_pipeline(SAMPLE_COL)

# Hand the samples, targets and pipeline to test_using_kfold.
# NOTE(review): presumably a k-fold evaluation, judging by the name -- see
# train_and_test for what it fits and reports.
train_and_test.test_using_kfold(X, y, clf)

# Log the wall-clock time elapsed since start_time was recorded in the setup cell.
elapsed_seconds = time.time() - start_time
logger.info("Finished in {0:.2f} seconds".format(elapsed_seconds))

[2017-08-02 18:53:42,969] - [INFO] - [train_and_test train_and_dump] - Fitting and transforming the model
[2017-08-02 18:53:42,969] - [INFO] - [base_feat_ext transform] - Extracting feature bad_words_ratio
[2017-08-02 18:53:49,266] - [INFO] - [base_feat_ext transform] - Extracting feature inner_capital_letters_ratio
[2017-08-02 18:53:56,954] - [INFO] - [base_feat_ext transform] - Extracting feature nature_words_ratio
[2017-08-02 18:54:03,331] - [INFO] - [base_feat_ext transform] - Extracting feature negative_words_ratio
[2017-08-02 18:54:09,736] - [INFO] - [base_feat_ext transform] - Extracting feature positive_words_ratio
[2017-08-02 18:54:15,926] - [INFO] - [base_feat_ext transform] - Extracting feature single_quote_ratio
[2017-08-02 18:54:16,035] - [INFO] - [base_feat_ext transform] - Extracting feature stanza_count
[2017-08-02 18:54:16,160] - [INFO] - [base_feat_ext transform] - Extracting feature stanza_len_avg
[2017-08-02 18:54:16,737] - [INFO] - [base_feat_ext transform] - Extra