# Get Validation Data for ML models

## Import libs

In [1]:
import os
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

from src import algorithms, stats_scraper

In [2]:
def get_validation_data(path, first_season=2008, last_season=2014):
    """
    Fetch the team box-score data for a run of consecutive seasons.

    ``stats_scraper.web_scraper`` is called once per season with
    ``training_dataset=True`` and the resulting frames are stacked into a
    single DataFrame. With the default arguments this covers the seasons
    2008-09 through 2014-15.

    Parameters
    ----------
    path : str
        Directory used to build the per-season CSV path that is handed to the
        scraper (presumably where the raw download is stored -- confirm in
        ``stats_scraper``).
    first_season : int, optional
        Starting calendar year of the first season to fetch (2008 means the
        2008-09 season). Defaults to 2008.
    last_season : int, optional
        Starting calendar year of the last season to fetch, inclusive.
        Defaults to 2014, i.e. the 2014-15 season.

    Returns
    -------
    pandas.DataFrame
        All seasons concatenated row-wise with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``first_season`` is later than ``last_season``.
    """
    if first_season > last_season:
        # Fail early with a readable message instead of letting pd.concat
        # complain about an empty list.
        raise ValueError(
            f'first_season ({first_season}) must not be later than '
            f'last_season ({last_season})'
        )

    # DateFrom/DateTo are passed empty for every season; only 'Season' changes.
    kwargs = {'DateFrom': '',
              'DateTo': ''}

    list_df = []
    for year in range(first_season, last_season + 1):
        # Season label has the form 'YYYY-YY', e.g. '2008-09'.
        kwargs['Season'] = f'{year}-{(year + 1) % 100:02}'
        print(f'Getting data from Season: {kwargs["Season"]}')
        # File names keep the historical '2k<yy>' tag without zero padding
        # (2008 -> '2k8', 2014 -> '2k14').
        output = os.path.join(path, f'teams_boxscore_trad_2k{year % 100}_first_half.csv')
        season_df = stats_scraper.web_scraper(output, training_dataset=True, **kwargs)
        list_df.append(season_df)

    return pd.concat(list_df, axis=0, ignore_index=True)

## Data Pipeline


In [3]:
# Folder that the per-season CSV paths are built under.
path = os.path.join(os.getcwd(),'Validation_data')
algorithms.check_dir(path)
# NOTE(review): the recorded output of check_dir(path) names the *parent*
# folder, so it looks like check_dir only handles the directory containing
# the given path -- confirm in src.algorithms. Create the target folder
# explicitly so a fresh checkout can run this cell; this is a no-op when the
# folder already exists.
os.makedirs(path, exist_ok=True)

# Fetch the raw per-season frames, then run them through the src.algorithms
# helpers in order; each helper takes the frame and returns the next stage.
data = get_validation_data(path)
data = algorithms.pre_process_cols(data)
data = algorithms.hollinger_formula(data)
data = algorithms.concat_home_away_stats(data)
data = algorithms.get_dummies(data)
data = algorithms.feature_eng(data)
data = algorithms.clean_data(data)

/Users/dimitrisglenis/Documents/NBA_predict_matches/NBA-mathc-predictor folder already exists.
Getting data from Season: 2008-09
Fetching Data
Done
Getting data from Season: 2009-10
Fetching Data
Done
Getting data from Season: 2010-11
Fetching Data
Done
Getting data from Season: 2011-12
Fetching Data
Done
Getting data from Season: 2012-13
Fetching Data
Done
Getting data from Season: 2013-14
Fetching Data
Done
Getting data from Season: 2014-15
Fetching Data
Done


## Save the data

In [4]:
# Destination of the processed validation set, inside ./Validation_data.
validation_dir = os.path.join(os.getcwd(), 'Validation_data')
output = os.path.join(validation_dir, 'validation_data_first_half_2008_2015.csv')

# Run the project's directory check on the target path before writing.
algorithms.check_dir(output)

# Persist the processed frame (the index is written as the first column).
data.to_csv(output)

/Users/dimitrisglenis/Documents/NBA_predict_matches/NBA-mathc-predictor/Validation_data folder already exists.
