/
helpers.py
189 lines (155 loc) · 7.76 KB
/
helpers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
"""
Various helper functions.
"""
from copy import deepcopy
import os
from typing import Union
import joblib
import numpy as np
import pandas as pd
from sklearn.base import RegressorMixin, ClassifierMixin
from sklearn.pipeline import Pipeline
def make_directories_if_not_exists(directories_list: list) -> None:
"""
Makes directories in the current working directory if they do not exist.
:param directories_list: list of directories to create
"""
for directory in directories_list:
if not os.path.exists(directory):
os.makedirs(directory)
def drop_features(df: Union[pd.DataFrame, np.ndarray], features_list: list) -> pd.DataFrame:
"""
Drops features (columns) from a dataframe.
:param df: pandas dataframe
:param features_list: list of features (columns) to drop
:return: dataframe with dropped columns
"""
df = df.drop(features_list, axis=1, errors='ignore')
return df
def save_cv_scores(df: pd.DataFrame, model_uid: str) -> None:
"""
Saves cross-validation scores into a model-specific directory.
:param df: dataframe of cross-validation scores
:param model_uid: model uid
"""
save_directory = os.path.join('modeling', 'model_results', model_uid, 'cv_scores')
make_directories_if_not_exists([save_directory])
df.to_csv(os.path.join(save_directory, 'cv_scores.csv'), index=False)
def save_model(model: Union[RegressorMixin, ClassifierMixin], model_uid: str,
model_append_name: str = None, ) -> None:
"""
Saves a pickled model into a model-specific directory.
:param model: regression or classification model
:param model_uid: model uid
:param model_append_name: name to append to the model name
"""
model_name = 'model'
if model_append_name:
model_name = f'{model_name}_{model_append_name}'
save_directory = os.path.join('modeling', 'model_results', model_uid, 'model')
make_directories_if_not_exists([save_directory])
joblib.dump(model, os.path.join(save_directory, f'{model_name}.pkl'))
def save_modeling_data_in_model_directory(x_train: pd.DataFrame, y_train: pd.Series, x_test: pd.DataFrame,
y_test: pd.Series, model_uid: str) -> None:
"""
Saves modeling data into a model-specific directory.
:param x_train: x train
:param y_train: y train
:param x_test: x test
:param y_test: y test
:param model_uid: model uid
"""
save_directory = os.path.join('modeling', 'model_results', model_uid, 'data')
make_directories_if_not_exists([save_directory])
joblib.dump(x_train, os.path.join(save_directory, 'x_train.pkl'))
joblib.dump(y_train, os.path.join(save_directory, 'y_train.pkl'))
joblib.dump(x_test, os.path.join(save_directory, 'x_test.pkl'))
joblib.dump(y_test, os.path.join(save_directory, 'y_test.pkl'))
def create_data_splits_on_year(df: pd.DataFrame, test_year_start: int) -> tuple:
"""
Creates two splits of data: training and testing for model training and evaluation based on a year-wise split.
:param df: pandas dataframe on which we can subset based on a year column
:param test_year_start: year to start the testing data
:return: training dataframe and testing dataframe
"""
train_df = df.loc[df['year'] < test_year_start]
test_df = df.loc[df['year'] >= test_year_start]
return train_df, test_df
def create_x_y_split(df: pd.DataFrame, target: str) -> tuple:
"""
Creates an x-y split for modeling.
:param df: dataframe of modeling data
:param target: name of the target column
:return: x predictor dataframe and y target series
"""
y_series = df[target]
x_df = df.drop(target, axis=1)
return x_df, y_series
def prep_modeling_and_evaluation_data(df: pd.DataFrame, test_year_start: int, target: str) -> tuple:
"""
Prepares data for modeling, evaluation, and prediction by creating necessary target and predictor splits.
:param df: dataframe of modeling data
:param test_year_start: year to start the testing data
:param target: name of the target column
:return: x train, y train, x test, y test, and a prediction dataframe
"""
train_df, test_df = create_data_splits_on_year(df, test_year_start)
x_train, y_train = create_x_y_split(train_df, target)
x_test, y_test = create_x_y_split(test_df, target)
return x_train, y_train, x_test, y_test
def create_custom_ts_cv_splits(df: pd.DataFrame, start_year: int, end_year: int, cv_folds: int) -> list:
"""
Creates a set of custom cross validation splits based on year. The function takes a start year and an end year along
with a number of cv folds. Based on the number of years between the provided years, it will create an equal number
of array splits based on cv folds. The folds are arranged as a time-series cross validation problem. That is,
in the first split, the first split is the training data and the second split is the testing data. In the second
split, the first two splits are the training data, and the third split is the testing data. And so on.
:param df: pandas dataframe of training data
:param start_year: start year of the cross validation folds
:param end_year: end year of the cross validation folds
:param cv_folds: number of cv folds
:return: list of tuples, with each tuple containing two items - the first is the index of the training observations
and the second is the index of the testing observations
"""
cv_splits = []
years = list(np.arange(start_year, end_year + 1, 1))
year_splits = np.array_split(years, cv_folds)
for n, year_split in enumerate(year_splits):
if n != cv_folds - 1:
train_ids = year_splits[:n + 1]
train_ids = np.concatenate(train_ids)
test_ids = year_splits[n + 1]
train_indices = df.loc[df['year'].isin(train_ids)].index.values.astype(int)
test_indices = df.loc[df['year'].isin(test_ids)].index.values.astype(int)
cv_splits.append((train_indices, test_indices))
return cv_splits
def transform_data_with_pipeline(pipeline: Pipeline, x_df: pd.DataFrame) -> pd.DataFrame:
"""
Transforms x_df with the pre-processing steps defined in the pipeline.
:param pipeline: scikit-learn pipeline
:param x_df: x predictor dataframe for transformation
:return: transformed x dataframe
"""
pipeline_ = deepcopy(pipeline)
pipeline_.steps.pop(len(pipeline_) - 1)
x_df = pipeline_.transform(x_df)
return x_df
def create_feature_bins(df: pd.DataFrame, pct_of_season_played_bins: list, winning_percentage_bins: list):
"""
Creates bins for a two pre-defined features: games_played and winning_percentage. The function also creates
a third new features that concatenates games_played_bin and winning_percentage_bin
:param df: modeling dataframe
:param pct_of_season_played_bins: bins for percent of season played
:param winning_percentage_bins: bins for winning percentage
:return: dataframe with two new binned features: games_played_bin and winning_percentage_bin and a concatenation of
the two called games+win_pct
"""
df_copy = deepcopy(df)
df_copy['pct_of_season_played_bin'] = pd.cut(df_copy['pct_of_season_played'], bins=pct_of_season_played_bins,
include_lowest=True)
df_copy['pct_of_season_played_bin'] = df_copy['pct_of_season_played_bin'].astype(str)
df_copy['winning_percentage_bin'] = pd.cut(df_copy['winning_percentage'], bins=winning_percentage_bins,
include_lowest=True)
df_copy['winning_percentage_bin'] = df_copy['winning_percentage_bin'].astype(str)
df_copy['games_and_win_pct'] = df_copy['pct_of_season_played_bin'] + '_' + df_copy['winning_percentage_bin']
return df_copy