In [6]:
import pandas as pd
from sklearn.linear_model import LinearRegression
# from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold

In [2]:
# read the sample dataset (relative path, so the CSV must sit in the working directory)
my_df = pd.read_csv(filepath_or_buffer = 'feature_selection_sample_data.csv')

# preview the first five rows to confirm the columns loaded as expected
my_df.head(n = 5)

Unnamed: 0,output,input1,input2,input3,input4
0,564,534,536,466,599
1,182,264,103,244,519
2,323,272,339,609,474
3,438,416,444,330,582
4,547,520,464,104,412


In [3]:
# separate the target (output) from the predictors (every remaining column)
y = my_df['output']
X = my_df.drop(columns = ['output']) # same result as drop(['output'], axis = 1)

### Simple Training/Test Split

In [4]:
# create training and testing data for a regression model (80% train / 20% test)
# random_state pins the shuffle so the same split is produced on every run of the notebook
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.2, shuffle = True, random_state = 42
)

# create training and testing data for a classification model (not applicable to this example)
# adding the stratify argument keeps the proportion of each output class (y) approximately equal in both sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y, random_state = 42)

### Cross-Validation Training/Test Split

In [7]:
# ordinary least squares estimator, used here only to demonstrate the cross-validation calls
# fit_intercept = True is the library default, spelled out for clarity
regressor = LinearRegression(fit_intercept = True)

In [13]:
# 4-fold cross-validation of the regressor, scored with r2
# an integer cv splits the rows in their existing order (the data is not shuffled),
# and the returned array holds one r2 value per validation fold
cv_scores = cross_val_score(
    estimator = regressor,
    X = X,
    y = y,
    scoring = 'r2',
    cv = 4,
)
cv_scores

array([0.78287124, 0.57838871, 0.45187443, 0.7384809 ])

In [14]:
# collapse the per-fold r2 values into a single average score
cv_scores.mean(axis = 0)

0.6379038172153191

In [15]:
# alternate cross validation method that allows data shuffling prior to split (preferred method)
# random_state fixes the shuffle so the fold assignment, and therefore the scores, are reproducible

# regression model (current example)
cv = KFold(n_splits = 4, shuffle = True, random_state = 42)
cv_scores_shuffled = cross_val_score(regressor, X, y, cv = cv, scoring = 'r2')
cv_scores_shuffled

# classification model
# cv = StratifiedKFold(n_splits = 4, shuffle = True, random_state = 42) # keeps the proportion of each output class similar across folds
# cv_scores_shuffled = cross_val_score(classifier, X, y, cv = cv, scoring = 'accuracy') # classifier would need to be instantiated
# cv_scores_shuffled

array([0.78649763, 0.68120856, 0.78749098, 0.72818783])

In [16]:
# collapse the r2 values from the shuffled folds into a single average score
cv_scores_shuffled.mean(axis = 0)

0.7458462518132294