# CSCE 623 SP 2020 Assignment 4
## LASTNAME, FIRSTNAME
## YYYYMMDD


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import itertools
import copy


from sklearn.preprocessing import scale 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import train_test_split


pd.set_option('display.notebook_repr_html', False)

%matplotlib inline
plt.style.use('seaborn-white')

import warnings
warnings.filterwarnings('ignore')
#warnings.filterwarnings(action='once')

# Part A:  Data setup and exploration

## STEP 1:  Load, clean, split, explore, and transform the data to prepare it for machine learning.  

In [None]:
# Read the Hitters data, using the first CSV column as the row index
df = pd.read_csv('ISLR_Hitters.csv', index_col=0)
# Discard every row that contains a missing value
df = df.dropna()
df.index.name = 'Player'

In [None]:
# display the names of the columns, with their non-null counts and dtypes
df.info()

### Clean the data
1. Separate the prediction (y) from the features (X)
1. League, Division and NewLeague are categories which should be converted to dummy (one-hot) features

In [None]:
# The response (what we predict) is Salary
y = df.Salary

# create the dummy variables
# Cast to float64 explicitly: depending on the pandas version get_dummies returns
# uint8 or bool columns, and bool columns are left out of describe() and give X mixed dtypes.
dummies = pd.get_dummies(df[['League', 'Division', 'NewLeague']]).astype('float64')
dummies.info()  #confirm existence of dummies and auto-generated names

# Drop the column with the response variable (Salary), and columns for which we created dummy variables from categorical features
X_ = df.drop(['Salary', 'League', 'Division', 'NewLeague'], axis=1).astype('float64')


#since each dummy includes only 2 categories, use a single category to encode all info
X = pd.concat([X_, dummies[['League_N', 'Division_W', 'NewLeague_N']]], axis=1)


X.info()  #confirm existence of dummies and auto-generated names


### Split data into test and non-test sets

In [None]:
# Hold out 33% of the rows as the test set; random_state=42 fixes the shuffle so the split is reproducible
X_nonTest, X_test, y_nonTest, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

### Explore the data & make prediction about features

## 1.0  Data exploration
Note that exploration can be done on the non-scaled data

In [None]:
# Column names, non-null counts and dtypes of the non-test features
X_nonTest.info()

In [None]:
# Summary statistics of the non-test features
X_nonTest.describe()

In [None]:
# First rows of the non-test features (head() defaults to 5 rows)
X_nonTest.head()

Confirm that the categorical variable conversion to 1-hot features worked - and explore

In [None]:
# First 5 rows again; inspect the League_N, Division_W and NewLeague_N dummy columns
X_nonTest.head(5)

In [None]:
# when exploring the relationship between the features and the response, use a combined set
# (the non-test features with the non-test Salary column appended)
xy_nontest = pd.concat([X_nonTest, y_nonTest], axis=1)

In [None]:
# NOTE - THIS MAY TAKE A FEW MOMENTS TO CALCULATE & DISPLAY ON YOUR COMPUTER

# Change False to True below to show the pairplot of every column in xy_nontest.
if False:
    sns.pairplot(xy_nontest)
    


In [None]:
# Correlation table to help identify features which might be encoding the same information
# magnitude is important: pairs with a coefficient close to 1 or close to -1 are more correlated with each other

corr = xy_nontest.corr()
# colour each cell by its value using the 'coolwarm' colormap
corr.style.background_gradient(cmap='coolwarm')

### Prediction of Top-6 Features
List the features that you think will be most important for predicting salary

STUDENT ANSWER:  INSERT YOUR HYPOTHESIS HERE 


END STUDENT ANSWER

### Scale all X data using nonTest data scaler
* Scale the data features, since we don't want some features to affect the linear regressions differently just because they have different scales
* Fit the scaler to the non-test data, then apply those fitted parameters to the test data, so that the test data is scaled consistently without influencing the scaling parameters

In [None]:
scaler = StandardScaler()
scaler.fit(X_nonTest) # once we fit a scaler to the non-test data it can be used later to scale the test data without looking at the test data

# non-test data scaled using non-test data mean and var.
# Keep the original Player index: without it the frame gets a fresh 0..n-1 index and
# no longer lines up label-wise with y_nonTest (e.g. in pd.concat or arithmetic).
X_nonTestScaled = pd.DataFrame(
    scaler.transform(X_nonTest),
    columns=X_nonTest.columns,
    index=X_nonTest.index,
)

# test data scaled using non-test data mean and var (prevents "learning" about the test set).
# Note: this stays a plain NumPy array, in the same column order as X_nonTest.columns.
X_testScaled = scaler.transform(X_test)

print(X_nonTest.columns)

###  Explore, then transform the Y data using a natural-log transform
Notice that the original Y values are far from normally distributed

In [None]:
# Histogram of the raw non-test Salary values (25 bins); the trailing ';' hides the returned arrays in the notebook output
plt.hist(y_nonTest,bins=25);

The response variable, Salary, was noticed to have a non-normal distribution. Using this heavily skewed distribution could negatively affect any model-fitting using a least-squares based regression due to the assumption of normality required on the error term. Therefore, a natural log transform was performed on Salary, yielding a distribution that, while not perfectly normal, is much less heavily skewed and more likely to not violate the normally distributed error assumption. 

In [None]:
# Natural-log transform of the non-test response (only y_nonTest is transformed in this cell)
y_nonTestScaled = np.log(y_nonTest)
# Histogram of the log-transformed values, using the same 25 bins as before for comparison
plt.hist(y_nonTestScaled,bins=25);

Although the new Y values after scaling are not as skewed, they are still not "normally distributed"

Next, build a helper function that takes true values and predictions in log space and computes the MSE after transforming both back to the original data space.  The scorer built from this function will be used to evaluate candidate models while searching for good parameters.
All $MSE$ values are reported in the original data space.

In [None]:
def transform_mse(y_true, y_pred):
    """Return the mean squared error measured in the original (un-logged) data space.

    Both arguments are taken to be in natural-log space; each is mapped back
    with ``np.exp`` before the squared differences are averaged.

    Parameters
    ----------
    y_true : array-like
        Log-space target values.
    y_pred : array-like
        Log-space predictions.  A column vector of shape (n, 1) is accepted
        and treated as n predictions.

    Returns
    -------
    float
        Mean of the squared differences ``exp(y_true) - exp(y_pred)``.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of values.
    """
    # Work on flat NumPy arrays: a (n,) target minus a (n, 1) prediction would
    # otherwise broadcast to an (n, n) matrix, and pandas objects would be
    # aligned by index label instead of by position.
    true_vals = np.exp(np.asarray(y_true, dtype=float)).ravel()
    pred_vals = np.exp(np.asarray(y_pred, dtype=float)).ravel()
    if true_vals.shape != pred_vals.shape:
        raise ValueError(
            "y_true and y_pred must contain the same number of values "
            f"(got {true_vals.size} and {pred_vals.size})"
        )
    ydiff = true_vals - pred_vals
    return np.mean(ydiff ** 2)

# Wrap transform_mse as a scikit-learn scorer.
# greater_is_better=False marks it as a loss, so scikit-learn reports the NEGATED value (scores are -MSE).
dataspace_mse = make_scorer(transform_mse, greater_is_better=False)  #this scorer can be used by the model

# Part B:  Best Subset Selection: Determining the *Best* model features for each size linear regression model

Note:  the number of features is set to 3 here to reduce run time during testing.  Higher settings take significantly longer.
Compared with num_features=3, run times grow by approximately:
* 4: 4x longer
* 5: 12x longer
* 6: 28x longer

In [None]:
# Number of features to use in the subset searches below
num_features = 3   #  Only increase this beyond 3 if you have time to wait

## STEP 2:  Write Function For Best-Subset

In [None]:
def bestSubset(X_nonTest,y_nonTest, k):
    """Template stub for best-subset selection; the body is to be supplied by the student.

    As provided, nothing is computed and the placeholders are returned unchanged.

    Parameters
    ----------
    X_nonTest : presumably the non-test feature data -- TODO confirm expected scaling
    y_nonTest : presumably the non-test response -- TODO confirm expected transform
    k : presumably the number of features in each candidate subset -- TODO confirm

    Returns
    -------
    tuple
        ``(kfeatures, kMSE)``; ``([], np.inf)`` until student code replaces them.
    """
    kfeatures = []  #placeholder
    # placeholder: use +inf (as LASSOSubset does) so that any real MSE compares as
    # smaller; a finite sentinel such as 1e8 could be beaten by a poor model's MSE
    kMSE = np.inf
    
    ##########################
    # INSERT STUDENT CODE HERE
    
   
    # END STUDENT CODE
    ##########################
    
    return kfeatures, kMSE 

## STEP 3:  Execute best subset

In [None]:
##########################
# INSERT STUDENT CODE HERE


    
# END STUDENT CODE
##########################

## STEP 4:  Plot of Avg Crossval MSE for 6 best models with annotated best point

In [None]:
##########################
# INSERT STUDENT CODE HERE


# END STUDENT CODE
##########################

## STEP 5:  Discussion of best subset models

STUDENT RESPONSE BELOW:


END STUDENT RESPONSE

## STEP 6:  Write Function For Forward Stepwise Subset Selection

In [None]:
def forwardStepwiseSubset(X_nonTest,y_nonTest, k):
    """Template stub for forward stepwise subset selection; the body is to be supplied by the student.

    As provided, nothing is computed and the placeholders are returned unchanged.

    Parameters
    ----------
    X_nonTest : presumably the non-test feature data -- TODO confirm expected scaling
    y_nonTest : presumably the non-test response -- TODO confirm expected transform
    k : presumably the largest model size to build -- TODO confirm

    Returns
    -------
    tuple
        ``(kfeatures, kMSE)``; both are empty lists until student code fills them.
    """
    kfeatures = []  #placeholder
    kMSE = []  #placeholder
    
    ##########################
    # INSERT STUDENT CODE HERE
    
    
    # END STUDENT CODE
    ##########################
    
    return (kfeatures, kMSE) 

## STEP 7:  Execute Forward subset

In [None]:
##########################
# INSERT STUDENT CODE HERE




# END STUDENT CODE
##########################

## STEP 8:  Updated Plot of Avg Crossval MSE & best points

In [None]:
##########################
# INSERT STUDENT CODE HERE


# END STUDENT CODE
##########################

## STEP 9:  Discussion of best forward-stepwise models 

STUDENT RESPONSE BELOW:



END STUDENT RESPONSE

## STEP 10: Discussion and comparison of subset selection methods

STUDENT RESPONSE BELOW:



END STUDENT RESPONSE

## STEP 11: Write a function to execute LASSO

In [None]:
def LASSOSubset(X_nonTest,y_nonTest, k):
    """Template stub for LASSO-based feature selection; the body is to be supplied by the student.

    As provided, nothing is fitted: ``alphas`` is built but not used, and the
    placeholders are returned unchanged.

    Parameters
    ----------
    X_nonTest : presumably the non-test feature data -- TODO confirm expected scaling
    y_nonTest : presumably the non-test response -- TODO confirm expected transform
    k : presumably the target number of selected features -- TODO confirm

    Returns
    -------
    tuple
        ``(kfeatures, kMSE, kalpha)``; ``([], np.inf, 0)`` until student code replaces them.
    """
    #example of logarithmically spaced alphas for LASSO... may need tweaking
    # 300 candidate values from 1e-3 to 1e10
    alphas = np.logspace(-3, 10,num=300)
    kfeatures = [] #placeholder
    kMSE=np.inf  #placeholder
    kalpha = 0  #placeholder

    ##########################
    # INSERT STUDENT CODE HERE



    # END STUDENT CODE
    ##########################
        
    return (kfeatures, kMSE, kalpha)     

## STEP 12: Execute LASSO

In [None]:
##########################
# INSERT STUDENT CODE HERE

    
# END STUDENT CODE
##########################

## STEP 13: Updated Plot of Avg Crossval MSE & best points

In [None]:
##########################
# INSERT STUDENT CODE HERE


# END STUDENT CODE
##########################

## STEP 14: Discussion of best LASSO models 

STUDENT RESPONSE BELOW:


END STUDENT RESPONSE

## STEP 15: Answer Customer Questions

## a

STUDENT RESPONSE BELOW:


END STUDENT RESPONSE

## b

STUDENT RESPONSE BELOW:



END STUDENT RESPONSE

# END OF ASSIGNMENT