In [64]:
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.feature_selection import RFE

# 1. Recursive Feature Elimination (RFE)
Using the MPG dataset, let's see if we could predict the optimal features for a linear regression model when trying to predict the "mpg" or "miles per gallon".
NOTE: Miles per gallon is the common way in the US to measure the distance that a car could go per amount of fuel.

In [2]:
# Upload the data
data = sns.load_dataset('mpg')

# Take a look at the data
data.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino


In [3]:
# Look at the shape
data.shape

(398, 9)

In [4]:
# For the sake of this exercise, drop the "origin" and the "name" variables from our dataset
data = data.drop(["origin", "name"], axis=1)
data.dropna(axis=0, inplace=True)

In [5]:
# Once again, take a look at the new dataset
data.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year
0,18.0,8,307.0,130.0,3504,12.0,70
1,15.0,8,350.0,165.0,3693,11.5,70
2,18.0,8,318.0,150.0,3436,11.0,70
3,16.0,8,304.0,150.0,3433,12.0,70
4,17.0,8,302.0,140.0,3449,10.5,70


In [6]:
# Remember that the main thing that we need to do when using RFE is deciding how many features to use
from sklearn.model_selection import train_test_split

y = data.mpg
x = data.drop(["mpg"], axis=1)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=9)


### Let's do linear regression on all of the variables first so that we have a baseline to compare feature selection results with!

In [7]:
# Your code:
from sklearn.linear_model import LinearRegression

# Fit the model.
linear_model = LinearRegression().fit(x_train, y_train)

In [8]:
# Predict 
y_test_predict = linear_model.predict(x_test)

In [9]:
# Return the R2 score of the linear regression model.
linear_model.score(x_test, y_test)

0.8259555584834549

### Use RFE to check how many features to include

In [35]:
# Your code:
# Running the RFE with best 3 features to select
rfe = RFE(LinearRegression(), n_features_to_select=3).fit(x_train, y_train)

# Print out ranking of the features
print(rfe.ranking_)

# R2 score for the model with selected (3) features
rfe.score(x_test, y_test)



[1 2 4 3 1 1]


0.6994356944511378

### If we want to have an R2 value of above 80%, how many features do you think we should select?

Using your answer, run the RFE algorithm again and note down which features should be used.

In [34]:
# This line runs RFE with best 2, 3, 4 and 5 features and prints the R2 score for them
[print(f"R2 (n={n}) = {RFE(LinearRegression(), n_features_to_select=n).fit(x_train, y_train).score(x_test, y_test)}") for n in range(2,6)]

R2 (n=2) = 0.7000722849271235
R2 (n=3) = 0.6994356944511378
R2 (n=4) = 0.759098815033987
R2 (n=5) = 0.8260092069531465


[None, None, None, None]

# 2. Analysis of Variance (ANOVA)

Remember that while using ANOVA (as well as the Chi Squared Test and the Pearson Correlation Coefficient test), there is NO need to choose a machine learning model beforehand.

Knowing this, we're going to use the Iris dataset again to see if we could predict the categorical variable (species) using continuous variables, which is typically what ANOVA does.


In [36]:
from sklearn.feature_selection import SelectKBest, f_classif

# Load in the iris dataset.
data = sns.load_dataset('iris')
data.dropna(axis=0, inplace=True)
data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [37]:
y = data.species
x = data.drop(["species"], axis=1)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

### Using only the top 2 features and the SelectKBest function, use the fit() function to run this test.
Remember that you should only use this method on the x_train and y_train datasets!

In [51]:
# Your code:
KB_selector = SelectKBest(k=2).fit(x_train, y_train)

array([102.09508257,  51.65845745, 971.72082671, 821.17156381])

In [100]:
# Print the three most relevant variables from your results above.

# Fitting the SelectKBest with k=3
KB_selector = SelectKBest(k=3, score_func=f_classif).fit(x_train, y_train)

# Creating a DataFrame with scores
results = pd.DataFrame(KB_selector.scores_,  index=x_train.columns, columns=["score"])

# Getting them sorted and printed Top 3
results.sort_values("score", ascending=False).head(3)


Unnamed: 0,score
petal_length,971.720827
petal_width,821.171564
sepal_length,102.095083


### Let's see how these three variables perform in a machine learning context.
Plug these three variables into an SVM using scikit-learn

In [91]:
# Your code:
from sklearn.svm import SVC
from sklearn.metrics import plot_confusion_matrix

SVC_model = SVC().fit(KB_selector.transform(x_train), y_train)

In [88]:
# Get the accuracy score
SVC_model.score(KB_selector.transform(x_test), y_test)

0.9333333333333333

# 3. Chi-Squared Test
Now that we know how to do the ANOVA test, the Chi-Squared test is extremely similar in implementation. Remember that unlike the ANOVA test, the Chi-Squared Test should only be ran when you want to predict a categorical response with categorical columns!

With this in mind, let's use the titanic dataset again to see if we could predict the categorical variable "alive".

In [104]:
data = sns.load_dataset('titanic')
data.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [105]:
# For the sake of this exercise, we're only going to use these 5 categorical columns.
data.dropna(axis=0, inplace=True)
data = data[["sex", "embarked", "adult_male", "alive", "alone"]]
data.head()

Unnamed: 0,sex,embarked,adult_male,alive,alone
1,female,C,False,yes,False
3,female,S,False,yes,False
6,male,S,True,no,True
10,female,S,False,yes,False
11,female,S,False,yes,True


### Transform the strings to number values.
Note that these number values could be literally anything and the actual value does not matter!


In [112]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import chi2

features = ["sex", "embarked", "adult_male", "alone"]
target = ["alive"]

X = data[features]
y = data[target]

X = OrdinalEncoder().fit_transform(X)
y = OrdinalEncoder().fit_transform(y)

### Out of these 5 variables, what are the two most important variables?

In [124]:
# Your code:
chi, p = chi2(X, y)

# Creating a DataFrame with scores
df_chi2 = pd.DataFrame(chi,  index=features, columns=["chi2"])

df_chi2.sort_values(by="chi2", ascending=False).head(2)

Unnamed: 0,chi2
adult_male,32.259252
sex,24.642385


In [126]:
# Print the two most relevant variables from your results above.

# Fitting the SelectKBest with k=2
KB_selector = SelectKBest(k=2, score_func=chi2).fit(X, y)

# Creating a DataFrame with scores
results = pd.DataFrame(KB_selector.scores_,  index=features, columns=["score"])

# Getting them sorted and printed Top 3
results.sort_values("score", ascending=False).head(2)

Unnamed: 0,score
adult_male,32.259252
sex,24.642385


# 4. Pearson Correlation Coefficient

Now that we tried both ANOVA and the Chi-Squared Test, let's see how the Pearson Correlation Coefficient works! For this, we're going to use the car_crashes dataset!

In [127]:
# Load in the dataset
data = sns.load_dataset('car_crashes')

# Drop the NA values
data.dropna(axis=0, inplace=True)

# # Drop the categorical column for this exercise
data = data.drop(['abbrev'], axis=1)

# View the data
data.head()

Unnamed: 0,total,speeding,alcohol,not_distracted,no_previous,ins_premium,ins_losses
0,18.8,7.332,5.64,18.048,15.04,784.55,145.08
1,18.1,7.421,4.525,16.29,17.014,1053.48,133.93
2,18.6,6.51,5.208,15.624,17.856,899.47,110.35
3,22.4,4.032,5.824,21.056,21.28,827.34,142.39
4,12.0,4.2,3.36,10.92,10.68,878.41,165.63


In [128]:
# Split the dataset by using "total" as the response/dependent variable and the rest as independent variables.
y = data.total
x = data.drop(["total"], axis=1)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=100)

In [129]:
# Print the correlation matrix of x_train
x_train.corr()

Unnamed: 0,speeding,alcohol,not_distracted,no_previous,ins_premium,ins_losses
speeding,1.0,0.695562,0.592521,0.596794,-0.02587,-0.029831
alcohol,0.695562,1.0,0.727611,0.79417,-0.160553,-0.056565
not_distracted,0.592521,0.727611,1.0,0.736233,-0.153648,-0.06893
no_previous,0.596794,0.79417,0.736233,1.0,-0.127456,0.059174
ins_premium,-0.02587,-0.160553,-0.153648,-0.127456,1.0,0.643236
ins_losses,-0.029831,-0.056565,-0.06893,0.059174,0.643236,1.0


### Name the pairs of variables that are correlated more than 72%

In [141]:
# Your code:
c_matrix = x_train.corr()>0.72

for c in c_matrix.columns:
    for i in c_matrix.index:
        if c_matrix[c][i]:
            if c!=i:
                print(f"{c}, {i}")

alcohol, not_distracted
alcohol, no_previous
not_distracted, alcohol
not_distracted, no_previous
no_previous, alcohol
no_previous, not_distracted


### Using your answer to above, notice that there are three variables that are correlated with one another. 
Choose the one that comes first when put in alphabetical order and remove the other two variables from the dataset. Print the new correlation matrix of x_train.

In [145]:
# Your code:
x_train.drop(columns=["not_distracted", "no_previous"]).corr()

Unnamed: 0,speeding,alcohol,ins_premium,ins_losses
speeding,1.0,0.695562,-0.02587,-0.029831
alcohol,0.695562,1.0,-0.160553,-0.056565
ins_premium,-0.02587,-0.160553,1.0,0.643236
ins_losses,-0.029831,-0.056565,0.643236,1.0
