# Excersice 4

Feature selection:

1. You need to load the UCI Arrhythmia dataset
https://archive.ics.uci.edu/ml/datasets/arrhythmia
* the problem is a multi-class classification

2. For comparing the result you need to use kNN (k=3), and compare based on the accuracy with the original dataset (with the all features).

2. Run three different types of feature selection methods (Univariate Statistics, model based, and Iterative Feature Selection).
* The original data set has 279 features.
* You need to run the methods for number_of_features= 20, 50, 100, 150, 200
* All the results should show in the same figure (At the end we have just one figure)

In [197]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_roc_curve
from sklearn.model_selection import LeaveOneOut
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error

import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Import libraries
%matplotlib inline

plt.rcParams["figure.figsize"] = [16, 9]

In [198]:
# Load dataset
url = "https://people.arcada.fi/~martinel/"
df = pd.read_csv(url + 'arrhythmia.data', header=None)

# Print shape of our data
print(df.shape)

# Replace missing '?' values by NaN
df = df.replace('?', np.NaN)

# Summarize the rows with missing values for each column
for i in df.columns:
	# Count number of rows with missing values
	n_miss = df[[i]].isnull().sum()
	perc = n_miss / df.shape[0] * 100
	if(perc[i] > 0): print('Column %d is missing %d values (%.1f%%)' % (i, n_miss, perc))

(452, 280)
Column 10 is missing 8 values (1.8%)
Column 11 is missing 22 values (4.9%)
Column 12 is missing 1 values (0.2%)
Column 13 is missing 376 values (83.2%)
Column 14 is missing 1 values (0.2%)


In [199]:
# Drop column 13 as it's full of missing values
df.drop(columns=[13], inplace=True)

# Separate features and target
X = df.iloc[:, :-1].values
y = df.iloc[:,-1].values

# Replace missing values by the mean
# Define imputer
imputer = SimpleImputer(strategy='mean')
# Fit on the dataset
imputer.fit(X)
# Transform the dataset
X = imputer.transform(X)

In [200]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Scaling the data (MIN MAX Scaling)
print('Shape of train {}, shape of test {}'.format(X_train.shape, X_test.shape))
MinMax = MinMaxScaler(feature_range=(0,1))
X_train = MinMax.fit_transform(X_train)
X_test = MinMax.transform(X_test)

Shape of train (339, 278), shape of test (113, 278)


In [194]:
## We are creating a grid for which all n_neighbors values are to be used for cross validation
param_grid={'weights':['distance', 'uniform'], 'n_neighbors':range(1,100)}

## Using Grid search for exhaustive searching

grid_search = GridSearchCV( KNeighborsClassifier(),param_grid, cv = 10)
grid_search.fit(X_train, y_train)



GridSearchCV(cv=10, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': range(1, 100),
                         'weights': ['distance', 'uniform']})

In [195]:
train_Rsquare = grid_search.score(X_train, y_train)
test_Rsquare = grid_search.score(X_test, y_test)
train_MSE = mean_squared_error(y_train, grid_search.predict(X_train))
test_MSE = mean_squared_error(y_test, grid_search.predict(X_test))
output = pd.DataFrame(index=None, columns=['model','train_Rsquare', 'test_Rsquare', 'train_MSE','test_MSE'])
output = output.append(pd.Series({'model':'KNN Classifier','train_Rsquare':train_Rsquare, 'test_Rsquare':test_Rsquare, 'train_MSE':train_MSE,'test_MSE':test_MSE}),ignore_index=True )
output

Unnamed: 0,model,train_Rsquare,test_Rsquare,train_MSE,test_MSE
0,KNN Classifier,1.0,0.548673,0.0,31.672566


In [201]:
from sklearn.feature_selection import VarianceThreshold

v_threshold = VarianceThreshold(threshold=0)
v_threshold.fit(X)
v_threshold.get_support()

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [203]:

# ANOVA feature selection for numeric input and categorical output
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
# generate dataset
X, y = make_classification(n_samples=100, n_features=20, n_informative=2)
# define feature selection
fs = SelectKBest(score_func=f_classif, k=20)
# apply feature selection
X_selected = fs.fit_transform(X, y)
print(X_selected.shape)

(100, 20)
