## Introduction

This notebook looks at the relative importance of normalised protein expression by Updrs using:

* Stepwise Regression
* Decision Trees
* Random Forests
* Mutual Information Feature Selection
* Recursive Feature Elimination (RFE)
* XGBoost Regression Feature Importance
* Permutation Feature Importance

The results are summarised in /kaggle/input/amp-pd-protein-importance

In [2]:
import os
import sys

import shutil
import random
import warnings

warnings.filterwarnings("ignore") # , category=UserWarning

from decimal import *
from pathlib import Path
import glob

from sklearn import datasets
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.multioutput import MultiOutputRegressor
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.tree import plot_tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.inspection import permutation_importance

import statsmodels.api as sm
from stepwise_regression import step_reg
import xgboost as xgb
from xgboost import XGBRegressor

import matplotlib.pyplot as plt
import seaborn as sns

from fastai.data.all import *
from fastai.tabular.all import *
from fastai.vision.all import *
from fastai.callback.fp16 import *
from fastai.vision.widgets import *

In [3]:
from scipy.cluster import hierarchy as hc

def cluster_columns(df, figsize=(10,6), font_size=12):
    """Show a dendrogram of the columns of `df`, grouped by Spearman rank correlation.

    The rank-correlation matrix (rounded to 4 decimals) is converted to a
    distance matrix as ``1 - correlation`` and clustered with average linkage.

    Args:
        df: frame whose columns are clustered; column names become leaf labels.
        figsize: size of the matplotlib figure.
        font_size: font size of the leaf labels.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    spearman = scipy.stats.spearmanr(df)
    rank_corr = np.round(spearman.correlation, 4)
    # squareform condenses the symmetric distance matrix into the vector form linkage() takes.
    distances = hc.distance.squareform(1 - rank_corr)
    linkage_matrix = hc.linkage(distances, method='average')
    plt.figure(figsize=figsize)
    hc.dendrogram(
        linkage_matrix,
        labels=df.columns,
        orientation='left',
        leaf_font_size=font_size,
    )
    plt.show()

In [4]:
# Move the working directory two levels up; presumably this is where the relative
# data paths used by the loading cell resolve from — confirm for your layout.
# NOTE(review): the target is computed from the *current* directory, so re-running
# this cell moves up another two levels each time. Run it once per kernel session.
os.chdir(Path.cwd().parent.parent)

In [5]:
# --- Clinical scores -------------------------------------------------------
train_clinical = pd.read_csv('amp-parkinsons-disease-progression-prediction/train_clinical_data.csv')
test_clinical = pd.read_csv('amp-parkinsons-disease-progression-prediction/example_test_files/test.csv')
# The medication-state column is not used in this analysis.
train_clinical_drop = train_clinical.drop(columns='upd23b_clinical_state_on_medication')

# --- Protein expression ----------------------------------------------------
train_proteins = pd.read_csv('amp-parkinsons-disease-progression-prediction/train_proteins.csv')
test_proteins = pd.read_csv('amp-parkinsons-disease-progression-prediction/example_test_files/test_proteins.csv')
# The visit/patient keys are handled as strings for the later pivot and merge.
train_proteins = train_proteins.astype({'visit_month': str, 'patient_id': str})

# --- Peptide abundances ----------------------------------------------------
train_peptides = pd.read_csv('amp-parkinsons-disease-progression-prediction/train_peptides.csv')
test_peptides = pd.read_csv('amp-parkinsons-disease-progression-prediction/example_test_files/test_peptides.csv')

In [6]:
# Reshape the long protein table to one row per visit and one column per UniProt id.
proteins_wide=pd.pivot(train_proteins, index=['visit_id','visit_month','patient_id'],columns='UniProt',values='NPX')
proteins_wide=proteins_wide.rename_axis(None, axis=1).reset_index()

# Impute missing NPX values with the per-protein median.
# numeric_only=True is required because the key columns (visit_id, visit_month,
# patient_id) are strings: pandas >= 2.0 raises TypeError when asked for their median.
proteins_wide=proteins_wide.fillna(proteins_wide.median(numeric_only=True))

# Cast the clinical keys to str so they match the protein keys in the merge below.
train_clinical_drop['visit_month']=train_clinical_drop['visit_month'].astype(str)
train_clinical_drop['patient_id']=train_clinical_drop['patient_id'].astype(str)

# Left merge: every clinical visit is kept; visits without protein data get NaN proteins.
train_clinical_proteins= pd.merge(train_clinical_drop,proteins_wide,how='left',on=['visit_id','patient_id','visit_month'])
#print(train_clinical_proteins.shape)
#train_clinical_proteins.head()

## Updrs1

In [None]:
# Modelling frame for updrs_1: target in the first column, proteins after it.
df_updrs_1=train_clinical_proteins
# Drop visits that have no protein measurements (NaN proteins after the left merge).
df_updrs_1=df_updrs_1.dropna(subset=['Q99435', 'Q99674', 'Q99683', 'Q99829', 'Q99832'])
# Drop visits without a target score, as the updrs_3 / updrs_4 sections do;
# the estimators used below cannot be fitted on a NaN target.
df_updrs_1=df_updrs_1.dropna(subset=['updrs_1'], axis = 0)
df_updrs_1=df_updrs_1.drop(['visit_id','patient_id','visit_month','updrs_2','updrs_3','updrs_4'],axis=1)
# numeric_only=True: proteins_wide still carries string key columns, whose median
# raises TypeError on pandas >= 2.0.
df_updrs_1=df_updrs_1.fillna(proteins_wide.median(numeric_only=True))
#print(df_updrs_1.shape)
#df_updrs_1.head()

In [None]:
# Single-protein view (O14773 vs. updrs_1) for the optional scatter plot below.
X=df_updrs_1['O14773']
# Select the target by name; selecting it by position silently breaks if the column order changes.
y=df_updrs_1['updrs_1']
#plt.scatter(X, y)
#plt.show()

## Feature Importance

#### [Why, How and When to apply Feature Selection. (Sudharsan Asaithambi)](https://towardsdatascience.com/why-how-and-when-to-apply-feature-selection-e9c69adfabf2#:~:text=Tree%20based%20models%20calculates%20feature,calculating%20the%20best%20predictive%20feature.&text=The%20feature%20importance%20in%20tree,Entropy%20or%20Chi-Square%20value.)

#### [How to Calculate Feature Importance With Python. (Jason Brownlee)](https://machinelearningmastery.com/calculate-feature-importance-with-python/)

#### [Jason Brownlee](https://machinelearningmastery.com/feature-selection-with-real-and-categorical-data/#:~:text=Feature%20selection%20is%20the%20process,the%20performance%20of%20the%20model.)
![](https://machinelearningmastery.com/wp-content/uploads/2019/11/Overview-of-Feature-Selection-Techniques3.png)


In [None]:
# Features: every column except the target. Target: selected by name rather than
# by position, so a change in column order cannot silently swap it for a protein.
X=df_updrs_1.drop('updrs_1', axis=1)
y=df_updrs_1['updrs_1']

# Train/test set generation (70/30, fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=15
)


### Use stepwise regression to identify significant proteins

**[A Convenient Stepwise Regression Package to Help You Select Features in Python](https://medium.com/@shouke.wei/a-convenient-stepwise-regression-package-in-python-1b75e4c0d04e)**

In [None]:
# X = sm.add_constant(X)   (disabled: no intercept column is added to X)

# NOTE(review): `model` is constructed but never fitted or read anywhere in this
# notebook — it looks like a leftover; confirm before removing.
model = sm.OLS(y, X)

# Backward stepwise selection over all columns of X.
# 0.05 is presumably the p-value threshold for removing a feature — confirm
# against the stepwise_regression package documentation.
backselect = step_reg.backward_regression(X, y, 0.05,verbose=False)
# One-column frame holding the selection result, labelled with the target name.
proteins_sig=pd.DataFrame(backselect,columns=['updrs_1'])

In [None]:
#proteins_sig.to_csv('stepwise.csv')

#### [The easiest way for getting feature names after running SelectKBest in Scikit Learn](https://stackoverflow.com/questions/39839112/the-easiest-way-for-getting-feature-names-after-running-selectkbest-in-scikit-le)

In [None]:
# Univariate screen: keep the 40 columns of X with the highest f_regression score.
fs = SelectKBest(f_regression, k=40)
X_new = fs.fit_transform(X, y)

# Boolean mask over X's columns marking the selected features.
selected = fs.get_support()
names = X.columns.values[selected]
scores = fs.scores_[selected]
names_scores = [(name, score) for name, score in zip(names, scores)]
ns_df = pd.DataFrame(names_scores, columns=['feat_names', 'F_Scores'])

# Highest score first; ties are ordered alphabetically by feature name.
ns_df_sorted = ns_df.sort_values(
    by=['F_Scores', 'feat_names'], ascending=[False, True]
)
#ns_df_sorted

In [None]:
# Write the sorted updrs_1 F-score table to the working directory.
# NOTE: the updrs_4 section later writes to the same filename, so on a full
# run this file ends up holding the updrs_4 table.
ns_df_sorted.to_csv('select_k_best.csv')

In [None]:
ns_df_sorted.plot.bar(x='feat_names',y='F_Scores',figsize=(15,8))
#plt.show()

### Creating a Decision Tree
[09_tabular](https://github.com/fastai/fastbook/blob/master/09_tabular.ipynb)<br>
[How to Calculate Feature Importance With Python](https://machinelearningmastery.com/calculate-feature-importance-with-python/)

In [None]:
# Rebuild features/target for the tree models. The target is selected by name
# rather than by position so that column order cannot change what is predicted.
X=df_updrs_1.drop('updrs_1', axis=1)
y=df_updrs_1['updrs_1']

# Train/test set generation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=15
)

In [None]:
# Small regression tree (at most 20 leaves, at least 25 rows per leaf) fitted on all rows.
m=DecisionTreeRegressor(max_leaf_nodes=20, min_samples_leaf=25)
m.fit(X, y)

fn = X.columns.tolist()

# Pair each importance with the column names of X — the frame the tree was
# actually fitted on — instead of X_train, then rank most important first.
ranked = sorted(zip(X.columns, m.feature_importances_), key=lambda x:x[1],reverse=True)
feat_dict={}
for col, val in ranked:
    feat_dict[col]=val

In [None]:
feat_df = pd.DataFrame({'Feature':feat_dict.keys(),'Importance':feat_dict.values()})
#feat_df.head()

In [None]:
feat_df

#feat_df.to_csv('decision_tree.csv')

In [None]:
# Open a large figure for the tree plot. The plotting call below is disabled,
# so as written this cell only creates an empty figure.
plt.figure(figsize=(20,12))

# (disabled) draw the fitted tree, labelling the splits with the protein names:
#tree.plot_tree(m,
#               feature_names=fn,
#               filled=True)

**Decision Tree is very sensitive to initial conditions and value of "random_state" above**

In [None]:
def r_mse(pred,y):
    """Return the root-mean-squared error between `pred` and `y`, rounded to 6 decimals."""
    squared_error = (pred - y) ** 2
    return round(math.sqrt(squared_error.mean()), 6)

def m_rmse(m, X, y):
    """Return the RMSE of model `m`'s predictions for `X` against the targets `y`."""
    predictions = m.predict(X)
    return r_mse(predictions, y)

# RMSE of the tree on the data it was fitted on, then its leaf count next to the row count.
# NOTE(review): the first and last lines evaluate the same model on the same (X, y)
# and therefore print the same number; one of them was presumably meant to use a
# held-out split — confirm.
print(m_rmse(m,X, y))
print(m.get_n_leaves(),len(X))
print(m_rmse(m,X, y))

### Random Forest

In [None]:
def rf(X, y, n_estimators=50, max_samples=100,
       max_features=0.5, min_samples_leaf=25, **kwargs):
    """Fit and return a RandomForestRegressor on (X, y).

    Args:
        X: feature matrix.
        y: target values.
        n_estimators: number of trees.
        max_samples: number of rows drawn to train each tree.
        max_features: fraction of features considered at each split.
        min_samples_leaf: minimum number of rows in a leaf.
        **kwargs: further RandomForestRegressor keyword arguments
            (e.g. ``random_state``). These used to be accepted and silently
            discarded; they are now forwarded to the estimator.

    Returns:
        The fitted RandomForestRegressor (all cores, out-of-bag score enabled
        unless overridden through ``kwargs``).
    """
    # setdefault keeps the original defaults while letting callers override them
    # without triggering a duplicate-keyword TypeError.
    kwargs.setdefault('n_jobs', -1)
    kwargs.setdefault('oob_score', True)
    return RandomForestRegressor(n_estimators=n_estimators,
        max_samples=max_samples, max_features=max_features,
        min_samples_leaf=min_samples_leaf, **kwargs).fit(X, y)

In [None]:
# Random forest fitted on all rows of X.
m=rf(X,y);

# Label the importances with X's columns (the frame the forest was fitted on,
# not X_train) and rank most important first.
ranked = sorted(zip(X.columns, m.feature_importances_), key=lambda x:x[1],reverse=True)
feat_dict={}
for col, val in ranked:
    feat_dict[col]=val
    
feat_rf_df = pd.DataFrame({'Feature':feat_dict.keys(),'Importance':feat_dict.values()})
#feat_rf_df

In [None]:
#feat_rf_df.to_csv('random_forest.csv')

In [None]:
# RMSE of the random forest on the data it was fitted on.
# NOTE(review): both tuple elements are the same call on the same (X, y);
# the second was presumably meant to use a held-out split — confirm.
m_rmse(m,X,y), m_rmse(m,X, y)

### Feature importance
(returns the same result as Random Forest)

In [None]:
def rf_feat_importance(m,df):
    """Return a frame of feature names and importances, most important first.

    Args:
        m: fitted model exposing ``feature_importances_``.
        df: frame whose columns correspond, in order, to those importances.

    Returns:
        DataFrame with columns ``cols`` and ``imp``, sorted by ``imp`` descending.
    """
    importances = pd.DataFrame({'cols': df.columns, 'imp': m.feature_importances_})
    return importances.sort_values(by='imp', ascending=False)

In [None]:
fi=rf_feat_importance(m,X)
# fi[:20]

In [None]:
# Write the random-forest importance table for the updrs_1 model to the working directory.
fi.to_csv('feature_importance.csv')

In [None]:
def plot_fi(fi):
    """Plot importances (`imp`) per feature (`cols`) as a horizontal bar chart and return the plot result."""
    return fi.plot(x='cols', y='imp', kind='barh', figsize=(12,7), legend=False)

# plot_fi(fi[:20]);

### Removing low importance proteins

In [None]:
# Names of the proteins whose random-forest importance exceeds 0.015.
to_keep = fi.loc[fi['imp'] > 0.015, 'cols']
len(to_keep)

In [None]:
# Restrict the feature matrix to the proteins that passed the importance threshold.
# NOTE(review): valid_xs_imp is built from the same X as xs_imp, so the two frames
# are identical; a held-out frame was presumably intended here — confirm.
xs_imp = X[to_keep]
valid_xs_imp = X[to_keep]

In [None]:
m = rf(xs_imp, y)

### Removing redundant features

In [None]:
# cluster_columns(xs_imp)

### Mutual Information Feature Selection

In [None]:
# Univariate screen: keep the 40 columns of X with the highest mutual information with y.
fs = SelectKBest(mutual_info_regression, k=40)
X_new = fs.fit_transform(X, y)

# Boolean mask over X's columns marking the selected features.
selected = fs.get_support()
names = X.columns.values[selected]
scores = fs.scores_[selected]
names_scores = [(name, score) for name, score in zip(names, scores)]
ns_df = pd.DataFrame(names_scores, columns=['proteins', 'MI_Scores'])

# Highest score first; ties are ordered alphabetically by protein name.
ns_df_sorted = ns_df.sort_values(
    by=['MI_Scores', 'proteins'], ascending=[False, True]
)
# ns_df_sorted

In [None]:
#ns_df_sorted.to_csv('mutual_information.csv')

In [None]:
ns_df_sorted.plot.bar(x='proteins',y='MI_Scores',figsize=(15,8))
# plt.show()

### [Feature Selection with Recursive Feature Elimination (RFE)](https://www.kaggle.com/code/julianmacnamara/feature-selection/edit)

In [None]:
# Fresh features/target/split for the RFE section. The target is selected by
# name rather than by position so that column order cannot change what is predicted.
X=df_updrs_1.drop('updrs_1', axis=1)
y=df_updrs_1['updrs_1']

# Train/test set generation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=15
)

#### [NumPy: ravel() function](https://www.w3resource.com/numpy/manipulation/ravel.php)

In [None]:
# Standardise both splits with the mean/std learned from the training split only.
# Fitting a second StandardScaler on X_test would scale the two splits differently,
# so the model would be scored on features that do not match what it was trained on.
scaler = StandardScaler().fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

# Fix the dimensions of the target array
y_train = y_train.values.reshape(-1, 1)
y_test = y_test.values.reshape(-1, 1)

# Init, fit and score a baseline random forest on all features
forest = RandomForestRegressor()
_ = forest.fit(X_train_std, y_train.ravel())
forest.score(X_test_std, y_test)

# Importance of every feature in the baseline forest, smallest first
protein_feature_complete = pd.DataFrame(
    zip(X_train.columns, abs(forest.feature_importances_)),
    columns=["feature", "weight"],
).sort_values("weight").reset_index(drop=True)

# Init the transformer: eliminate 10 features per round until 20 remain
rfe = RFE(estimator=RandomForestRegressor(), n_features_to_select=20, step=10)

# Fit to the training data
_ = rfe.fit(X_train_std, y_train.ravel())

In [None]:
# X_train.loc[:, rfe.support_]

In [None]:
# Columns of the unscaled training frame that RFE retained
# (rfe.support_ is used here as a column selector).
df_updrs_1_pro=X_train.loc[:, rfe.support_]

In [None]:
# Init, fit, score
forest = RandomForestRegressor()
_ = forest.fit(rfe.transform(X_train_std), y_train.ravel())
forest.score(rfe.transform(X_test_std), y_test)

In [None]:
# df_updrs_1_pro.head()

In [None]:
list_updrs_1_pro=df_updrs_1_pro.columns.to_list()

In [None]:
list_updrs_1_pro=pd.Series(list_updrs_1_pro)
# list_updrs_1_pro

In [None]:
#list_updrs_1_pro.to_csv('recursive_feature_elimination.csv')

### XGBoost Regression Feature Importance

In [None]:
# Fresh features/target/split for the XGBoost section. The target is selected by
# name rather than by position so that column order cannot change what is predicted.
X=df_updrs_1.drop('updrs_1', axis=1)
y=df_updrs_1['updrs_1']

# Train/test set generation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=15
)

In [None]:
# Standardise both splits with the mean/std learned from the training split only;
# a separate scaler fitted on X_test would put the two splits on different scales.
scaler = StandardScaler().fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

# Fix the dimensions of the target array
y_train = y_train.values.reshape(-1, 1)
y_test = y_test.values.reshape(-1, 1)

In [None]:
# Gradient-boosted trees fitted on all rows of the unscaled X.
m = XGBRegressor()
m.fit(X, y)

# Label the importances with X's columns (the frame the model was fitted on,
# not X_train) and rank most important first.
ranked = sorted(zip(X.columns, m.feature_importances_), key=lambda x:x[1],reverse=True)
feat_dict={}
for col, val in ranked:
    feat_dict[col]=val

In [None]:
feat_xgb_df = pd.DataFrame({'Feature':feat_dict.keys(),'Importance':feat_dict.values()})
# feat_xgb_df

In [None]:
#feat_xgb_df.to_csv('xgb.csv')

### Permutation Feature Importance for Regression

[Permutation feature importance](https://scikit-learn.org/stable/modules/permutation_importance.html) is a technique for calculating relative importance scores that is independent of the model used.

First, a model is fit on the dataset, such as a model that does not support native feature importance scores. Then the model is used to make predictions on a dataset, although the values of a feature (column) in the dataset are scrambled. This is repeated for each feature in the dataset. Then this whole process is repeated 3, 5, 10 or more times. The result is a mean importance score for each input feature (and distribution of scores given the repeats).

This approach can be used for regression or classification and requires that a performance metric be chosen as the basis of the importance score, such as the mean squared error for regression and accuracy for classification.

Permutation feature selection can be used via the permutation_importance() function that takes a fit model, a dataset (train or test dataset is fine), and a scoring function.

In [None]:
# Fit a k-nearest-neighbours regressor on all rows, then score each feature by
# permutation importance measured with negative MSE on the same data.
m = KNeighborsRegressor()
m.fit(X, y)
results = permutation_importance(m, X, y, scoring='neg_mean_squared_error')
importance = results.importances_mean

# Rank the features by mean importance, largest first.
ranked = sorted(zip(X.columns, importance), key=lambda pair: pair[1], reverse=True)
feat_dict = {}
for col, val in ranked:
    feat_dict[col] = val

In [None]:
feat_pfi_df = pd.DataFrame({'Feature':feat_dict.keys(),'Importance':feat_dict.values()})
# feat_pfi_df

In [None]:
#feat_pfi_df.to_csv('permutation_feature.csv')

## Updrs_2

In [None]:
# Modelling frame for updrs_2: target in the first column, proteins after it.
df_updrs_2=train_clinical_proteins
# Drop visits that have no protein measurements (NaN proteins after the left merge).
df_updrs_2=df_updrs_2.dropna(subset=['Q99435', 'Q99674', 'Q99683', 'Q99829', 'Q99832'])
# Drop visits without a target score, as the updrs_3 / updrs_4 sections do;
# the estimators used below cannot be fitted on a NaN target.
df_updrs_2=df_updrs_2.dropna(subset=['updrs_2'], axis = 0)
df_updrs_2=df_updrs_2.drop(['visit_id','patient_id','visit_month','updrs_1','updrs_3','updrs_4'],axis=1)
# numeric_only=True: proteins_wide still carries string key columns, whose median
# raises TypeError on pandas >= 2.0.
df_updrs_2=df_updrs_2.fillna(proteins_wide.median(numeric_only=True))
print(df_updrs_2.shape)
df_updrs_2.head()

In [None]:
# Features: every column except the target. Target: selected by name rather than
# by position, so a change in column order cannot silently swap it for a protein.
X=df_updrs_2.drop('updrs_2', axis=1)
y=df_updrs_2['updrs_2']

# Train/test set generation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=15
)

### Stepwise regression

In [None]:
# X = sm.add_constant(X)   (disabled: no intercept column is added to X)

# NOTE(review): `model` is constructed but never fitted or read anywhere in this
# notebook — it looks like a leftover; confirm before removing.
model = sm.OLS(y, X)

# Backward stepwise selection over all columns of X.
# 0.05 is presumably the p-value threshold for removing a feature — confirm
# against the stepwise_regression package documentation.
backselect = step_reg.backward_regression(X, y, 0.05,verbose=False)
# One-column frame holding the selection result, labelled with the target name.
proteins_sig=pd.DataFrame(backselect,columns=['updrs_2'])

# proteins_sig.to_csv('stepwise.csv')

### SelectKBest

In [None]:
# Univariate screen: keep the 40 columns of X with the highest f_regression score.
fs = SelectKBest(f_regression, k=40)
X_new = fs.fit_transform(X, y)

# Boolean mask over X's columns marking the selected features.
selected = fs.get_support()
names = X.columns.values[selected]
scores = fs.scores_[selected]
names_scores = [(name, score) for name, score in zip(names, scores)]
ns_df = pd.DataFrame(names_scores, columns=['feat_names', 'F_Scores'])

# Highest score first; ties are ordered alphabetically by feature name.
ns_df_sorted = ns_df.sort_values(
    by=['F_Scores', 'feat_names'], ascending=[False, True]
)

# ns_df_sorted.to_csv('select_k_best.csv')

### Decision Tree

In [None]:
# Small regression tree (at most 20 leaves, at least 25 rows per leaf) fitted on all rows.
m=DecisionTreeRegressor(max_leaf_nodes=20, min_samples_leaf=25)
m.fit(X, y)

fn = X.columns.tolist()

# Pair each importance with the column names of X — the frame the tree was
# actually fitted on — instead of X_train, then rank most important first.
ranked = sorted(zip(X.columns, m.feature_importances_), key=lambda x:x[1],reverse=True)
feat_dict={}
for col, val in ranked:
    feat_dict[col]=val
    
feat_df = pd.DataFrame({'Feature':feat_dict.keys(),'Importance':feat_dict.values()})

# feat_df.to_csv('decision_tree.csv')

### Random Forest

In [None]:
def rf(X, y, n_estimators=50, max_samples=100,
       max_features=0.5, min_samples_leaf=25, **kwargs):
    """Fit and return a RandomForestRegressor on (X, y).

    Args:
        X: feature matrix.
        y: target values.
        n_estimators: number of trees.
        max_samples: number of rows drawn to train each tree.
        max_features: fraction of features considered at each split.
        min_samples_leaf: minimum number of rows in a leaf.
        **kwargs: further RandomForestRegressor keyword arguments
            (e.g. ``random_state``). These used to be accepted and silently
            discarded; they are now forwarded to the estimator.

    Returns:
        The fitted RandomForestRegressor (all cores, out-of-bag score enabled
        unless overridden through ``kwargs``).
    """
    # setdefault keeps the original defaults while letting callers override them
    # without triggering a duplicate-keyword TypeError.
    kwargs.setdefault('n_jobs', -1)
    kwargs.setdefault('oob_score', True)
    return RandomForestRegressor(n_estimators=n_estimators,
        max_samples=max_samples, max_features=max_features,
        min_samples_leaf=min_samples_leaf, **kwargs).fit(X, y)

# Random forest fitted on all rows of X.
m=rf(X,y);

# Label the importances with X's columns (the frame the forest was fitted on,
# not X_train) and rank most important first.
ranked = sorted(zip(X.columns, m.feature_importances_), key=lambda x:x[1],reverse=True)
feat_dict={}
for col, val in ranked:
    feat_dict[col]=val
    
feat_rf_df = pd.DataFrame({'Feature':feat_dict.keys(),'Importance':feat_dict.values()})

# feat_rf_df.to_csv('random_forest.csv')

### Mutual Information Feature Selection

In [None]:
# Univariate screen: keep the 40 columns of X with the highest mutual information with y.
fs = SelectKBest(mutual_info_regression, k=40)
X_new = fs.fit_transform(X, y)

# Boolean mask over X's columns marking the selected features.
selected = fs.get_support()
names = X.columns.values[selected]
scores = fs.scores_[selected]
names_scores = [(name, score) for name, score in zip(names, scores)]
ns_df = pd.DataFrame(names_scores, columns=['proteins', 'MI_Scores'])

# Highest score first; ties are ordered alphabetically by protein name.
ns_df_sorted = ns_df.sort_values(
    by=['MI_Scores', 'proteins'], ascending=[False, True]
)

# ns_df_sorted.to_csv('mutual_information.csv')

### Recursive Feature Elimination (RFE)

In [None]:
# Standardise both splits with the mean/std learned from the training split only.
# Fitting a second StandardScaler on X_test would scale the two splits differently,
# so the model would be scored on features that do not match what it was trained on.
scaler = StandardScaler().fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

# Fix the dimensions of the target array
y_train = y_train.values.reshape(-1, 1)
y_test = y_test.values.reshape(-1, 1)

# Init, fit and score a baseline random forest on all features
forest = RandomForestRegressor()
_ = forest.fit(X_train_std, y_train.ravel())
forest.score(X_test_std, y_test)

# Importance of every feature in the baseline forest, smallest first
protein_feature_complete = pd.DataFrame(
    zip(X_train.columns, abs(forest.feature_importances_)),
    columns=["feature", "weight"],
).sort_values("weight").reset_index(drop=True)

# Init the transformer: eliminate 10 features per round until 20 remain
rfe = RFE(estimator=RandomForestRegressor(), n_features_to_select=20, step=10)

# Fit to the training data
_ = rfe.fit(X_train_std, y_train.ravel())

# Names of the columns RFE retained, as a Series
df_X_pro=X_train.loc[:, rfe.support_]
list_X_pro=df_X_pro.columns.to_list()
list_X_pro=pd.Series(list_X_pro)

# list_X_pro.to_csv('recursive_feature_elimination.csv')

### XGBoost Regression Feature Importance

In [None]:
# Gradient-boosted trees fitted on all rows of the unscaled X.
m = XGBRegressor()
m.fit(X, y)

# Label the importances with X's columns (the frame the model was fitted on,
# not X_train) and rank most important first.
ranked = sorted(zip(X.columns, m.feature_importances_), key=lambda x:x[1],reverse=True)
feat_dict={}
for col, val in ranked:
    feat_dict[col]=val
    
feat_xgb_df = pd.DataFrame({'Feature':feat_dict.keys(),'Importance':feat_dict.values()})

# feat_xgb_df.to_csv('xgb.csv')

### Permutation Feature Importance for Regression

In [None]:
# Fit a k-nearest-neighbours regressor on all rows, then score each feature by
# permutation importance measured with negative MSE on the same data.
m = KNeighborsRegressor()
m.fit(X, y)
results = permutation_importance(m, X, y, scoring='neg_mean_squared_error')
importance = results.importances_mean

# Rank the features by mean importance, largest first.
ranked = sorted(zip(X.columns, importance), key=lambda pair: pair[1], reverse=True)
feat_dict = {}
for col, val in ranked:
    feat_dict[col] = val

feat_pfi_df = pd.DataFrame(
    {'Feature': feat_dict.keys(), 'Importance': feat_dict.values()}
)

# feat_pfi_df.to_csv('permutation_feature.csv')

## Updrs_3

In [None]:
# Modelling frame for updrs_3: target in the first column, proteins after it.
df_updrs_3=train_clinical_proteins
# Drop visits that have no protein measurements (NaN proteins after the left merge).
df_updrs_3=df_updrs_3.dropna(subset=['Q99435', 'Q99674', 'Q99683', 'Q99829', 'Q99832'])
# Drop visits without a target score.
df_updrs_3=df_updrs_3.dropna(subset=['updrs_3'], axis = 0)
df_updrs_3=df_updrs_3.drop(['visit_id','patient_id','visit_month','updrs_1','updrs_2','updrs_4'],axis=1)
# numeric_only=True: proteins_wide still carries string key columns, whose median
# raises TypeError on pandas >= 2.0.
df_updrs_3=df_updrs_3.fillna(proteins_wide.median(numeric_only=True))
print(df_updrs_3.shape)
df_updrs_3.head()

In [None]:
# Features: every column except the target. Target: selected by name rather than
# by position, so a change in column order cannot silently swap it for a protein.
X=df_updrs_3.drop('updrs_3', axis=1)
y=df_updrs_3['updrs_3']

# Train/test set generation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=15
)

### Stepwise

In [None]:
# X = sm.add_constant(X)   (disabled: no intercept column is added to X)

# NOTE(review): `model` is constructed but never fitted or read anywhere in this
# notebook — it looks like a leftover; confirm before removing.
model = sm.OLS(y, X)

# Backward stepwise selection over all columns of X.
# 0.05 is presumably the p-value threshold for removing a feature — confirm
# against the stepwise_regression package documentation.
backselect = step_reg.backward_regression(X, y, 0.05,verbose=False)
# One-column frame holding the selection result, labelled with the target name.
proteins_sig=pd.DataFrame(backselect,columns=['updrs_3'])

# proteins_sig.to_csv('stepwise.csv')

### SelectKBest

In [None]:
# Univariate screen: keep the 40 columns of X with the highest f_regression score.
fs = SelectKBest(f_regression, k=40)
X_new = fs.fit_transform(X, y)

# Boolean mask over X's columns marking the selected features.
selected = fs.get_support()
names = X.columns.values[selected]
scores = fs.scores_[selected]
names_scores = [(name, score) for name, score in zip(names, scores)]
ns_df = pd.DataFrame(names_scores, columns=['feat_names', 'F_Scores'])

# Highest score first; ties are ordered alphabetically by feature name.
ns_df_sorted = ns_df.sort_values(
    by=['F_Scores', 'feat_names'], ascending=[False, True]
)

# ns_df_sorted.to_csv('select_k_best.csv')

### Decision Tree

In [None]:
# Small regression tree (at most 20 leaves, at least 25 rows per leaf) fitted on all rows.
m=DecisionTreeRegressor(max_leaf_nodes=20, min_samples_leaf=25)
m.fit(X, y)

fn = X.columns.tolist()

# Pair each importance with the column names of X — the frame the tree was
# actually fitted on — instead of X_train, then rank most important first.
ranked = sorted(zip(X.columns, m.feature_importances_), key=lambda x:x[1],reverse=True)
feat_dict={}
for col, val in ranked:
    feat_dict[col]=val
    
feat_df = pd.DataFrame({'Feature':feat_dict.keys(),'Importance':feat_dict.values()})

#feat_df.to_csv('decision_tree.csv')

### Random Forest

In [None]:
def rf(X, y, n_estimators=50, max_samples=100,
       max_features=0.5, min_samples_leaf=25, **kwargs):
    """Fit and return a RandomForestRegressor on (X, y).

    Args:
        X: feature matrix.
        y: target values.
        n_estimators: number of trees.
        max_samples: number of rows drawn to train each tree.
        max_features: fraction of features considered at each split.
        min_samples_leaf: minimum number of rows in a leaf.
        **kwargs: further RandomForestRegressor keyword arguments
            (e.g. ``random_state``). These used to be accepted and silently
            discarded; they are now forwarded to the estimator.

    Returns:
        The fitted RandomForestRegressor (all cores, out-of-bag score enabled
        unless overridden through ``kwargs``).
    """
    # setdefault keeps the original defaults while letting callers override them
    # without triggering a duplicate-keyword TypeError.
    kwargs.setdefault('n_jobs', -1)
    kwargs.setdefault('oob_score', True)
    return RandomForestRegressor(n_estimators=n_estimators,
        max_samples=max_samples, max_features=max_features,
        min_samples_leaf=min_samples_leaf, **kwargs).fit(X, y)

# Random forest fitted on all rows of X.
m=rf(X,y);

# Label the importances with X's columns (the frame the forest was fitted on,
# not X_train) and rank most important first.
ranked = sorted(zip(X.columns, m.feature_importances_), key=lambda x:x[1],reverse=True)
feat_dict={}
for col, val in ranked:
    feat_dict[col]=val
    
feat_rf_df = pd.DataFrame({'Feature':feat_dict.keys(),'Importance':feat_dict.values()})

#feat_rf_df.to_csv('random_forest.csv')

### Mutual Information

In [None]:
# Univariate screen: keep the 40 columns of X with the highest mutual information with y.
fs = SelectKBest(mutual_info_regression, k=40)
X_new = fs.fit_transform(X, y)

# Boolean mask over X's columns marking the selected features.
selected = fs.get_support()
names = X.columns.values[selected]
scores = fs.scores_[selected]
names_scores = [(name, score) for name, score in zip(names, scores)]
ns_df = pd.DataFrame(names_scores, columns=['proteins', 'MI_Scores'])

# Highest score first; ties are ordered alphabetically by protein name.
ns_df_sorted = ns_df.sort_values(
    by=['MI_Scores', 'proteins'], ascending=[False, True]
)

#ns_df_sorted.to_csv('mutual_information.csv')

### Recursive Feature Elimination

In [None]:
# Standardise both splits with the mean/std learned from the training split only.
# Fitting a second StandardScaler on X_test would scale the two splits differently,
# so the model would be scored on features that do not match what it was trained on.
scaler = StandardScaler().fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

# Fix the dimensions of the target array
y_train = y_train.values.reshape(-1, 1)
y_test = y_test.values.reshape(-1, 1)

# Init, fit and score a baseline random forest on all features
forest = RandomForestRegressor()
_ = forest.fit(X_train_std, y_train.ravel())
forest.score(X_test_std, y_test)

# Importance of every feature in the baseline forest, smallest first
protein_feature_complete = pd.DataFrame(
    zip(X_train.columns, abs(forest.feature_importances_)),
    columns=["feature", "weight"],
).sort_values("weight").reset_index(drop=True)

# Init the transformer: eliminate 10 features per round until 20 remain
rfe = RFE(estimator=RandomForestRegressor(), n_features_to_select=20, step=10)

# Fit to the training data
_ = rfe.fit(X_train_std, y_train.ravel())

# Names of the columns RFE retained, as a Series
df_X_pro=X_train.loc[:, rfe.support_]
list_X_pro=df_X_pro.columns.to_list()
list_X_pro=pd.Series(list_X_pro)

#list_X_pro.to_csv('recursive_feature_elimination.csv')

### XGBoost

In [None]:
# Gradient-boosted trees fitted on all rows of the unscaled X.
m = XGBRegressor()
m.fit(X, y)

# Label the importances with X's columns (the frame the model was fitted on,
# not X_train) and rank most important first.
ranked = sorted(zip(X.columns, m.feature_importances_), key=lambda x:x[1],reverse=True)
feat_dict={}
for col, val in ranked:
    feat_dict[col]=val
    
feat_xgb_df = pd.DataFrame({'Feature':feat_dict.keys(),'Importance':feat_dict.values()})

#feat_xgb_df.to_csv('xgb.csv')

### Permutation Feature Importance

In [None]:
# Fit a k-nearest-neighbours regressor on all rows, then score each feature by
# permutation importance measured with negative MSE on the same data.
m = KNeighborsRegressor()
m.fit(X, y)
results = permutation_importance(m, X, y, scoring='neg_mean_squared_error')
importance = results.importances_mean

# Rank the features by mean importance, largest first.
ranked = sorted(zip(X.columns, importance), key=lambda pair: pair[1], reverse=True)
feat_dict = {}
for col, val in ranked:
    feat_dict[col] = val

feat_pfi_df = pd.DataFrame(
    {'Feature': feat_dict.keys(), 'Importance': feat_dict.values()}
)

#feat_pfi_df.to_csv('permutation_feature.csv')

## Updrs_4

In [None]:
# Modelling frame for updrs_4: target in the first column, proteins after it.
df_updrs_4=train_clinical_proteins
# Drop visits that have no protein measurements (NaN proteins after the left merge).
df_updrs_4=df_updrs_4.dropna(subset=['Q99435', 'Q99674', 'Q99683', 'Q99829', 'Q99832'])
# Drop visits without a target score.
df_updrs_4=df_updrs_4.dropna(subset=['updrs_4'], axis = 0)
df_updrs_4=df_updrs_4.drop(['visit_id','patient_id','visit_month','updrs_1','updrs_2','updrs_3'],axis=1)
# numeric_only=True: proteins_wide still carries string key columns, whose median
# raises TypeError on pandas >= 2.0.
df_updrs_4=df_updrs_4.fillna(proteins_wide.median(numeric_only=True))
print(df_updrs_4.shape)
df_updrs_4.head()

In [None]:
# Features: every column except the target. Target: selected by name rather than
# by position, so a change in column order cannot silently swap it for a protein.
X=df_updrs_4.drop('updrs_4', axis=1)
y=df_updrs_4['updrs_4']

# Train/test set generation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=15
)

### Stepwise

In [None]:
# X = sm.add_constant(X)   (disabled: no intercept column is added to X)

# NOTE(review): `model` is constructed but never fitted or read anywhere in this
# notebook — it looks like a leftover; confirm before removing.
model = sm.OLS(y, X)

# Backward stepwise selection over all columns of X.
# 0.05 is presumably the p-value threshold for removing a feature — confirm
# against the stepwise_regression package documentation.
backselect = step_reg.backward_regression(X, y, 0.05,verbose=False)
# One-column frame holding the selection result, labelled with the target name.
proteins_sig=pd.DataFrame(backselect,columns=['updrs_4'])

# Write the updrs_4 selection to the working directory.
proteins_sig.to_csv('stepwise.csv')

### SelectKBest

In [None]:
# Univariate screen: keep the 40 columns of X with the highest f_regression score.
fs = SelectKBest(f_regression, k=40)
X_new = fs.fit_transform(X, y)

# Boolean mask over X's columns marking the selected features.
selected = fs.get_support()
names = X.columns.values[selected]
scores = fs.scores_[selected]
names_scores = [(name, score) for name, score in zip(names, scores)]
ns_df = pd.DataFrame(names_scores, columns=['feat_names', 'F_Scores'])

# Highest score first; ties are ordered alphabetically by feature name.
ns_df_sorted = ns_df.sort_values(
    by=['F_Scores', 'feat_names'], ascending=[False, True]
)

ns_df_sorted.to_csv('select_k_best.csv')

### Decision Tree

In [None]:
# Small regression tree (at most 20 leaves, at least 25 rows per leaf) fitted on all rows.
m=DecisionTreeRegressor(max_leaf_nodes=20, min_samples_leaf=25)
m.fit(X, y)

fn = X.columns.tolist()

# Pair each importance with the column names of X — the frame the tree was
# actually fitted on — instead of X_train, then rank most important first.
ranked = sorted(zip(X.columns, m.feature_importances_), key=lambda x:x[1],reverse=True)
feat_dict={}
for col, val in ranked:
    feat_dict[col]=val
    
feat_df = pd.DataFrame({'Feature':feat_dict.keys(),'Importance':feat_dict.values()})

feat_df.to_csv('decision_tree.csv')

### Random Forest

In [None]:
def rf(X, y, n_estimators=50, max_samples=100,
       max_features=0.5, min_samples_leaf=25, **kwargs):
    """Fit and return a RandomForestRegressor on (X, y).

    Args:
        X: feature matrix.
        y: target values.
        n_estimators: number of trees.
        max_samples: number of rows drawn to train each tree.
        max_features: fraction of features considered at each split.
        min_samples_leaf: minimum number of rows in a leaf.
        **kwargs: further RandomForestRegressor keyword arguments
            (e.g. ``random_state``). These used to be accepted and silently
            discarded; they are now forwarded to the estimator.

    Returns:
        The fitted RandomForestRegressor (all cores, out-of-bag score enabled
        unless overridden through ``kwargs``).
    """
    # setdefault keeps the original defaults while letting callers override them
    # without triggering a duplicate-keyword TypeError.
    kwargs.setdefault('n_jobs', -1)
    kwargs.setdefault('oob_score', True)
    return RandomForestRegressor(n_estimators=n_estimators,
        max_samples=max_samples, max_features=max_features,
        min_samples_leaf=min_samples_leaf, **kwargs).fit(X, y)

# Random forest fitted on all rows of X.
m=rf(X,y);

# Label the importances with X's columns (the frame the forest was fitted on,
# not X_train) and rank most important first.
ranked = sorted(zip(X.columns, m.feature_importances_), key=lambda x:x[1],reverse=True)
feat_dict={}
for col, val in ranked:
    feat_dict[col]=val
    
feat_rf_df = pd.DataFrame({'Feature':feat_dict.keys(),'Importance':feat_dict.values()})

feat_rf_df.to_csv('random_forest.csv')

### Mutual Information

In [None]:
# Univariate screen: keep the 40 columns of X with the highest mutual information with y.
fs = SelectKBest(mutual_info_regression, k=40)
X_new = fs.fit_transform(X, y)

# Boolean mask over X's columns marking the selected features.
selected = fs.get_support()
names = X.columns.values[selected]
scores = fs.scores_[selected]
names_scores = [(name, score) for name, score in zip(names, scores)]
ns_df = pd.DataFrame(names_scores, columns=['proteins', 'MI_Scores'])

# Highest score first; ties are ordered alphabetically by protein name.
ns_df_sorted = ns_df.sort_values(
    by=['MI_Scores', 'proteins'], ascending=[False, True]
)

ns_df_sorted.to_csv('mutual_information.csv')

### Recursive Feature Elimination

In [None]:
# Standardise both splits with the mean/std learned from the training split only.
# Fitting a second StandardScaler on X_test would scale the two splits differently,
# so the model would be scored on features that do not match what it was trained on.
scaler = StandardScaler().fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

# Fix the dimensions of the target array
y_train = y_train.values.reshape(-1, 1)
y_test = y_test.values.reshape(-1, 1)

# Init, fit and score a baseline random forest on all features
forest = RandomForestRegressor()
_ = forest.fit(X_train_std, y_train.ravel())
forest.score(X_test_std, y_test)

# Importance of every feature in the baseline forest, smallest first
protein_feature_complete = pd.DataFrame(
    zip(X_train.columns, abs(forest.feature_importances_)),
    columns=["feature", "weight"],
).sort_values("weight").reset_index(drop=True)

# Init the transformer: eliminate 10 features per round until 20 remain
rfe = RFE(estimator=RandomForestRegressor(), n_features_to_select=20, step=10)

# Fit to the training data
_ = rfe.fit(X_train_std, y_train.ravel())

# Names of the columns RFE retained, as a Series
df_X_pro=X_train.loc[:, rfe.support_]
list_X_pro=df_X_pro.columns.to_list()
list_X_pro=pd.Series(list_X_pro)

list_X_pro.to_csv('recursive_feature_elimination.csv')

### XGBoost

In [None]:
# Gradient-boosted trees fitted on all rows of the unscaled X.
m = XGBRegressor()
m.fit(X, y)

# Label the importances with X's columns (the frame the model was fitted on,
# not X_train) and rank most important first.
ranked = sorted(zip(X.columns, m.feature_importances_), key=lambda x:x[1],reverse=True)
feat_dict={}
for col, val in ranked:
    feat_dict[col]=val
    
feat_xgb_df = pd.DataFrame({'Feature':feat_dict.keys(),'Importance':feat_dict.values()})

feat_xgb_df.to_csv('xgb.csv')

### Permutation Feature Importance

In [None]:
# Fit a k-nearest-neighbours regressor on all rows, then score each feature by
# permutation importance measured with negative MSE on the same data.
m = KNeighborsRegressor()
m.fit(X, y)
results = permutation_importance(m, X, y, scoring='neg_mean_squared_error')
importance = results.importances_mean

# Rank the features by mean importance, largest first.
ranked = sorted(zip(X.columns, importance), key=lambda pair: pair[1], reverse=True)
feat_dict = {}
for col, val in ranked:
    feat_dict[col] = val

feat_pfi_df = pd.DataFrame(
    {'Feature': feat_dict.keys(), 'Importance': feat_dict.values()}
)

feat_pfi_df.to_csv('permutation_feature.csv')