<a href="https://colab.research.google.com/github/pankajr141/experiments/blob/master/Reasoning/ExplanableAI/Key_Feature_Analysis_Logistic_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import warnings
warnings.filterwarnings('ignore')

# Dataset

In [None]:
from sklearn import datasets
import pandas as pd
import numpy as np

# Load scikit-learn's bundled wine dataset and split it into a labelled
# feature table (X) and the integer class targets (y).
dataset = datasets.load_wine()
y = dataset.target
X = pd.DataFrame(data=dataset.data, columns=dataset.feature_names)
X

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0


In [None]:
# Display the target vector: one integer class label (0, 1 or 2) per wine sample.
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2])

# Logistic Regression

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardize the features before fitting, so the logistic-regression
# coefficients of different features are on a comparable scale.
scaler = StandardScaler()
X_std = scaler.fit(X).transform(X)

In [None]:
from sklearn.linear_model import LogisticRegression
# Seeded logistic regression fitted on the standardized features.
# NOTE(review): recent scikit-learn releases reportedly deprecate the
# 'liblinear' solver for multiclass targets - confirm against the installed
# version before upgrading.
clf = LogisticRegression(
    solver='liblinear',
    max_iter=10000,
    random_state=0,
)

clf.fit(X_std, y)

In [None]:
from sklearn.metrics import f1_score, recall_score, precision_score

# The classifier was fitted on the standardized matrix X_std, so predictions
# must be made on X_std as well. Predicting on the raw, unscaled X feeds the
# model inputs on a completely different scale and yields meaningless scores.
# Predictions are computed once and reused for all three metrics.
# Note: these are in-sample scores (same data as used for fitting).
y_pred = clf.predict(X_std)

print("precision_score:", precision_score(y, y_pred, average='weighted'))
print("recall_score:", recall_score(y, y_pred, average='weighted'))
print("f1_score:", f1_score(y, y_pred, average='weighted'))

precision_score: 0.10986617851281404
recall_score: 0.33146067415730335
f1_score: 0.16503105295595696


## Odds Ratio

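<b>Positive Coefficient: </b> Indicates that an increase in the predictor variable increases the log odds of the positive class. <br>
<b>Negative Coefficient: </b> Indicates that an increase in the predictor variable decreases the log odds of the positive class. <br>
<b>Odds Ratio: </b> exp(coefficient). Because the features are standardized, it is the factor by which the odds of the class are multiplied when the feature increases by one standard deviation. <br>
<b>Note: </b> The wine dataset has three classes, so the model holds one coefficient vector per class; the table below reports class 0 only, i.e. class 0 plays the role of the "positive class" against the other two.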

In [None]:
def _format_nine_decimals(value):
    """Render a float with exactly nine digits after the decimal point."""
    return '%.9f' % value


# Show floats in DataFrame output with nine decimals.
pd.set_option('display.float_format', _format_nine_decimals)

def get_logistic_imp_features(coef, features, class_index=0):
    """Rank features by a logistic-regression coefficient and its odds ratio.

    Parameters
    ----------
    coef : array-like
        Either a 2-D array of shape (n_features, n_classes), e.g. ``clf.coef_.T``,
        or a 1-D array with one coefficient per feature.
    features : iterable of str
        Feature names, in the same order as the rows of ``coef``.
    class_index : int, default 0
        Column of ``coef`` (i.e. which class) to report. The default keeps the
        historical behaviour of reporting the first class only.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature``, ``coeff`` and ``odds_ratio`` (``exp(coeff)``),
        sorted by ``coeff`` in descending order; the index keeps each
        feature's original position.

    Raises
    ------
    ValueError
        If ``coef`` is not 1-D/2-D or its row count differs from the number of
        feature names.
    IndexError
        If ``class_index`` is out of range for ``coef``.
    """
    coef_arr = np.asarray(coef, dtype=float)
    if coef_arr.ndim == 1:
        # A plain coefficient vector: treat it as a single-class column.
        coef_arr = coef_arr.reshape(-1, 1)
    if coef_arr.ndim != 2:
        raise ValueError(
            "coef must be 1-D or 2-D, got %d dimensions" % coef_arr.ndim
        )

    feature_names = list(features)
    if coef_arr.shape[0] != len(feature_names):
        # zip() would silently drop the surplus rows/names; fail loudly instead.
        raise ValueError(
            "coef has %d rows but %d feature names were given"
            % (coef_arr.shape[0], len(feature_names))
        )

    class_coef = coef_arr[:, class_index]
    df_ = pd.DataFrame({
        'feature': feature_names,
        'coeff': class_coef,
        'odds_ratio': np.exp(class_coef),
    })

    df_ = df_.sort_values(by=['coeff'], ascending=False)
    return df_

# clf.coef_ is laid out as (n_classes, n_features); transpose it so each row
# lines up with one feature name. The helper reports the first class only.
get_logistic_imp_features(coef=clf.coef_.T, features=X.columns)

Unnamed: 0,feature,coeff,odds_ratio
12,proline,1.871897289,6.500618257
0,alcohol,1.386708557,4.001657127
11,od280/od315_of_diluted_wines,1.015438902,2.760574754
2,ash,0.9408668,2.562201372
6,flavanoids,0.921045225,2.511914535
1,malic_acid,0.413267401,1.511749215
5,total_phenols,0.222000768,1.248572337
4,magnesium,0.118566687,1.125881954
9,color_intensity,0.027714771,1.028102398
10,hue,0.020500405,1.020711981


## Permutation Importance

In [None]:
from sklearn.inspection import permutation_importance

# Shuffle each feature 30 times and record how much the model's score drops.
# The scores are measured on X_std / y, the same data the model was fitted on.
perm_importance = permutation_importance(
    estimator=clf,
    X=X_std,
    y=y,
    n_repeats=30,
    n_jobs=-1,
    random_state=42,
)

# One row per feature: mean and spread of the score drop over the repeats.
perm_importance_df = pd.DataFrame(dict((
    ('Feature', X.columns),
    ('Importance Mean', perm_importance.importances_mean),
    ('Importance Std', perm_importance.importances_std),
)))

print(
    "\nPermutation Importance:",
    perm_importance_df.sort_values(by='Importance Mean', ascending=False),
    sep="\n",
)


Permutation Importance:
                         Feature  Importance Mean  Importance Std
12                       proline      0.080337079     0.018126141
0                        alcohol      0.053745318     0.013427110
9                color_intensity      0.052996255     0.013505236
2                            ash      0.030524345     0.009252149
6                     flavanoids      0.027715356     0.008695271
3              alcalinity_of_ash      0.027528090     0.007991232
10                           hue      0.016479401     0.008449823
11  od280/od315_of_diluted_wines      0.008239700     0.004752276
7           nonflavanoid_phenols      0.006179775     0.002654953
1                     malic_acid      0.004119850     0.004571744
4                      magnesium      0.002247191     0.002752236
8                proanthocyanins      0.001685393     0.002955006
5                  total_phenols      0.000000000     0.000000000


## Recursive Feature Elimination

In [None]:
from sklearn.feature_selection import RFE

# Recursive feature elimination: repeatedly fit the estimator and drop the
# weakest feature until only five remain.
# The estimator is seeded with the same random_state as the main classifier
# so the selection is reproducible and configured consistently with `clf`.
clf_rfe = LogisticRegression(random_state=0, max_iter=10000, solver='liblinear')
rfe = RFE(clf_rfe, n_features_to_select=5)
rfe.fit(X_std, y)

# rfe.support_ is a boolean mask over the columns of X_std, which are in the
# same order as X.columns.
rfe_features = X.columns[rfe.support_]

print("\nSelected Features by RFE:")
print(rfe_features)


Selected Features by RFE:
Index(['alcohol', 'flavanoids', 'color_intensity', 'hue', 'proline'], dtype='object')
