## Help Needed: Why is there no feature importance in the divorce dataset?

In [126]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Test to Try and Fix Feature Importance issues


#### Initialize the Dataframe and Process the Data - Titanic Dataset

In [127]:
# Import titanic data
titanic = pd.read_csv('data/titanic.csv')

# Processing features:
# 1. Drop columns that are not useful
titanic_features = ['Pclass','Sex','Age','SibSp','Parch','Fare','Cabin','Embarked']
titanic_df = titanic[titanic_features]
titanic_df['CabinLetter'] = titanic_df['Cabin'].str.slice(0,1)
X = titanic_df.drop('Cabin',axis=1)
X['CabinLetter'] = X['CabinLetter'].fillna("?")
X['Pclass'] = X['Pclass'].astype(str)
X['SibSp'] = X['SibSp'].astype(str)
X['Parch'] = X['Parch'].astype(str)
X = X.dropna()
t = titanic.loc[X.index, 'Survived']
# 2. Convert categorical features to dummy variables (one-hot encoding with pandas)
X = pd.get_dummies(X)
# Display summary statistics for the features
print(f"These are the summary statistics for the features:\n {X.describe()}")
print("--------------------------------------------------------------------")
print(f"This is the number of rows in the dataset: {len(X)}")
print("--------------------------------------------------------------------")
print(f"This is the number of unique values in each column:\n {X.nunique()}")

These are the summary statistics for the features:
               Age        Fare    Pclass_1    Pclass_2    Pclass_3  Sex_female  \
count  712.000000  712.000000  712.000000  712.000000  712.000000  712.000000   
mean    29.642093   34.567251    0.258427    0.242978    0.498596    0.363764   
std     14.492933   52.938648    0.438078    0.429183    0.500350    0.481420   
min      0.420000    0.000000    0.000000    0.000000    0.000000    0.000000   
25%     20.000000    8.050000    0.000000    0.000000    0.000000    0.000000   
50%     28.000000   15.645850    0.000000    0.000000    0.000000    0.000000   
75%     38.000000   33.000000    1.000000    0.000000    1.000000    1.000000   
max     80.000000  512.329200    1.000000    1.000000    1.000000    1.000000   

         Sex_male     SibSp_0     SibSp_1     SibSp_2  ...  Embarked_S  \
count  712.000000  712.000000  712.000000  712.000000  ...  712.000000   
mean     0.636236    0.658708    0.257022    0.035112  ...    0.778090

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  titanic_df['CabinLetter'] = titanic_df['Cabin'].str.slice(0,1)


#### Initialize the Dataframe and Process the Data - Divorce Dataset

Note: The divorce dataset has no missing values and there are exactly 5 unique values per feature (0, 1, 2, 3, 4) that correspond to the following:<br>
- 0 - Never
- 1 - Seldom
- 2 - Averagely
- 3 - Frequently
- 4 - Always

In [155]:
# Import divorce data
divorce = pd.read_csv('data/divorce_data.csv',sep=';')

# Processing features:
X = divorce.drop('Divorce',axis=1) # .dropna() - dataset has no missing values
t = divorce['Divorce']

# Display summary statistics for the features
print(f"These are the summary statistics for the features:\n {X.describe()}")
print("--------------------------------------------------------------------")
print(f"This is the number of rows in the dataset: {len(X)}")
print("--------------------------------------------------------------------")
print(f"This is the number of unique values in each column:\n {X.nunique()}")

These are the summary statistics for the features:
                Q1          Q2          Q3          Q4          Q5          Q6  \
count  170.000000  170.000000  170.000000  170.000000  170.000000  170.000000   
mean     1.776471    1.652941    1.764706    1.482353    1.541176    0.747059   
std      1.627257    1.468654    1.415444    1.504327    1.632169    0.904046   
min      0.000000    0.000000    0.000000    0.000000    0.000000    0.000000   
25%      0.000000    0.000000    0.000000    0.000000    0.000000    0.000000   
50%      2.000000    2.000000    2.000000    1.000000    1.000000    0.000000   
75%      3.000000    3.000000    3.000000    3.000000    3.000000    1.000000   
max      4.000000    4.000000    4.000000    4.000000    4.000000    4.000000   

               Q7          Q8          Q9         Q10  ...         Q45  \
count  170.000000  170.000000  170.000000  170.000000  ...  170.000000   
mean     0.494118    1.452941    1.458824    1.576471  ...    2.458824

#### Train-Test Split 

In [165]:
# Done with sklearn
from sklearn.model_selection import train_test_split

# Split the data into training and validation sets
train_X, val_X, train_y, val_y = train_test_split(X, t, test_size=0.2)

# Print the train X and y shapes
print(f"Train X shape: {train_X.shape}")
print(f"Train y shape: {train_y.shape}")

# Print the summary statistics of the training features and testing features
print(f"Train X summary statistics:\n {train_X.describe()}")
print("--------------------------------------------------------------------")
print(f"Test X summary statistics:\n {val_X.describe()}")

Train X shape: (136, 54)
Train y shape: (136,)
Train X summary statistics:
                Q1          Q2          Q3          Q4          Q5          Q6  \
count  136.000000  136.000000  136.000000  136.000000  136.000000  136.000000   
mean     1.794118    1.683824    1.786765    1.507353    1.558824    0.698529   
std      1.656045    1.484249    1.431953    1.520216    1.649982    0.863411   
min      0.000000    0.000000    0.000000    0.000000    0.000000    0.000000   
25%      0.000000    0.000000    0.000000    0.000000    0.000000    0.000000   
50%      2.000000    2.000000    2.000000    1.000000    1.000000    0.000000   
75%      3.000000    3.000000    3.000000    3.000000    3.000000    1.000000   
max      4.000000    4.000000    4.000000    4.000000    4.000000    4.000000   

               Q7          Q8          Q9         Q10  ...         Q45  \
count  136.000000  136.000000  136.000000  136.000000  ...  136.000000   
mean     0.455882    1.455882    1.470588    1

#### Adding PCA

Here we'll reduce the dimensionality of the feature dataset from 54 down to 20. This may improve the ability to get feature importance from the knn model. Note that this will reduce the ability to understand the reduced features since they will become combinations of varios questions. If one hot encoding were to be used as well, that would reduce the understandibility beyond a point of any knowledge gain.

In [166]:
from sklearn.decomposition import PCA

pca = PCA(n_components=2)

train_X_pca = pca.fit_transform(train_X)
val_X_pca = pca.transform(val_X)

# Convert it back to DataFrame
pca_columns = [f'PCA_feature_{i+1}' for i in range(train_X_pca.shape[1])]
train_X_pca_df = pd.DataFrame(train_X_pca, columns=pca_columns)
val_X_pca_df = pd.DataFrame(val_X_pca, columns=pca_columns)


#### KNN Model
Here we'll instantiate and train a KNN model

In [167]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score

knn = KNeighborsClassifier(n_neighbors=5)

# Fit the model
knn.fit(train_X_pca, train_y)


#### Baseline Accuracy

In [168]:
knn_pred = knn.predict(val_X_pca)
original_accuracy = accuracy_score(val_y, knn_pred)
print(f"The prediction accuracy is: {original_accuracy}")

The prediction accuracy is: 1.0


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


#### Permutation Feature-Importance

In [169]:
npermutations = 10
feature_names = train_X_pca_df.columns.tolist()
importances = {}
for col in train_X_pca_df.columns:
    importances[col] = 0

# Loop through the features and make predictions on permuted data
for col in train_X_pca_df.columns:
    for perm in range(npermutations):
        # Permute the column of the validation data
        val_X_perm = val_X_pca_df.copy()
        val_X_perm[col] = val_X_pca_df[col].sample(frac=1, replace=False).values # np.random.permutation(val_X_perm[col])
        # Make predictions on the new data
        preds = knn.predict(val_X_perm)
        # Compute the accuracy score
        permuted_accuracy = accuracy_score(val_y, preds)

        # Calculate feature importance
        importances[col] += original_accuracy - permuted_accuracy
    
    # Normalize importances
    importances[col] /= npermutations

# Display importances in descending order in a Series
feature_importances = pd.Series(importances).sort_values(ascending=False)

display(feature_importances.head(35))

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


PCA_feature_1    0.488235
PCA_feature_2    0.000000
dtype: float64