In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif, chi2, VarianceThreshold
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Download the Parkinson's dataset straight from the UCI repository
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/parkinsons/parkinsons.data"
data = pd.read_csv(url)

# Quick preview to confirm the file was read
print("Dataset Loaded Successfully!\n")
print(data.head())

# 'status' is the target variable (1 = Parkinson's, 0 = Healthy).
# 'name' appears to be a per-recording identifier, so it is dropped along
# with the target when building the feature matrix.
y = data['status']
X = data.drop(['name', 'status'], axis=1)

# Min-max scale the features: the chi-square test used further down
# requires non-negative inputs
scaler = MinMaxScaler()
scaler.fit(X)
X_scaled = pd.DataFrame(scaler.transform(X), columns=X.columns)


print("\n--- Variance Threshold ---")
# Filter out low-variance features, measured on the min-max scaled data
# with a cut-off of 0.01
vt = VarianceThreshold(threshold=0.01)
vt.fit(X_scaled)
X_vt = vt.transform(X_scaled)

# get_support() yields a boolean mask over the columns; use it to recover
# the names of the features that were kept
vt_support_mask = vt.get_support()
selected_features_vt = X.columns[vt_support_mask]
print(f"Features selected ({len(selected_features_vt)}): {list(selected_features_vt)}")


print("\n--- Information Gain ---")
# mutual_info_classif perturbs continuous features with a small amount of
# random noise, so without a fixed seed the scores -- and therefore the
# top-10 ranking below -- change from run to run. Pin random_state so the
# notebook reproduces the same output on every Restart & Run All.
info_gain = mutual_info_classif(X_scaled, y, random_state=42)

# Label each score with its feature name and rank from most to least informative
info_gain_series = (
    pd.Series(info_gain, index=X.columns)
    .sort_values(ascending=False)
)
print(info_gain_series)

# Select top 10 based on Information Gain
top_info_gain = info_gain_series.head(10)
print("\nTop 10 Features based on Information Gain:")
print(top_info_gain)


print("\n--- Chi-Square Test ---")
# chi2 returns one test statistic and one p-value per feature; only the
# statistics are used for the ranking below.
# NOTE(review): chi2 is normally applied to count/frequency features; here it
# is run on min-max scaled continuous values -- confirm this is intended.
chi_scores, p_values = chi2(X_scaled, y)

# Attach feature names, then order from highest to lowest statistic
chi2_series = pd.Series(chi_scores, index=X.columns)
chi2_series = chi2_series.sort_values(ascending=False)
print(chi2_series)

# Keep the ten highest-scoring features
top_chi2 = chi2_series.iloc[:10]
print("\nTop 10 Features based on Chi-Square Test:")
print(top_chi2)


# Recap: feature counts before/after the variance filter, followed by the
# names of the ten best features under each ranking method
print("\n--- Summary ---")
print(f"Total Features: {len(X.columns)}")
print(f"After Variance Threshold: {len(selected_features_vt)} features")

print("Top 10 Features by Information Gain:")
print(top_info_gain.index.tolist())

print("Top 10 Features by Chi-Square:")
print(top_chi2.index.tolist())


Dataset Loaded Successfully!

             name  MDVP:Fo(Hz)  MDVP:Fhi(Hz)  MDVP:Flo(Hz)  MDVP:Jitter(%)  \
0  phon_R01_S01_1      119.992       157.302        74.997         0.00784   
1  phon_R01_S01_2      122.400       148.650       113.819         0.00968   
2  phon_R01_S01_3      116.682       131.111       111.555         0.01050   
3  phon_R01_S01_4      116.676       137.871       111.366         0.00997   
4  phon_R01_S01_5      116.014       141.781       110.655         0.01284   

   MDVP:Jitter(Abs)  MDVP:RAP  MDVP:PPQ  Jitter:DDP  MDVP:Shimmer  ...  \
0           0.00007   0.00370   0.00554     0.01109       0.04374  ...   
1           0.00008   0.00465   0.00696     0.01394       0.06134  ...   
2           0.00009   0.00544   0.00781     0.01633       0.05233  ...   
3           0.00009   0.00502   0.00698     0.01505       0.05492  ...   
4           0.00011   0.00655   0.00908     0.01966       0.06425  ...   

   Shimmer:DDA      NHR     HNR  status      RPDE       