In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from dataset import FeaturesDataset

# Load the dataset; later cells read its .X, .y and .features attributes
DATASET_DIR = 'data/dataset/oversampling'
dataset = FeaturesDataset(dataset_dir=DATASET_DIR, normalize=True)

In [None]:
# Read the feature and label tables from the oversampling directory.
# An earlier variant (kept here for reference) read
# 'data/dataset/dfu_features_dataset_selected.csv' and
# 'data/dataset/dfu_labels_dataset.csv' with index_col=0 instead.
features_csv = 'data/dataset/oversampling/dfu_features_dataset_selected.csv'
labels_csv = 'data/dataset/oversampling/dfu_labels_dataset.csv'

X_df = pd.read_csv(features_csv)
y_df = pd.read_csv(labels_csv)

# Random Forest Feature Ranking

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

# Cross-validated Random Forest feature ranking.
# For every fold the forest is refit on the training split, its impurity-based
# feature importances are recorded, and its accuracy on the held-out split is stored.
k_folds = 5
# random_state is fixed so the shuffled folds (and therefore the accuracies and
# the ranking saved later) are identical on every Restart & Run All.
kfold = KFold(n_splits=k_folds, shuffle=True, random_state=42)

# Alternative input (unused): X_df.to_numpy() / y_df.to_numpy() from the CSV cell.
X = dataset.X
y = dataset.y

features_importance = []  # one importance vector per fold
model_accuracy = []       # one held-out accuracy per fold

clf = RandomForestClassifier(n_estimators=250, random_state=42, n_jobs=-1)
for fold, (train_ids, test_ids) in enumerate(kfold.split(X)):
    print('Fold: {}'.format(fold))
    X_train = X[train_ids]
    y_train = y[train_ids]

    X_test = X[test_ids]
    y_test = y[test_ids]

    # fit() rebuilds the forest from scratch, so reusing `clf` across folds is safe;
    # ravel() hands sklearn a 1-D label vector
    clf.fit(X_train, y_train.ravel())

    # importance of every feature for this fold
    features_importance.append(clf.feature_importances_)

    # accuracy on the held-out split
    y_pred = clf.predict(X_test)
    model_accuracy.append(accuracy_score(y_test, y_pred))

In [None]:
# Accuracy averaged over all cross-validation folds
mean_accuracy = np.mean(model_accuracy)
print('Mean accuracy: {}'.format(mean_accuracy))

In [None]:
# Average the per-fold importances into a single score per feature
features_importance_ = np.mean(features_importance, axis=0)

# Rank the features from most to least important
sort_idx = np.argsort(features_importance_)[::-1]
features_importance_ = features_importance_[sort_idx]
features_name = np.array(dataset.features)[sort_idx]

# One 'Importance' column, indexed by feature name, written out as CSV
features_importance_df = pd.DataFrame(
    {'Importance': features_importance_},
    index=pd.Index(features_name, name='Features'),
)

features_importance_df.to_csv('data/features_importance/oversampling/random_forest.csv')

# LASSO

In [None]:
# Feature selection with Lasso (L1-penalized logistic regression) for classification
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel

# Cross-validated L1-penalized logistic regression (Lasso-style) feature ranking.
# For every fold the model is refit on the training split, its coefficient
# matrix is recorded, and its accuracy on the held-out split is stored.
k_folds = 5
# random_state is fixed so the shuffled folds (and therefore the accuracies and
# the ranking saved later) are identical on every Restart & Run All.
kfold = KFold(n_splits=k_folds, shuffle=True, random_state=42)

X = dataset.X
y = dataset.y

features_importance = []  # one coefficient matrix (clf.coef_) per fold
model_accuracy = []       # one held-out accuracy per fold

clf = LogisticRegression(penalty='l1', random_state=42, solver='liblinear')
for fold, (train_ids, test_ids) in enumerate(kfold.split(X)):
    print('Fold: {}'.format(fold))
    X_train = X[train_ids]
    y_train = y[train_ids]

    X_test = X[test_ids]
    y_test = y[test_ids]

    # fit the L1-penalized logistic regression; ravel() hands sklearn a 1-D label vector
    clf.fit(X_train, y_train.ravel())

    # keep a copy of the coefficients so later refits cannot alias this fold's values
    features_importance.append(clf.coef_.copy())

    # accuracy on the held-out split
    y_pred = clf.predict(X_test)
    model_accuracy.append(accuracy_score(y_test, y_pred))

In [None]:
# Accuracy averaged over all cross-validation folds
mean_accuracy = np.mean(model_accuracy)
print('Mean accuracy: {}'.format(mean_accuracy))

In [None]:
# Turn the per-fold coefficient matrices into one importance score per feature.
# Stacked shape is (n_folds, n_rows, n_features); n_rows is 1 for a binary
# target and n_classes for a multiclass one-vs-rest fit.
fold_coefs = np.array(features_importance)
mean_coefs = np.mean(fold_coefs, axis=0)

# Magnitude of the fold-averaged coefficients, then averaged over the class rows.
# With a single row this equals the plain absolute coefficients; with several
# rows it still yields exactly one value per feature, so indexing
# dataset.features below stays valid (a flat ravel() would give
# n_classes * n_features entries instead).
n_features = mean_coefs.shape[-1]
features_importance_ = np.abs(mean_coefs).reshape(-1, n_features).mean(axis=0)

# Rank the features from most to least important
sort_idx = np.argsort(features_importance_)[::-1]
features_importance_ = features_importance_[sort_idx]
features_name = np.array(dataset.features)[sort_idx]

# One 'Importance' column, indexed by feature name, written out as CSV
features_importance_df = pd.DataFrame(features_importance_, index=features_name, columns=['Importance'])
features_importance_df.index.name = 'Features'

features_importance_df.to_csv('data/features_importance/oversampling/lasso.csv')

# Chi-Squared for Feature Selection

In [None]:
# Deal with possible negative values for the chi2 test
# X_df = X_df + np.abs(X_df.min().min())

In [None]:
# Chi-squared for feature selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.ensemble import RandomForestClassifier

# Cross-validated chi-squared feature ranking.
# For every fold the chi2 score of each feature is computed on the training
# split; the top-scoring features are then used to train a Random Forest whose
# accuracy on the held-out split is stored.
k_folds = 5
# random_state is fixed so the shuffled folds (and therefore the scores and
# accuracies) are identical on every Restart & Run All.
kfold = KFold(n_splits=k_folds, shuffle=True, random_state=42)

# chi2 needs non-negative inputs, so rescale every feature to [0, 1].
# NOTE(review): the min/max are taken over the whole dataset, i.e. before the
# train/test split.
X = dataset.X
feature_min = X.min(axis=0)
feature_range = X.max(axis=0) - feature_min
# A constant feature has zero range; divide by 1 so it becomes 0 instead of NaN
feature_range[feature_range == 0] = 1
X = (X - feature_min) / feature_range
y = dataset.y

features_importance = []  # one chi2 score vector per fold
model_accuracy = []       # one held-out accuracy per fold

# SelectKBest cannot keep more features than exist
n_selected = min(10, X.shape[1])

for fold, (train_ids, test_ids) in enumerate(kfold.split(X)):
    print('Fold: {}'.format(fold))
    X_train = X[train_ids]
    y_train = y[train_ids]

    X_test = X[test_ids]
    y_test = y[test_ids]

    # score the features with the chi-squared test on the training split only
    selector = SelectKBest(chi2, k=n_selected)
    selector.fit(X_train, y_train.ravel())

    # chi2 score of every feature for this fold
    features_importance.append(selector.scores_)

    # evaluate a Random Forest trained on the selected features
    X_train_sel = selector.transform(X_train)
    X_test_sel = selector.transform(X_test)
    clf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    clf.fit(X_train_sel, y_train.ravel())
    y_pred = clf.predict(X_test_sel)
    model_accuracy.append(accuracy_score(y_test, y_pred))

# Report the fold-averaged accuracy, as the other ranking sections do
print('Mean accuracy: {}'.format(np.mean(model_accuracy)))

In [None]:
# Average the per-fold chi2 scores into a single score per feature.
fold_scores = np.array(features_importance, dtype=float)
# chi2 yields NaN for a feature that is all-zero within a training fold.
# Treat that as "no evidence" (score 0); otherwise the NaN survives the mean
# and argsort()[::-1] below would place the feature at the top of the ranking.
fold_scores = np.where(np.isnan(fold_scores), 0.0, fold_scores)
features_importance_ = np.mean(fold_scores, axis=0)

# Rank the features from highest to lowest score
sort_idx = np.argsort(features_importance_)[::-1]
features_importance_ = features_importance_[sort_idx]
features_name = np.array(dataset.features)[sort_idx]

# One 'Importance' column, indexed by feature name, written out as CSV
features_importance_df = pd.DataFrame(features_importance_, index=features_name, columns=['Importance'])
features_importance_df.index.name = 'Features'

features_importance_df.to_csv('data/features_importance/oversampling/chi2.csv')