# Feature Importance using random forest

imports

In [40]:
import matplotlib.pyplot as plt 

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance

import os 
from pathlib import Path

from imblearn.over_sampling import SMOTE

import pandas as pd
import numpy as np

In [41]:

# read dynamic path
base_dir = Path(os.getcwd()) / "implementation"
data_dir = base_dir / "data/source/"
result_dir = base_dir / "data/results/"

In [42]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from enum import Enum
from numpy import ndarray

def getExpectedValue(dataset) -> ndarray:
    return np.mean(dataset)

def calculate_rbf_kernel(dataset1, dataset2, sigma: float = 1.0):
    sq_dist = np.sum((dataset1[:, np.newaxis] - dataset2) ** 2, axis=-1)
    return np.exp(-sq_dist / (2 * sigma ** 2))

def calculate_mmd_score(data1, data2):
    def kernel(x, y):
        """
        Calculates the kernel matrix between two datasets.

        Parameters:
        x (numpy array): The first dataset.
        y (numpy array): The second dataset.
        kernel_type (str): The type of kernel function to use. Must be one of "rbf", "linear" or "exponential".
        **kwargs: Additional arguments for the kernel function.

        Returns:
        numpy array: The kernel matrix between the two datasets.
        """
        pairwise_distances = np.linalg.norm(x[:, np.newaxis] - y, axis=2)
        sigma = np.median(pairwise_distances) / np.sqrt(2) + 1e-6
        K = calculate_rbf_kernel(x, y, sigma)


        return K

    # calculate the kernel matrix for each dataset
    K_xx = kernel(data1, data1)
    K_yy = kernel(data2, data2)
    K_xy = kernel(data1, data2)

    # calculate MMD^2 score
    mmd_sq = getExpectedValue(K_xx) + getExpectedValue(K_yy) - 2 * getExpectedValue(K_xy)

    return abs(mmd_sq)

In [43]:
smote_data_raw = pd.read_csv(data_dir/"ACHE/ache.csv")

#format data 
lookup = {'inactive':0,'active':1}

smote_data = {'data': np.array(smote_data_raw.iloc[:, 2:-1]),
             'target': np.array([lookup[y] for y in smote_data_raw.iloc[0:,-1]]),
             'feature_names': smote_data_raw.columns[2:-1],
             'target_names': ['inactive', 'active']}

In [44]:
X_train, X_test, y_train, y_test = train_test_split(smote_data['data'], smote_data['target'],
                                                    test_size=0.3, random_state=4232)

In [45]:
df = pd.DataFrame(X_train)
before_transform = df.to_numpy()

sm = SMOTE(random_state=42, k_neighbors=5)
X_res, y_res = sm.fit_resample(df, pd.DataFrame(y_train))

after_transform = X_res.to_numpy()

calculate_mmd_score(before_transform,after_transform)



0.01456975495408519

In [46]:
# X_train + y_train and export 
# X_test + y_test and export

cols = smote_data_raw.columns
df = pd.DataFrame(columns=cols)
df = df.drop(columns=["NAME","LABEL"])

for i,r in enumerate(X_train):
    row = [i]
    row.extend(r)
    df.loc[len(df["INDEX"])] = row
df["LABEL"] = y_train
df.to_csv(data_dir/"ACHE/ache_smote_train.csv",sep=",")


cols = smote_data_raw.columns
df = pd.DataFrame(columns=cols)
df = df.drop(columns=["NAME","LABEL"])

for i,r in enumerate(X_test):
    row = [i]
    row.extend(r)
    df.loc[len(df["INDEX"])] = row
df["LABEL"] = y_test
df.to_csv(data_dir/"ACHE/ache_smote_test.csv",sep=",")
