In [24]:
import duckdb
import os, sys
# Put the directory two levels above the working directory on the import
# path (presumably where the project-local `utils` module lives - confirm).
module_path = os.path.abspath('../..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

import utils 

# Base Libraries
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Transformation
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import power_transform
from sklearn.pipeline import Pipeline
# Feature Selection
import sklearn_relief as sr
# Models
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [25]:
# Load the integrated table from the DuckDB file through the project helper.
data = utils.DBtable_to_df('../integration.duckdb', 'integratedTable')

# Work on an all-float copy; `data` itself is left as loaded.
dataN = data.astype(float)

# Column names and column count, reused by later cells to slice the
# feature matrix and to label the feature weights.
cols = dataN.columns.tolist()
l = dataN.shape[1]

In [26]:

# Build the numeric frame from `data` on every run instead of mutating
# `dataN` in place: this cell rebinds `dataN` to a NumPy array, so a second
# execution of the original cell failed on `dataN.reset_index` (arrays have
# no such method). Deriving the input from `data` keeps the cell re-runnable
# and yields the same `dataN` on a fresh run.
numeric_frame = data.astype(float).reset_index(drop=True)

scaler = MinMaxScaler()

# Yeo-Johnson power transform of every column, then min-max scaling.
# From here on `dataN` is a 2-D NumPy array with the same column order as
# `cols`.
# NOTE(review): both transforms are fitted on all rows and all columns
# (target included) before the train/test split performed in later cells,
# so test-set statistics leak into the training data - consider fitting on
# the training rows only.
dataN = power_transform(numeric_frame, method='yeo-johnson')
dataN = scaler.fit_transform(dataN)


In [27]:
# The last of the `l` columns is the regression target; everything before
# it forms the feature matrix.
target_idx = l - 1
X = dataN[:, :target_idx]
y = dataN[:, target_idx]

In [28]:
# Seed NumPy's global RNG.
# NOTE(review): confirm that RReliefF actually draws from the global NumPy
# RNG; if it keeps its own random state, this seed does not make the
# weights below reproducible.
np.random.seed(144)

# 70/30 hold-out split with a fixed partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Fit RReliefF on the training rows; the transformed matrix is discarded,
# only the fitted per-feature weights are used.
r = sr.RReliefF(n_features=5)
r.fit_transform(X_train, y_train)
values = r.w_

# Label each weight with its column name. `cols` also contains the target
# as its last entry; zip stops at the shorter sequence, so that extra name
# is dropped (assuming one weight per feature column - TODO confirm).
res = {name: weight for name, weight in zip(cols, values)}


In [29]:

# Show (feature, weight) pairs ordered from the largest weight to the smallest.
ranked_weights = sorted(res.items(), key=lambda pair: pair[1], reverse=True)
print(ranked_weights)

[('Madrid_section', 0.05227810855278055), ('single_women_aged_16_to_64', 0.016008446315196184), ('single_men_aged_65_or_over', 0.015895604606090574), ('two_adults_over_35_and_one_adult_from_16_to_34', 0.011897664245560324), ('two_adults_and_three_or_more_minors', 0.011726878249723537), ('single_men_aged_16_to_64', 0.010958452781681019), ('five_adults_and_0_or_more_minors', 0.010566698851275591), ('four_adults_and_0_or_more_minors', 0.010479548836319338), ('two_adults_from_16_to_64_and_without_minors', 0.009808016961638709), ('single_women_aged_65_or_over', 0.009007451111840564), ('two_adults_and_one_minor', 0.008134259844397945), ('three_adults_and_0_or_more_minors', 0.00717515084940714), ('two_adults_over_35_and_two_adults_from_16_to_34_and_one_minor', 0.006763520222582836), ('two_adults_over_35_and_two_adults_from_16_to_34', 0.006601198122590697), ('adult_women_with_one_or_more_minors', 0.006020207663990605), ('two_adults_and_two_minors', 0.005709431095891962), ('two_adults_one_at_le

In [30]:
# Sweep the number of features kept by RReliefF (1..10) and record the
# hold-out score of a random forest trained on the selected features.
np.random.seed(144)
nof_list = np.arange(1, 11)

# Start below every attainable score: R^2 can be negative, and with the
# previous initial value of 0 a sweep whose scores were all <= 0 would have
# reported the meaningless "NOF: 0, Score: 0".
high_score = -np.inf
nof = 0
score_list = []

# The split uses a fixed random_state, so it is identical on every
# iteration; compute it once instead of once per candidate.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

for n_selected in nof_list:
    # Feature selection and the regressor are chained so that the selector
    # is fitted on the training rows only.
    fs = sr.RReliefF(n_features=n_selected)
    relief = Pipeline([('fs', fs), ('m', RandomForestRegressor())])
    relief.fit(X_train, y_train)

    # NOTE(review): the candidate is chosen by its score on the same
    # hold-out set that is reported below, so the reported best score is
    # optimistic; a separate validation split or cross-validation would
    # avoid that.
    score = relief.score(X_test, y_test)
    score_list.append(score)
    print(f'NOF: {n_selected}, Score: {score}')

    # Strict comparison keeps the smallest feature count on ties.
    if score > high_score:
        high_score = score
        nof = n_selected

print(f'High Score: NOF: {nof}, Score: {high_score}')

NOF: 1, Score: 0.18040096409624518
NOF: 2, Score: 0.7311909792196291
NOF: 3, Score: 0.6908318675856129
NOF: 4, Score: 0.7749330547819605
NOF: 5, Score: 0.8015973296968971
NOF: 6, Score: 0.7546486217999102
NOF: 7, Score: 0.7393887887836773
NOF: 8, Score: 0.8078296355011324
NOF: 9, Score: 0.7411955555683194
NOF: 10, Score: 0.7915810922967008
High Score: NOF: 8, Score: 0.8078296355011324
None
