In [27]:
! pip install sklearn_relief

import duckdb
import os, sys
# Resolve the directory two levels above the current working directory and
# append it to sys.path (only if it is not already listed) before the
# `import utils` that follows.
# NOTE(review): presumably that directory is where the project's `utils`
# module lives -- confirm against the repository layout.
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import utils 

# Base Libraries
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
# Transformation
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import power_transform
from sklearn.pipeline import Pipeline
# Feature Selection
import sklearn_relief as sr
# Models
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor



In [28]:
# Read the 'integratedTable' table from the DuckDB file into `data`.
data = utils.DBtable_to_df('../integration.duckdb', 'integratedTable')

# Drop 'Year' and 'Madrid_section', then cast every remaining column to float.
dataN = data.drop(columns=['Year', 'Madrid_section']).astype(float)

# Column names and their count; later cells use `l` to slice off the last
# column and `cols` to label the feature weights.
cols = dataN.columns.tolist()
l = len(cols)

In [29]:

# Normalise the table in two steps: a Yeo-Johnson power transform followed by
# min-max scaling. The fitted `scaler` is kept for later use.
# NOTE(review): `dataN` enters this cell as a DataFrame and leaves as the
# array returned by the scaler, so re-running this cell on its own will most
# likely fail at `reset_index` -- re-run the loading cell first.
dataN.reset_index(drop=True, inplace=True)

scaler = MinMaxScaler()
dataN = scaler.fit_transform(
    power_transform(dataN, method='yeo-johnson')
)


In [30]:
# Column l-1 is used as the target; all columns before it are the features.
X, y = dataN[:, :l - 1], dataN[:, l - 1]

In [31]:
# Fit RReliefF on the training portion (70/30 split, fixed random_state) and
# collect the weights it exposes in `w_`, keyed by column name.
np.random.seed(144)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

r = sr.RReliefF(n_features=5)
r.fit_transform(X_train, y_train)  # transformed output is not used, only the fitted weights
values = r.w_

# `cols` also contains the target column name; zip stops at the shorter
# sequence, so -- assuming `w_` holds one weight per feature column -- the
# trailing target name is simply left unpaired.
res = {name: weight for name, weight in zip(cols, values)}


In [38]:

# Show the (column, weight) pairs ordered from highest to lowest weight.
print(sorted(res.items(), key=lambda pair: pair[1], reverse=True))

[('two_adults_from_16_to_64_and_without_minors', 0.015594432962528468), ('single_men_aged_65_or_over', 0.015428778309626794), ('two_adults_over_35_and_two_adults_from_16_to_34', 0.012818191129674257), ('two_adults_one_at_least_65_and_without_minors', 0.011020861353819474), ('five_adults_and_0_or_more_minors', 0.010165147967174923), ('adult_women_with_one_or_more_minors', 0.009203195023015774), ('single_men_aged_16_to_64', 0.00900669801511736), ('two_adults_over_35_and_one_adult_from_16_to_34', 0.00847631291385851), ('four_adults_and_0_or_more_minors', 0.00832984282020851), ('three_adults_and_0_or_more_minors', 0.006416752529150235), ('single_women_aged_16_to_64', 0.006133410885407788), ('two_adults_and_one_minor', 0.004640239562764312), ('single_women_aged_65_or_over', 0.003011009674323502), ('adult_men_with_one_or_more_minors', 0.0023908130892677792), ('two_adults_over_35_and_one_adult_from_16_to_34_and_one_minor', 0.0005130041577153036), ('only_minors', 2.4681269711565666e-05), ('fif

In [39]:
# Search over the number of features kept by RReliefF (1..10): for each count,
# fit an RReliefF -> RandomForestRegressor pipeline on the training split and
# score it on the held-out split, remembering the best-scoring count.
np.random.seed(144)
nof_list = np.arange(1, 11)

# Start below any attainable score: the regressor's score (R^2) can be
# negative, and a starting value of 0 would then never be beaten, leaving
# `nof` at 0 and reporting a score that no model actually achieved.
high_score = -np.inf
nof = 0
score_list = []

# The split uses a fixed random_state, so it is identical on every iteration;
# compute it once instead of inside the loop.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

for n_features in nof_list:
    fs = sr.RReliefF(n_features=n_features)
    relief = Pipeline([('fs', fs), ('m', RandomForestRegressor())])
    relief.fit(X_train, y_train)
    score = relief.score(X_test, y_test)
    score_list.append(score)
    print(f'NOF: {n_features}, Score: {score}')
    if score > high_score:
        high_score = score
        nof = n_features

# Single print: the original nested print(print(...)) also emitted a stray
# "None" line after the summary.
print(f'High Score: NOF: {nof}, Score: {high_score}')

NOF: 1, Score: 0.33928371925825307
NOF: 2, Score: 0.19688446387411873
NOF: 3, Score: 0.3188146067812194
NOF: 4, Score: 0.6122951240078838
NOF: 5, Score: 0.6476286603341066
NOF: 6, Score: 0.6834289456868796
NOF: 7, Score: 0.5237402306183462
NOF: 8, Score: 0.6603478330407011
NOF: 9, Score: 0.7417354301777586
NOF: 10, Score: 0.7427913018125931
High Score: NOF: 10, Score: 0.7427913018125931
None
