In [3]:
import pandas as pd
import numpy as np

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from pandas_profiling import ProfileReport

from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, MinMaxScaler, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_transformer, TransformedTargetRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer, KNNImputer

from feature_engine.creation import MathFeatures
from feature_engine.imputation import MeanMedianImputer

import seaborn as sns
import matplotlib.pyplot as plt

import optuna
from warnings import filterwarnings
filterwarnings('ignore')


  from pandas import MultiIndex, Int64Index
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)


In [4]:
def load_data(filename):
    """Lazily yield each line of ``filename`` as a list of comma-separated fields.

    The newline characters are stripped from both ends of every line before
    it is split on ``','``; no other parsing or type conversion is done, so
    every field is returned as a string (a blank line yields ``['']``).
    """
    with open(filename, 'r') as handle:
        yield from (raw_line.strip('\n').split(',') for raw_line in handle)

def keep_cols(df, thresh=.1):
    """Yield the labels of columns whose fraction of missing values is below ``thresh``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.
    thresh : float, default 0.1
        Exclusive upper bound on the per-column fraction of NaN values.

    Yields
    ------
    Column labels, in the frame's column order.
    """
    # Compute the missing fraction once for all columns instead of re-scanning
    # the whole frame several times per column.
    missing_fraction = df.isna().mean()
    # Iterate label/value pairs rather than indexing the Series with an
    # integer: integer keys on a label-indexed Series are deprecated as
    # positions and are treated as labels when the column labels are ints.
    for column, fraction in missing_fraction.items():
        if fraction < thresh:
            yield column

def get_df_info(df, filename):
    """Name and type the columns of ``df`` from an attribute-description file.

    Every line of ``filename`` whose first comma-separated field starts with
    ``@attribute`` is read as ``@attribute <name> ... <type>``: the second
    whitespace-separated token is taken as the column name and the last
    token as its declared type.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame with one column per ``@attribute`` line, in the same order.
        Its ``columns`` attribute is overwritten in place (pandas raises if
        the number of names does not match the number of columns).
    filename : str
        Path of the attribute-description file.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``'?'`` is replaced by NaN, columns declared
        ``numeric`` are cast to float and columns declared ``string`` are
        cast to str.
    """
    cols = []
    data_types = {}
    # Read the file the caller passed in; previously the path was hard-coded
    # to 'communities.names' and the ``filename`` argument was ignored.
    for item in load_data(filename):
        declaration = item[0]
        if declaration.startswith('@attribute'):
            tokens = declaration.split()
            col, d_type = tokens[1], tokens[-1]
            cols.append(col)
            # Group column names by their declared type.
            data_types.setdefault(d_type, []).append(col)

    df.columns = cols
    df = df.replace('?', np.nan)

    for k in data_types:
        if k == 'numeric':
            df[data_types[k]] = df[data_types[k]].astype(float)
        if k == 'string':
            df[data_types[k]] = df[data_types[k]].astype(str)

    return df

def clean_data(data_file, info_file):
    """Read ``data_file`` into a DataFrame and pass it through ``get_df_info``.

    The rows produced by ``load_data(data_file)`` are materialised into a
    frame, which is then handed to ``get_df_info`` together with
    ``info_file``. Only the resulting frame is returned; string columns are
    not label-encoded here (``transform_strings`` is not called).
    """
    raw_rows = list(load_data(data_file))
    raw_frame = pd.DataFrame(raw_rows)
    return get_df_info(raw_frame, info_file)

def transform_strings(df):
    """Label-encode every object-dtype column of ``df`` in place.

    Returns a ``(df, encoders)`` pair: the same frame that was passed in,
    with each object column overwritten by its ``LabelEncoder`` codes, and a
    dict mapping each encoded column name to the encoder fitted on it.
    """
    encoders = {}
    object_columns = df.select_dtypes('O').columns
    for name in object_columns:
        encoder = LabelEncoder()
        df[name] = encoder.fit_transform(df[name])
        encoders[name] = encoder

    return df, encoders



# Build the working frame from the data file and its attribute-description file.
df = clean_data('communities.data', 'communities.names')

In [5]:
# Display the frame returned by clean_data.
df

Unnamed: 0,state,county,community,communityname,fold,population,householdsize,racepctblack,racePctWhite,racePctAsian,...,LandArea,PopDens,PctUsePubTrans,PolicCars,PolicOperBudg,LemasPctPolicOnPatr,LemasGangUnitDeploy,LemasPctOfficDrugUn,PolicBudgPerPop,ViolentCrimesPerPop
0,8.0,,,Lakewoodcity,1.0,0.19,0.33,0.02,0.90,0.12,...,0.12,0.26,0.20,0.06,0.04,0.90,0.5,0.32,0.14,0.20
1,53.0,,,Tukwilacity,1.0,0.00,0.16,0.12,0.74,0.45,...,0.02,0.12,0.45,,,,,0.00,,0.67
2,24.0,,,Aberdeentown,1.0,0.00,0.42,0.49,0.56,0.17,...,0.01,0.21,0.02,,,,,0.00,,0.43
3,34.0,5.0,81440.0,Willingborotownship,1.0,0.04,0.77,1.00,0.08,0.12,...,0.02,0.39,0.28,,,,,0.00,,0.12
4,42.0,95.0,6096.0,Bethlehemtownship,1.0,0.01,0.55,0.02,0.95,0.09,...,0.04,0.09,0.02,,,,,0.00,,0.03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1989,12.0,,,TempleTerracecity,10.0,0.01,0.40,0.10,0.87,0.12,...,0.01,0.28,0.05,,,,,0.00,,0.09
1990,6.0,,,Seasidecity,10.0,0.05,0.96,0.46,0.28,0.83,...,0.02,0.37,0.20,,,,,0.00,,0.45
1991,9.0,9.0,80070.0,Waterburytown,10.0,0.16,0.37,0.25,0.69,0.04,...,0.08,0.32,0.18,0.08,0.06,0.78,0.0,0.91,0.28,0.23
1992,25.0,17.0,72600.0,Walthamcity,10.0,0.08,0.51,0.06,0.87,0.22,...,0.03,0.38,0.33,0.02,0.02,0.79,0.0,0.22,0.18,0.19


In [24]:
def objective(trial):
    """Optuna objective: fit one candidate pipeline and return its test-set MAE.

    For each trial the data is reloaded, rows containing any NaN are dropped,
    and a train/test split is drawn with a trial-suggested ``test_size``.
    The trial then chooses between an ``XGBRegressor`` and an
    ``LGBMRegressor`` (with model-specific hyper-parameters) and one of three
    scalers, and fits the pipeline
    ``imputation -> feature_engineering -> scaler -> model``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.

    Returns
    -------
    float
        Mean absolute error of the fitted pipeline on the held-out split.

    Side effects
    ------------
    Stores the fitted pipeline on the trial under the user attribute
    ``'best_pipeline'``.
    """
    # clean_data returns a single DataFrame (its transform_strings call is
    # commented out), so unpacking its result into two names fails. Encode
    # the string columns here instead; the encoders are not used further.
    df, _encoders = transform_strings(
        clean_data('communities.data', 'communities.names')
    )
    df.dropna(how='any', inplace=True)

    # The target is the last column; every other column is a predictor.
    numeric_features = [col for col in df.select_dtypes(float).columns if col != df.columns[-1]]
    x_cols = [col for col in df.columns if col != df.columns[-1]]

    # NOTE(review): rows with any NaN were dropped above, so this imputer has
    # nothing to fill on the data it is fitted on.
    imp = MeanMedianImputer(
        imputation_method='mean',
        variables=x_cols
    )

    X, y = df.iloc[:, :-1], df.iloc[:, -1]

    test_size = trial.suggest_categorical('test_size', np.arange(0.1, 0.4, 0.1))

    # NOTE(review): no random_state is set, so the split differs on every call.
    X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=test_size)

    ml_model = trial.suggest_categorical('ml_model', ['lgbm', 'xgboost'])

    if ml_model == 'xgboost':
        n_estimators = trial.suggest_int('n_estimators', 50, 300)
        eta = trial.suggest_categorical('eta', np.arange(0.01, 0.2, 0.01))
        gamma = trial.suggest_int('gamma', 0, 4)
        max_depth = trial.suggest_int('max_depth', 3, 10)
        min_child_weight = trial.suggest_int('min_child_weight', 0, 3)
        max_delta_step = trial.suggest_int('max_delta_step', 0, 3)
        subsample = trial.suggest_categorical('subsample', np.arange(0.5, 1, 0.1))

        model = XGBRegressor(
            n_estimators=n_estimators,
            eta=eta,
            gamma=gamma,
            max_depth=max_depth,
            min_child_weight=min_child_weight,
            max_delta_step=max_delta_step,
            subsample=subsample
        )
    else:
        n_estimators = trial.suggest_int('n_estimators', 50, 400)
        learning_rate = trial.suggest_categorical('learning_rate', np.arange(0.01, 0.3, 0.01))
        num_leaves = trial.suggest_int('num_leaves', 20, 1000, step=20)
        max_depth = trial.suggest_int('max_depth', 5, 200)
        max_bin = trial.suggest_int('max_bin', 200, 300)
        reg_alpha = trial.suggest_int('reg_alpha', 0, 100, step=5)
        reg_lambda = trial.suggest_int('reg_lambda', 0, 100, step=5)
        min_split_gain = trial.suggest_float('min_split_gain', 0, 15)

        model = LGBMRegressor(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            num_leaves=num_leaves,
            max_depth=max_depth,
            max_bin=max_bin,
            reg_alpha=reg_alpha,
            reg_lambda=reg_lambda,
            min_split_gain=min_split_gain
        )

    scaler = trial.suggest_categorical('scaler', ['standard', 'minmax', 'robust'])

    scalers = {
        'standard': StandardScaler(),
        'minmax': MinMaxScaler(),
        'robust': RobustScaler()
    }

    # Aggregate features computed across all float predictors.
    fe = MathFeatures(
        variables=numeric_features,
        func=['sum','prod','min','max','std']
    )

    pipe = Pipeline([
        ('imputation', imp),
        ('feature_engineering', fe),
        ('scaler', scalers[scaler]),
        ('model', model)
    ])

    pipe.fit(X_train, y_train)

    # Keep the fitted pipeline on the trial so callback can copy the best one
    # onto the study.
    trial.set_user_attr(key='best_pipeline', value=pipe)

    y_pred = pipe.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)

    return mae

def callback(study, trial):
    """Copy the finished trial's pipeline onto the study when it is the best trial.

    If ``trial`` is the study's current best trial, its ``'best_pipeline'``
    user attribute is stored on the study under the same key; otherwise
    nothing happens.
    """
    best_number = study.best_trial.number
    if best_number != trial.number:
        return

    winning_pipeline = trial.user_attrs['best_pipeline']
    study.set_user_attr(key='best_pipeline', value=winning_pipeline)


# Run the hyper-parameter search; `study` and `pipe` are left at module level.
if __name__ == "__main__":
    # Minimise the value returned by objective (the test-set MAE).
    study = optuna.create_study(direction='minimize')
    # callback copies the best trial's fitted pipeline onto the study after each trial.
    study.optimize(objective, n_trials=40, callbacks=[callback])
    # Fitted pipeline of the best trial, as stored on the study by callback.
    pipe = study.user_attrs['best_pipeline']



[32m[I 2023-01-13 22:58:41,679][0m A new study created in memory with name: no-name-ec0894df-fd34-4ca5-9247-84a8a6e0427c[0m
[32m[I 2023-01-13 22:58:42,599][0m Trial 0 finished with value: 0.20409421695245278 and parameters: {'test_size': 0.30000000000000004, 'ml_model': 'xgboost', 'n_estimators': 225, 'eta': 0.11, 'gamma': 3, 'max_depth': 3, 'min_child_weight': 2, 'max_delta_step': 3, 'subsample': 0.5, 'scaler': 'standard'}. Best is trial 0 with value: 0.20409421695245278.[0m
[32m[I 2023-01-13 22:58:43,462][0m Trial 1 finished with value: 0.10850926615595817 and parameters: {'test_size': 0.2, 'ml_model': 'xgboost', 'n_estimators': 292, 'eta': 0.14, 'gamma': 0, 'max_depth': 4, 'min_child_weight': 2, 'max_delta_step': 2, 'subsample': 0.7, 'scaler': 'robust'}. Best is trial 1 with value: 0.10850926615595817.[0m
[32m[I 2023-01-13 22:58:44,238][0m Trial 2 finished with value: 0.2391258742384144 and parameters: {'test_size': 0.1, 'ml_model': 'lgbm', 'n_estimators': 150, 'learning_

In [68]:
# Fetch the pipeline stored on the study under 'best_pipeline' and display it.
pipe = study.user_attrs['best_pipeline']
pipe

In [15]:
# Build a pandas-profiling report of df with the `minimal` option enabled.
profile = ProfileReport(df, minimal=True)

In [17]:
# Render the profiling report inside the notebook.
profile.to_notebook_iframe()

Render HTML: 100%|██████████| 1/1 [00:03<00:00,  3.73s/it]


In [84]:
# NOTE(review): keep_cols is also defined in the helper cell near the top of
# the notebook; this later definition shadows it.
def keep_cols(df, thresh=.1):
    """Yield the labels of columns whose fraction of missing values is below ``thresh``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.
    thresh : float, default 0.1
        Exclusive upper bound on the per-column fraction of NaN values.

    Yields
    ------
    Column labels, in the frame's column order.
    """
    # Compute the missing fraction once for all columns instead of re-scanning
    # the whole frame several times per column.
    missing_fraction = df.isna().mean()
    # Iterate label/value pairs rather than indexing the Series with an
    # integer: integer keys on a label-indexed Series are deprecated as
    # positions and are treated as labels when the column labels are ints.
    for column, fraction in missing_fraction.items():
        if fraction < thresh:
            yield column

# Print the first row of df as a Series (one entry per column).
print(df.iloc[0, :])

state                    8.00
county                    NaN
community                 NaN
communityname          804.00
fold                     1.00
                        ...  
LemasPctPolicOnPatr      0.90
LemasGangUnitDeploy      0.50
LemasPctOfficDrugUn      0.32
PolicBudgPerPop          0.14
ViolentCrimesPerPop      0.20
Name: 0, Length: 128, dtype: float64


In [85]:
# Columns whose missing fraction is below keep_cols' default threshold (0.1).
cols = list(keep_cols(df))

In [87]:
# Restrict df to the retained columns (rebinding the name df), then show its first row.
df = df[cols]
df.iloc[0, :]

state                    8.00
communityname          804.00
fold                     1.00
population               0.19
householdsize            0.33
                        ...  
LandArea                 0.12
PopDens                  0.26
PctUsePubTrans           0.20
LemasPctOfficDrugUn      0.32
ViolentCrimesPerPop      0.20
Name: 0, Length: 104, dtype: float64

In [91]:
# Per-column count of missing values in the current df.
print(df.isna().sum())

state                  0
communityname          0
fold                   0
population             0
householdsize          0
                      ..
LandArea               0
PopDens                0
PctUsePubTrans         0
LemasPctOfficDrugUn    0
ViolentCrimesPerPop    0
Length: 104, dtype: int64


In [4]:
# Display the frame.
# NOTE(review): the saved output shows communityname as integer codes, which the
# current clean_data does not produce (its transform_strings call is commented
# out) — presumably from an earlier run; confirm with Restart & Run All.
df

Unnamed: 0,state,county,community,communityname,fold,population,householdsize,racepctblack,racePctWhite,racePctAsian,...,LandArea,PopDens,PctUsePubTrans,PolicCars,PolicOperBudg,LemasPctPolicOnPatr,LemasGangUnitDeploy,LemasPctOfficDrugUn,PolicBudgPerPop,ViolentCrimesPerPop
0,8.0,,,804,1.0,0.19,0.33,0.02,0.90,0.12,...,0.12,0.26,0.20,0.06,0.04,0.90,0.5,0.32,0.14,0.20
1,53.0,,,1625,1.0,0.00,0.16,0.12,0.74,0.45,...,0.02,0.12,0.45,,,,,0.00,,0.67
2,24.0,,,1,1.0,0.00,0.42,0.49,0.56,0.17,...,0.01,0.21,0.02,,,,,0.00,,0.43
3,34.0,5.0,81440.0,1787,1.0,0.04,0.77,1.00,0.08,0.12,...,0.02,0.39,0.28,,,,,0.00,,0.12
4,42.0,95.0,6096.0,141,1.0,0.01,0.55,0.02,0.95,0.09,...,0.04,0.09,0.02,,,,,0.00,,0.03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1989,12.0,,,1597,10.0,0.01,0.40,0.10,0.87,0.12,...,0.01,0.28,0.05,,,,,0.00,,0.09
1990,6.0,,,1455,10.0,0.05,0.96,0.46,0.28,0.83,...,0.02,0.37,0.20,,,,,0.00,,0.45
1991,9.0,9.0,80070.0,1707,10.0,0.16,0.37,0.25,0.69,0.04,...,0.08,0.32,0.18,0.08,0.06,0.78,0.0,0.91,0.28,0.23
1992,25.0,17.0,72600.0,1695,10.0,0.08,0.51,0.06,0.87,0.22,...,0.03,0.38,0.33,0.02,0.02,0.79,0.0,0.22,0.18,0.19


In [5]:
# Name of the last column, which objective uses as the prediction target.
df.columns[-1]

'ViolentCrimesPerPop'

In [14]:
# Float-typed columns of df, excluding the last column (the target).
numeric_features = [col for col in df.select_dtypes(float).columns if col != df.columns[-1]]

# Aggregate features across all numeric_features. This list has no 'max',
# unlike the MathFeatures step built inside objective.
# NOTE(review): numeric_features includes identifier-like columns such as
# state/county/community/fold — confirm they belong in these aggregates.
fe = MathFeatures(
    variables=numeric_features,
    func=['sum','prod','min','std']
)

# Rows containing any NaN are dropped before the transformer is fitted and applied.
fe.fit_transform(df.dropna())

Unnamed: 0,state,county,community,communityname,fold,population,householdsize,racepctblack,racePctWhite,racePctAsian,...,PolicOperBudg,LemasPctPolicOnPatr,LemasGangUnitDeploy,LemasPctOfficDrugUn,PolicBudgPerPop,ViolentCrimesPerPop,sum_state_county_community_fold_population_householdsize_racepctblack_racePctWhite_racePctAsian_racePctHisp_agePct12t21_agePct12t29_agePct16t24_agePct65up_numbUrban_pctUrban_medIncome_pctWWage_pctWFarmSelf_pctWInvInc_pctWSocSec_pctWPubAsst_pctWRetire_medFamInc_perCapInc_whitePerCap_blackPerCap_indianPerCap_AsianPerCap_OtherPerCap_HispPerCap_NumUnderPov_PctPopUnderPov_PctLess9thGrade_PctNotHSGrad_PctBSorMore_PctUnemployed_PctEmploy_PctEmplManu_PctEmplProfServ_PctOccupManu_PctOccupMgmtProf_MalePctDivorce_MalePctNevMarr_FemalePctDiv_TotalPctDiv_PersPerFam_PctFam2Par_PctKids2Par_PctYoungKids2Par_PctTeen2Par_PctWorkMomYoungKids_PctWorkMom_NumIlleg_PctIlleg_NumImmig_PctImmigRecent_PctImmigRec5_PctImmigRec8_PctImmigRec10_PctRecentImmig_PctRecImmig5_PctRecImmig8_PctRecImmig10_PctSpeakEnglOnly_PctNotSpeakEnglWell_PctLargHouseFam_PctLargHouseOccup_PersPerOccupHous_PersPerOwnOccHous_PersPerRentOccHous_PctPersOwnOccup_PctPersDenseHous_PctHousLess3BR_MedNumBR_HousVacant_PctHousOccup_PctHousOwnOcc_PctVacantBoarded_PctVacMore6Mos_MedYrHousBuilt_PctHousNoPhone_PctWOFullPlumb_OwnOccLowQuart_OwnOccMedVal_OwnOccHiQuart_RentLowQ_RentMedian_RentHighQ_MedRent_MedRentPctHousInc_MedOwnCostPctInc_MedOwnCostPctIncNoMtg_NumInShelters_NumStreet_PctForeignBorn_PctBornSameState_PctSameHouse85_PctSameCity85_PctSameState85_LemasSwornFT_LemasSwFTPerPop_LemasSwFTFieldOps_LemasSwFTFieldPerPop_LemasTotalReq_LemasTotReqPerPop_PolicReqPerOffic_PolicPerPop_RacialMatchCommPol_PctPolicWhite_PctPolicBlack_PctPolicHisp_PctPolicAsian_PctPolicMinor_OfficAssgnDrugUnits_NumKindsDrugsSeiz_PolicAveOTWorked_LandArea_PopDens_PctUsePubTrans_PolicCars_PolicOperBudg_LemasPctPolicOnPatr_LemasGangUnitDeploy_LemasPctOfficDrugUn_PolicBudgPerPop,prod_state_county_community_fold_population_househol
dsize_racepctblack_racePctWhite_racePctAsian_racePctHisp_agePct12t21_agePct12t29_agePct16t24_agePct65up_numbUrban_pctUrban_medIncome_pctWWage_pctWFarmSelf_pctWInvInc_pctWSocSec_pctWPubAsst_pctWRetire_medFamInc_perCapInc_whitePerCap_blackPerCap_indianPerCap_AsianPerCap_OtherPerCap_HispPerCap_NumUnderPov_PctPopUnderPov_PctLess9thGrade_PctNotHSGrad_PctBSorMore_PctUnemployed_PctEmploy_PctEmplManu_PctEmplProfServ_PctOccupManu_PctOccupMgmtProf_MalePctDivorce_MalePctNevMarr_FemalePctDiv_TotalPctDiv_PersPerFam_PctFam2Par_PctKids2Par_PctYoungKids2Par_PctTeen2Par_PctWorkMomYoungKids_PctWorkMom_NumIlleg_PctIlleg_NumImmig_PctImmigRecent_PctImmigRec5_PctImmigRec8_PctImmigRec10_PctRecentImmig_PctRecImmig5_PctRecImmig8_PctRecImmig10_PctSpeakEnglOnly_PctNotSpeakEnglWell_PctLargHouseFam_PctLargHouseOccup_PersPerOccupHous_PersPerOwnOccHous_PersPerRentOccHous_PctPersOwnOccup_PctPersDenseHous_PctHousLess3BR_MedNumBR_HousVacant_PctHousOccup_PctHousOwnOcc_PctVacantBoarded_PctVacMore6Mos_MedYrHousBuilt_PctHousNoPhone_PctWOFullPlumb_OwnOccLowQuart_OwnOccMedVal_OwnOccHiQuart_RentLowQ_RentMedian_RentHighQ_MedRent_MedRentPctHousInc_MedOwnCostPctInc_MedOwnCostPctIncNoMtg_NumInShelters_NumStreet_PctForeignBorn_PctBornSameState_PctSameHouse85_PctSameCity85_PctSameState85_LemasSwornFT_LemasSwFTPerPop_LemasSwFTFieldOps_LemasSwFTFieldPerPop_LemasTotalReq_LemasTotReqPerPop_PolicReqPerOffic_PolicPerPop_RacialMatchCommPol_PctPolicWhite_PctPolicBlack_PctPolicHisp_PctPolicAsian_PctPolicMinor_OfficAssgnDrugUnits_NumKindsDrugsSeiz_PolicAveOTWorked_LandArea_PopDens_PctUsePubTrans_PolicCars_PolicOperBudg_LemasPctPolicOnPatr_LemasGangUnitDeploy_LemasPctOfficDrugUn_PolicBudgPerPop,min_state_county_community_fold_population_householdsize_racepctblack_racePctWhite_racePctAsian_racePctHisp_agePct12t21_agePct12t29_agePct16t24_agePct65up_numbUrban_pctUrban_medIncome_pctWWage_pctWFarmSelf_pctWInvInc_pctWSocSec_pctWPubAsst_pctWRetire_medFamInc_perCapInc_whitePerCap_blackPerCap_indianPerCap_AsianPerCap_OtherPerCap_Hi
spPerCap_NumUnderPov_PctPopUnderPov_PctLess9thGrade_PctNotHSGrad_PctBSorMore_PctUnemployed_PctEmploy_PctEmplManu_PctEmplProfServ_PctOccupManu_PctOccupMgmtProf_MalePctDivorce_MalePctNevMarr_FemalePctDiv_TotalPctDiv_PersPerFam_PctFam2Par_PctKids2Par_PctYoungKids2Par_PctTeen2Par_PctWorkMomYoungKids_PctWorkMom_NumIlleg_PctIlleg_NumImmig_PctImmigRecent_PctImmigRec5_PctImmigRec8_PctImmigRec10_PctRecentImmig_PctRecImmig5_PctRecImmig8_PctRecImmig10_PctSpeakEnglOnly_PctNotSpeakEnglWell_PctLargHouseFam_PctLargHouseOccup_PersPerOccupHous_PersPerOwnOccHous_PersPerRentOccHous_PctPersOwnOccup_PctPersDenseHous_PctHousLess3BR_MedNumBR_HousVacant_PctHousOccup_PctHousOwnOcc_PctVacantBoarded_PctVacMore6Mos_MedYrHousBuilt_PctHousNoPhone_PctWOFullPlumb_OwnOccLowQuart_OwnOccMedVal_OwnOccHiQuart_RentLowQ_RentMedian_RentHighQ_MedRent_MedRentPctHousInc_MedOwnCostPctInc_MedOwnCostPctIncNoMtg_NumInShelters_NumStreet_PctForeignBorn_PctBornSameState_PctSameHouse85_PctSameCity85_PctSameState85_LemasSwornFT_LemasSwFTPerPop_LemasSwFTFieldOps_LemasSwFTFieldPerPop_LemasTotalReq_LemasTotReqPerPop_PolicReqPerOffic_PolicPerPop_RacialMatchCommPol_PctPolicWhite_PctPolicBlack_PctPolicHisp_PctPolicAsian_PctPolicMinor_OfficAssgnDrugUnits_NumKindsDrugsSeiz_PolicAveOTWorked_LandArea_PopDens_PctUsePubTrans_PolicCars_PolicOperBudg_LemasPctPolicOnPatr_LemasGangUnitDeploy_LemasPctOfficDrugUn_PolicBudgPerPop,std_state_county_community_fold_population_householdsize_racepctblack_racePctWhite_racePctAsian_racePctHisp_agePct12t21_agePct12t29_agePct16t24_agePct65up_numbUrban_pctUrban_medIncome_pctWWage_pctWFarmSelf_pctWInvInc_pctWSocSec_pctWPubAsst_pctWRetire_medFamInc_perCapInc_whitePerCap_blackPerCap_indianPerCap_AsianPerCap_OtherPerCap_HispPerCap_NumUnderPov_PctPopUnderPov_PctLess9thGrade_PctNotHSGrad_PctBSorMore_PctUnemployed_PctEmploy_PctEmplManu_PctEmplProfServ_PctOccupManu_PctOccupMgmtProf_MalePctDivorce_MalePctNevMarr_FemalePctDiv_TotalPctDiv_PersPerFam_PctFam2Par_PctKids2Par_PctYoungKids2Par_PctTeen2Par_PctWor
kMomYoungKids_PctWorkMom_NumIlleg_PctIlleg_NumImmig_PctImmigRecent_PctImmigRec5_PctImmigRec8_PctImmigRec10_PctRecentImmig_PctRecImmig5_PctRecImmig8_PctRecImmig10_PctSpeakEnglOnly_PctNotSpeakEnglWell_PctLargHouseFam_PctLargHouseOccup_PersPerOccupHous_PersPerOwnOccHous_PersPerRentOccHous_PctPersOwnOccup_PctPersDenseHous_PctHousLess3BR_MedNumBR_HousVacant_PctHousOccup_PctHousOwnOcc_PctVacantBoarded_PctVacMore6Mos_MedYrHousBuilt_PctHousNoPhone_PctWOFullPlumb_OwnOccLowQuart_OwnOccMedVal_OwnOccHiQuart_RentLowQ_RentMedian_RentHighQ_MedRent_MedRentPctHousInc_MedOwnCostPctInc_MedOwnCostPctIncNoMtg_NumInShelters_NumStreet_PctForeignBorn_PctBornSameState_PctSameHouse85_PctSameCity85_PctSameState85_LemasSwornFT_LemasSwFTPerPop_LemasSwFTFieldOps_LemasSwFTFieldPerPop_LemasTotalReq_LemasTotReqPerPop_PolicReqPerOffic_PolicPerPop_RacialMatchCommPol_PctPolicWhite_PctPolicBlack_PctPolicHisp_PctPolicAsian_PctPolicMinor_OfficAssgnDrugUnits_NumKindsDrugsSeiz_PolicAveOTWorked_LandArea_PopDens_PctUsePubTrans_PolicCars_PolicOperBudg_LemasPctPolicOnPatr_LemasGangUnitDeploy_LemasPctOfficDrugUn_PolicBudgPerPop
16,36.0,1.0,1000.0,9,1.0,0.15,0.31,0.40,0.63,0.14,...,0.06,0.91,0.5,0.88,0.26,0.49,1082.00,0.0,0.0,89.085549
23,19.0,193.0,93926.0,1488,1.0,0.11,0.43,0.04,0.89,0.09,...,0.01,0.81,1.0,0.56,0.09,0.63,94180.85,0.0,0.0,8367.429247
33,51.0,680.0,47672.0,884,1.0,0.09,0.43,0.51,0.58,0.04,...,0.01,0.75,0.0,0.60,0.10,0.31,48445.13,0.0,0.0,4246.841813
68,34.0,23.0,58200.0,1234,1.0,0.05,0.59,0.23,0.39,0.09,...,0.02,0.64,0.0,1.00,0.23,0.50,58310.59,0.0,0.0,5184.790526
74,9.0,9.0,46520.0,957,1.0,0.08,0.39,0.08,0.85,0.04,...,0.01,0.70,1.0,0.44,0.11,0.14,46581.79,0.0,0.0,4144.287103
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1880,34.0,39.0,40350.0,840,10.0,0.04,0.39,0.39,0.65,0.09,...,0.01,0.85,0.0,0.99,0.19,0.22,40478.31,0.0,0.0,3594.575228
1963,36.0,27.0,59641.0,1293,10.0,0.03,0.32,0.61,0.47,0.09,...,0.08,0.49,0.0,0.37,1.00,0.45,59758.06,0.0,0.0,5313.160682
1981,9.0,9.0,35650.0,635,10.0,0.07,0.38,0.17,0.84,0.11,...,0.01,0.72,0.0,0.62,0.15,0.07,35721.86,0.0,0.0,3175.903519
1991,9.0,9.0,80070.0,1707,10.0,0.16,0.37,0.25,0.69,0.04,...,0.06,0.78,0.0,0.91,0.28,0.23,80141.75,0.0,0.0,7133.151546


In [13]:
# Display the list of float predictor columns computed above.
numeric_features

['state',
 'county',
 'community',
 'fold',
 'population',
 'householdsize',
 'racepctblack',
 'racePctWhite',
 'racePctAsian',
 'racePctHisp',
 'agePct12t21',
 'agePct12t29',
 'agePct16t24',
 'agePct65up',
 'numbUrban',
 'pctUrban',
 'medIncome',
 'pctWWage',
 'pctWFarmSelf',
 'pctWInvInc',
 'pctWSocSec',
 'pctWPubAsst',
 'pctWRetire',
 'medFamInc',
 'perCapInc',
 'whitePerCap',
 'blackPerCap',
 'indianPerCap',
 'AsianPerCap',
 'OtherPerCap',
 'HispPerCap',
 'NumUnderPov',
 'PctPopUnderPov',
 'PctLess9thGrade',
 'PctNotHSGrad',
 'PctBSorMore',
 'PctUnemployed',
 'PctEmploy',
 'PctEmplManu',
 'PctEmplProfServ',
 'PctOccupManu',
 'PctOccupMgmtProf',
 'MalePctDivorce',
 'MalePctNevMarr',
 'FemalePctDiv',
 'TotalPctDiv',
 'PersPerFam',
 'PctFam2Par',
 'PctKids2Par',
 'PctYoungKids2Par',
 'PctTeen2Par',
 'PctWorkMomYoungKids',
 'PctWorkMom',
 'NumIlleg',
 'PctIlleg',
 'NumImmig',
 'PctImmigRecent',
 'PctImmigRec5',
 'PctImmigRec8',
 'PctImmigRec10',
 'PctRecentImmig',
 'PctRecImmig5',
 'Pc