In [1]:
%matplotlib inline 

import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import sklearn
import statsmodels.api as sm

import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

In [2]:
data = pd.read_csv('data/CommViolPredUnnormalizedData.csv', na_values=["?"])
data.head()

Unnamed: 0,communityname,state,countyCode,communityCode,fold,population,householdsize,racepctblack,racePctWhite,racePctAsian,...,burglaries,burglPerPop,larcenies,larcPerPop,autoTheft,autoTheftPerPop,arsons,arsonsPerPop,ViolentCrimesPerPop,nonViolPerPop
0,BerkeleyHeightstownship,NJ,39.0,5320.0,1,11980,3.1,1.37,91.78,6.5,...,14.0,114.85,138.0,1132.08,16.0,131.26,2.0,16.41,41.02,1394.59
1,Marpletownship,PA,45.0,47616.0,1,23123,2.82,0.8,95.57,3.44,...,57.0,242.37,376.0,1598.78,26.0,110.55,1.0,4.25,127.56,1955.95
2,Tigardcity,OR,,,1,29344,2.43,0.74,94.33,3.43,...,274.0,758.14,1797.0,4972.19,136.0,376.3,22.0,60.87,218.59,6167.51
3,Gloversvillecity,NY,35.0,29443.0,1,16656,2.4,1.7,97.35,0.5,...,225.0,1301.78,716.0,4142.56,47.0,271.93,,,306.64,
4,Bemidjicity,MN,7.0,5068.0,1,11245,2.76,0.53,89.16,1.17,...,91.0,728.93,1060.0,8490.87,91.0,728.93,5.0,40.05,,9988.79


In [3]:
data.shape

(2215, 147)

In [4]:
data.drop(['communityname','state', 'countyCode', 'communityCode', 'fold'], axis=1, inplace=True)

In [5]:
data.columns

Index(['population', 'householdsize', 'racepctblack', 'racePctWhite',
       'racePctAsian', 'racePctHisp', 'agePct12t21', 'agePct12t29',
       'agePct16t24', 'agePct65up',
       ...
       'burglaries', 'burglPerPop', 'larcenies', 'larcPerPop', 'autoTheft',
       'autoTheftPerPop', 'arsons', 'arsonsPerPop', 'ViolentCrimesPerPop',
       'nonViolPerPop'],
      dtype='object', length=142)

In [6]:
data.isnull().any()

population             False
householdsize          False
racepctblack           False
racePctWhite           False
racePctAsian           False
racePctHisp            False
agePct12t21            False
agePct12t29            False
agePct16t24            False
agePct65up             False
numbUrban              False
pctUrban               False
medIncome              False
pctWWage               False
pctWFarmSelf           False
pctWInvInc             False
pctWSocSec             False
pctWPubAsst            False
pctWRetire             False
medFamInc              False
perCapInc              False
whitePerCap            False
blackPerCap            False
indianPerCap           False
AsianPerCap            False
OtherPerCap             True
HispPerCap             False
NumUnderPov            False
PctPopUnderPov         False
PctLess9thGrade        False
                       ...  
OfficAssgnDrugUnits     True
NumKindsDrugsSeiz       True
PolicAveOTWorked        True
LandArea      

In [9]:
data.isnull().sum()

population                0
householdsize             0
racepctblack              0
racePctWhite              0
racePctAsian              0
racePctHisp               0
agePct12t21               0
agePct12t29               0
agePct16t24               0
agePct65up                0
numbUrban                 0
pctUrban                  0
medIncome                 0
pctWWage                  0
pctWFarmSelf              0
pctWInvInc                0
pctWSocSec                0
pctWPubAsst               0
pctWRetire                0
medFamInc                 0
perCapInc                 0
whitePerCap               0
blackPerCap               0
indianPerCap              0
AsianPerCap               0
OtherPerCap               1
HispPerCap                0
NumUnderPov               0
PctPopUnderPov            0
PctLess9thGrade           0
                       ... 
OfficAssgnDrugUnits    1872
NumKindsDrugsSeiz      1872
PolicAveOTWorked       1872
LandArea                  0
PopDens             

In [18]:
data.iloc[-1]

communityname           WestSacramentocity
 state                                  CA
 countyCode                              ?
 communityCode                           ?
 fold                                   10
 population                          28898
 householdsize                        2.61
 racepctblack                         2.39
 racePctWhite                        71.27
 racePctAsian                         9.09
 racePctHisp                         24.43
 agePct12t21                         12.99
 agePct12t29                         25.21
 agePct16t24                         11.63
 agePct65up                          12.12
 numbUrban                           28664
 pctUrban                            99.19
 medIncome                           23287
 pctWWage                            68.89
 pctWFarmSelf                          1.2
 pctWInvInc                          27.54
 pctWSocSec                          28.62
 pctWPubAsst                         19.05
 pctWRetire

In [8]:
data.describe()

Unnamed: 0,population,householdsize,racepctblack,racePctWhite,racePctAsian,racePctHisp,agePct12t21,agePct12t29,agePct16t24,agePct65up,...,burglaries,burglPerPop,larcenies,larcPerPop,autoTheft,autoTheftPerPop,arsons,arsonsPerPop,ViolentCrimesPerPop,nonViolPerPop
count,2215.0,2215.0,2215.0,2215.0,2215.0,2215.0,2215.0,2215.0,2215.0,2215.0,...,2212.0,2212.0,2212.0,2212.0,2212.0,2212.0,2124.0,2124.0,1994.0,2118.0
mean,53117.98,2.707327,9.335102,83.979819,2.670203,7.950176,14.445837,27.64484,13.975142,11.836393,...,761.23689,1033.430203,2137.629295,3372.97915,516.692586,473.965628,30.907721,32.153682,589.078922,4908.241804
std,204620.3,0.33412,14.247156,16.41908,4.473843,14.589832,4.518623,6.181517,5.970747,4.777565,...,3111.702756,763.354442,7600.573464,1901.316145,3258.164244,504.666026,180.125248,39.2409,614.784518,2739.708901
min,10005.0,1.6,0.0,2.68,0.03,0.12,4.58,9.38,4.64,1.66,...,2.0,16.92,10.0,77.86,1.0,6.55,0.0,0.0,0.0,116.79
25%,14366.0,2.5,0.86,76.32,0.62,0.93,12.25,24.415,11.32,8.75,...,95.0,511.69,392.0,2040.08,30.0,156.9525,1.0,7.67,161.7,2918.07
50%,22792.0,2.66,2.87,90.35,1.23,2.18,13.62,26.78,12.54,11.73,...,205.0,822.715,747.0,3079.51,75.0,302.355,5.0,21.08,374.06,4425.45
75%,43024.0,2.85,11.145,96.225,2.67,7.81,15.36,29.205,14.345,14.415,...,508.0,1350.2325,1675.0,4335.41,232.5,589.775,16.0,42.8525,794.4,6229.28
max,7322564.0,5.28,96.67,99.63,57.46,95.29,54.4,70.51,63.62,52.77,...,99207.0,11881.02,235132.0,25910.55,112464.0,4968.59,5119.0,436.37,4877.06,27119.76


In [10]:
# horizontal drop na from nonViolPerPop
data = data[~data.nonViolPerPop.isnull()]

In [11]:
data.nonViolPerPop.isnull().any()

False

In [13]:
clean = data.dropna(axis=1)

In [14]:
clean.shape

(2118, 112)

In [15]:
clean.columns[:100]

Index(['population', 'householdsize', 'racepctblack', 'racePctWhite',
       'racePctAsian', 'racePctHisp', 'agePct12t21', 'agePct12t29',
       'agePct16t24', 'agePct65up', 'numbUrban', 'pctUrban', 'medIncome',
       'pctWWage', 'pctWFarmSelf', 'pctWInvInc', 'pctWSocSec', 'pctWPubAsst',
       'pctWRetire', 'medFamInc', 'perCapInc', 'whitePerCap', 'blackPerCap',
       'indianPerCap', 'AsianPerCap', 'HispPerCap', 'NumUnderPov',
       'PctPopUnderPov', 'PctLess9thGrade', 'PctNotHSGrad', 'PctBSorMore',
       'PctUnemployed', 'PctEmploy', 'PctEmplManu', 'PctEmplProfServ',
       'PctOccupManu', 'PctOccupMgmtProf', 'MalePctDivorce', 'MalePctNevMarr',
       'FemalePctDiv', 'TotalPctDiv', 'PersPerFam', 'PctFam2Par',
       'PctKids2Par', 'PctYoungKids2Par', 'PctTeen2Par', 'PctWorkMomYoungKids',
       'PctWorkMom', 'NumKidsBornNeverMar', 'PctKidsBornNeverMar', 'NumImmig',
       'PctImmigRecent', 'PctImmigRec5', 'PctImmigRec8', 'PctImmigRec10',
       'PctRecentImmig', 'PctRecImmig5', '

In [16]:
clean.dtypes

population                 int64
householdsize            float64
racepctblack             float64
racePctWhite             float64
racePctAsian             float64
racePctHisp              float64
agePct12t21              float64
agePct12t29              float64
agePct16t24              float64
agePct65up               float64
numbUrban                  int64
pctUrban                 float64
medIncome                  int64
pctWWage                 float64
pctWFarmSelf             float64
pctWInvInc               float64
pctWSocSec               float64
pctWPubAsst              float64
pctWRetire               float64
medFamInc                  int64
perCapInc                  int64
whitePerCap                int64
blackPerCap                int64
indianPerCap               int64
AsianPerCap                int64
HispPerCap                 int64
NumUnderPov                int64
PctPopUnderPov           float64
PctLess9thGrade          float64
PctNotHSGrad             float64
          

In [20]:
clean.isnull().sum()

population               0
householdsize            0
racepctblack             0
racePctWhite             0
racePctAsian             0
racePctHisp              0
agePct12t21              0
agePct12t29              0
agePct16t24              0
agePct65up               0
numbUrban                0
pctUrban                 0
medIncome                0
pctWWage                 0
pctWFarmSelf             0
pctWInvInc               0
pctWSocSec               0
pctWPubAsst              0
pctWRetire               0
medFamInc                0
perCapInc                0
whitePerCap              0
blackPerCap              0
indianPerCap             0
AsianPerCap              0
HispPerCap               0
NumUnderPov              0
PctPopUnderPov           0
PctLess9thGrade          0
PctNotHSGrad             0
                        ..
RentLowQ                 0
RentMedian               0
RentHighQ                0
RentQrange               0
MedRent                  0
MedRentPctHousInc        0
M

In [22]:
# select the str columns
df_str = clean.select_dtypes(exclude=['int64', 'float64'])
print(df_str)

Empty DataFrame
Columns: []
Index: [0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 33, 34, 35, 36, 37, 38, 39, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 62, 63, 64, 65, 66, 67, 68, 69, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, ...]

[2118 rows x 0 columns]


In [17]:
X = clean.values[:,:100]

print(len(X))

y = clean.values[:,111]

print(len(y))

2118
2118


In [18]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
# evaluate an LDA model on the dataset using k-fold cross validation
model = LinearRegression()
kfold = KFold(n_splits=10, random_state=7)

result = cross_val_score(model, X, y, cv=kfold, scoring='r2')
print(result.mean())

0.540033657255


In [19]:
# feature extraction
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

# Instantiate a SelectKBest object with scoring funcion and how many columns you want to select
skb= SelectKBest(score_func=f_regression, k=4)

# Fit your X and y
fit = skb.fit(X,y)

# To observe scores, zip column names and scores, then sort them in descending order.
feat_list = list(zip(data[:100].columns, skb.scores_, skb.pvalues_))
feat_list.sort(key=lambda x: x[2], reverse=False)
pd.DataFrame(feat_list, columns=['feature','f-score','p-value'])

Unnamed: 0,feature,f-score,p-value
0,PctFam2Par,1716.492336,3.059567e-275
1,PersPerFam,1651.826963,2.038351e-267
2,PctYoungKids2Par,1340.779083,8.422817e-228
3,PctKids2Par,1291.696502,3.172317e-221
4,FemalePctDiv,1226.887755,2.143630e-212
5,MalePctNevMarr,1179.257163,8.522816e-206
6,PctOccupMgmtProf,1106.253579,1.711929e-195
7,NumKidsBornNeverMar,967.485845,3.016903e-175
8,NumUnderPov,779.902666,2.246930e-146
9,PersPerRentOccHous,725.517048,1.186798e-137


In [21]:
X.shape

(2118, 100)

In [22]:
X_skb = skb.transform(X)
X_skb.shape

(2118, 4)

# 2 Fit and score a classifier on the full data and skb-transformed data

Shuffle-split cross-validation allows for control over the number of iterations independently of the training and test sizes, which can sometimes be helpful.

In [26]:
from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

In [27]:
rf_params = {
    'n_estimators':[10,100],
    'max_depth':[10,50,None]
}

In [28]:
rf_gs = GridSearchCV(RandomForestRegressor(random_state=42),
                     param_grid= rf_params,
                     cv=ShuffleSplit(n_splits=10,
                                     random_state=42),
                     n_jobs = -1, return_train_score=True)

In [29]:
rf_gs.fit(X, y)

GridSearchCV(cv=ShuffleSplit(n_splits=10, random_state=42, test_size='default',
       train_size=None),
       error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'n_estimators': [10, 100], 'max_depth': [10, 50, None]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [30]:
orig_results = pd.DataFrame(rf_gs.cv_results_)

orig_results.T

Unnamed: 0,0,1,2,3,4,5
mean_fit_time,0.733811,7.14404,0.948837,9.40295,0.945228,9.3448
mean_score_time,0.00150604,0.00730503,0.00144792,0.00956018,0.00146492,0.0097893
mean_test_score,0.484873,0.545663,0.467633,0.542848,0.467633,0.542848
mean_train_score,0.858563,0.881772,0.904969,0.930077,0.904969,0.930077
param_max_depth,10,10,50,50,,
param_n_estimators,10,100,10,100,10,100
params,"{'max_depth': 10, 'n_estimators': 10}","{'max_depth': 10, 'n_estimators': 100}","{'max_depth': 50, 'n_estimators': 10}","{'max_depth': 50, 'n_estimators': 100}","{'max_depth': None, 'n_estimators': 10}","{'max_depth': None, 'n_estimators': 100}"
rank_test_score,4,1,5,2,5,2
split0_test_score,0.550654,0.552032,0.530271,0.551256,0.530271,0.551256
split0_train_score,0.855222,0.883309,0.905231,0.933121,0.905231,0.933121


In [31]:
rf_gs.fit(X_skb, y)

GridSearchCV(cv=ShuffleSplit(n_splits=10, random_state=42, test_size='default',
       train_size=None),
       error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'n_estimators': [10, 100], 'max_depth': [10, 50, None]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [32]:
skb_results = pd.DataFrame(rf_gs.cv_results_)

skb_results.T

Unnamed: 0,0,1,2,3,4,5
mean_fit_time,0.0515941,0.428453,0.0562636,0.567724,0.0573938,0.546266
mean_score_time,0.00131779,0.00712724,0.00135252,0.00961246,0.0014425,0.00924547
mean_test_score,0.400586,0.42449,0.35627,0.403944,0.35627,0.403944
mean_train_score,0.7706,0.791558,0.885538,0.914544,0.885538,0.914544
param_max_depth,10,10,50,50,,
param_n_estimators,10,100,10,100,10,100
params,"{'max_depth': 10, 'n_estimators': 10}","{'max_depth': 10, 'n_estimators': 100}","{'max_depth': 50, 'n_estimators': 10}","{'max_depth': 50, 'n_estimators': 100}","{'max_depth': None, 'n_estimators': 10}","{'max_depth': None, 'n_estimators': 100}"
rank_test_score,4,1,5,2,5,2
split0_test_score,0.449035,0.446771,0.392812,0.421619,0.392812,0.421619
split0_train_score,0.768576,0.792469,0.884528,0.916061,0.884528,0.916061


## Select from model

In [34]:
from sklearn.feature_selection import SelectFromModel

In [35]:
# Instantiate a SelectFromModel object with an estimator (any model with coefs_ or feature_importances_) and 
sfm = SelectFromModel(RandomForestRegressor(n_estimators=100, random_state=42))

# Fit your X and y
sfm.fit(X, y)

SelectFromModel(estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False),
        norm_order=1, prefit=False, threshold=None)

In [36]:
# SelectFromModel has no .scores_ attribute. Instead, let's look at .get_support()
sfm.get_support()

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False,  True,
       False, False, False, False, False, False, False, False, False,
       False,  True, False,  True,  True, False,  True,  True, False,
        True, False, False,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,  True, False,  True, False, False,  True,  True,
       False, False, False, False, False, False, False, False, False,
       False,  True, False, False, False, False, False, False, False,
       False,  True, False, False, False, False, False, False, False, False], dtype=bool)

In [37]:
# To observe scores, zip column names and scores, then sort them in descending order.
feat_list_sfm = list(zip(data[:100].columns,sfm.get_support()))
feat_list_sfm.sort(key=lambda x: x[1], reverse=True)
pd.DataFrame(feat_list_sfm, columns=['Feature','support'])

Unnamed: 0,Feature,support
0,HispPerCap,True
1,PctOccupMgmtProf,True
2,MalePctNevMarr,True
3,FemalePctDiv,True
4,PersPerFam,True
5,PctFam2Par,True
6,PctYoungKids2Par,True
7,PctWorkMom,True
8,PersPerOwnOccHous,True
9,PctPersOwnOccup,True


In [38]:
rf_feat_list = list(zip(data[:100].columns, sfm.estimator_.feature_importances_))
rf_feat_list.sort(key=lambda x: x[1], reverse=True)
pd.DataFrame(rf_feat_list, columns=['feature','importance'])

Unnamed: 0,feature,importance
0,PctFam2Par,0.349483
1,PersPerFam,0.075918
2,FemalePctDiv,0.030466
3,PctYoungKids2Par,0.029129
4,OwnOccQrange,0.027713
5,PctOccupMgmtProf,0.018202
6,MalePctNevMarr,0.016981
7,NumInShelters,0.016535
8,MedNumBR,0.015696
9,HousVacant,0.014016


In [40]:
X_sfm_rf = sfm.transform(X)
X_sfm_rf.shape

(2118, 14)

In [41]:
from sklearn.model_selection import cross_val_score

In [42]:
display(sfm.estimator_)

display(cross_val_score(sfm.estimator_, X, y, cv=3, n_jobs=-1).mean())

display(cross_val_score(sfm.estimator_, X_sfm_rf, y, cv=3, n_jobs=-1).mean())

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

0.51343727940382078

0.49092177541908022

In [44]:
from sklearn.linear_model import Ridge, Lasso, ElasticNet

In [45]:
display(cross_val_score(Lasso(), X, y, cv=3, n_jobs=-1).mean())

display(cross_val_score(Lasso(), X_sfm_rf, y, cv=3, n_jobs=-1).mean())



0.52428870791328985



0.47036243437867076

In [46]:
enet_params = {
    'alpha':np.logspace(-3,3)
}

In [47]:
enet_gs = GridSearchCV(ElasticNet(), param_grid=enet_params,
                       n_jobs=-1, cv=ShuffleSplit())

In [48]:
enet_gs.fit(X, y)



































GridSearchCV(cv=ShuffleSplit(n_splits=10, random_state=None, test_size='default',
       train_size=None),
       error_score='raise',
       estimator=ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'alpha': array([  1.00000e-03,   1.32571e-03,   1.75751e-03,   2.32995e-03,
         3.08884e-03,   4.09492e-03,   5.42868e-03,   7.19686e-03,
         9.54095e-03,   1.26486e-02,   1.67683e-02,   2.22300e-02,
         2.94705e-02,   3.90694e-02,   5.17947e-02,   6.86649e-02,
         9....    2.44205e+02,   3.23746e+02,   4.29193e+02,   5.68987e+02,
         7.54312e+02,   1.00000e+03])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [49]:
sfm_enet = SelectFromModel(enet_gs.best_estimator_, prefit=True)

In [50]:
sfm_enet.get_support()

array([False, False, False, False, False, False,  True,  True,  True,
        True, False, False, False, False, False, False,  True, False,
        True, False, False, False, False, False, False, False, False,
        True,  True,  True,  True, False,  True, False, False, False,
        True,  True,  True,  True,  True, False, False,  True, False,
       False, False, False, False,  True, False, False,  True, False,
       False,  True,  True,  True,  True,  True, False, False, False,
       False, False,  True,  True, False, False,  True, False,  True,
        True, False, False, False,  True, False, False, False, False,
       False, False, False, False, False, False, False,  True, False,
       False, False,  True, False, False, False, False, False, False, False], dtype=bool)

In [51]:
sfm_enet_results = list(zip(data[:100].columns,
                            sfm_enet.estimator.coef_,
                            sfm_enet.get_support()))

sfm_enet_results.sort(key=lambda x: x[1], reverse=True)

In [52]:
sfm_enet_results

[('PctOccupMgmtProf', 102.97202768224612, True),
 ('pctWSocSec', 94.74516676523713, True),
 ('NumStreet', 90.389579331876831, True),
 ('PctUnemployed', 86.152576891012316, True),
 ('MalePctDivorce', 83.58517213048647, True),
 ('FemalePctDiv', 64.246068484391088, True),
 ('NumUnderPov', 53.334703737840293, True),
 ('NumKidsBornNeverMar', 48.926377206834289, True),
 ('PctHousOccup', 44.482804533865455, True),
 ('PctOccupManu', 43.407302579580453, True),
 ('MalePctNevMarr', 34.399602921399008, True),
 ('MedYrHousBuilt', 29.595647115679522, True),
 ('agePct65up', 26.428970937468762, True),
 ('agePct12t21', 24.597595467044346, True),
 ('PctSpeakEnglOnly', 20.191350724441357, False),
 ('pctWWage', 18.823272852447211, False),
 ('PctVacMore6Mos', 18.461278802142207, False),
 ('PctEmplProfServ', 17.916755688307809, False),
 ('PctNotSpeakEnglWell', 17.864015318425633, False),
 ('PctLargHouseFam', 17.81226387774857, False),
 ('PctPersOwnOccup', 16.647233496079487, False),
 ('PctImmigRec5', 15.155

## Recursive Feature Elimination

In [141]:
clean['population'].astype(np.float64, copy=False)
clean['population'].dtypes

dtype('int64')

In [104]:
np.dtype('float64')

dtype('float64')

In [142]:
clean.dtypes

population                 int64
householdsize            float64
racepctblack             float64
racePctWhite             float64
racePctAsian             float64
racePctHisp              float64
agePct12t21              float64
agePct12t29              float64
agePct16t24              float64
agePct65up               float64
numbUrban                  int64
pctUrban                 float64
medIncome                  int64
pctWWage                 float64
pctWFarmSelf             float64
pctWInvInc               float64
pctWSocSec               float64
pctWPubAsst              float64
pctWRetire               float64
medFamInc                  int64
perCapInc                  int64
whitePerCap                int64
blackPerCap                int64
indianPerCap               int64
AsianPerCap                int64
HispPerCap                 int64
NumUnderPov                int64
PctPopUnderPov           float64
PctLess9thGrade          float64
PctNotHSGrad             float64
          

In [143]:
unclean = []
for col in clean.columns:
    if clean[col].dtypes == np.dtype('int64'):
        clean[col].values.astype(np.float64, copy=False)
        unclean.append(col)
print(unclean)
clean.population.dtypes != np.dtype('int64')

['population', 'numbUrban', 'medIncome', 'medFamInc', 'perCapInc', 'whitePerCap', 'blackPerCap', 'indianPerCap', 'AsianPerCap', 'HispPerCap', 'NumUnderPov', 'NumKidsBornNeverMar', 'NumImmig', 'MedNumBR', 'HousVacant', 'MedYrHousBuilt', 'OwnOccLowQuart', 'OwnOccMedVal', 'OwnOccHiQuart', 'OwnOccQrange', 'RentLowQ', 'RentMedian', 'RentHighQ', 'RentQrange', 'MedRent', 'NumInShelters', 'NumStreet', 'murders']


False

In [153]:
for i, val in clean.loc[0,:].iteritems():
    if val == 5117.28:
        print(i, val)

In [152]:
X.astype(np.float64).dtype

dtype('float64')

In [87]:
y.dtype

dtype('float64')

In [167]:
X = clean.values[:,:100]

print(len(X))

y = clean.values[:,111]

print(len(y))

2118
2118


In [178]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
# evaluate an LDA model on the dataset using k-fold cross validation
model = LinearRegression()
kfold = KFold(n_splits=10, random_state=7)

result = cross_val_score(model, X, y, cv=kfold, scoring='r2')
print(result.mean())

0.540033657254


In [182]:
# feature extraction
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
test= SelectKBest(score_func=f_regression, k=4)
fit = test.fit(X,y)
np.set_printoptions(precision=3)
print(fit.scores_)
features = fit.transform(X)
print(features[0:5, :])

[  3.272e+01   8.272e+01   6.503e+02   6.580e+02   2.936e+00   6.641e+01
   1.881e+00   2.717e+01   1.079e+01   4.330e+01   3.173e+01   6.989e-05
   6.102e+02   2.827e+02   1.808e+01   6.810e+02   6.786e+01   6.150e+02
   6.496e+00   5.772e+02   2.534e+02   1.127e+02   1.306e+02   1.389e+01
   8.223e+01   1.559e+02   4.739e+01   7.799e+02   2.150e+02   3.741e+02
   1.836e+02   4.237e+02   2.586e+02   2.208e+01   2.790e+00   1.251e+02
   2.077e+02   1.106e+03   1.045e+02   1.179e+03   1.227e+03   4.753e+00
   1.652e+03   1.716e+03   1.292e+03   1.341e+03   2.814e+00   6.452e+00
   3.532e+01   9.675e+02   7.433e+00   6.665e+01   9.734e+01   1.375e+02
   1.835e+02   1.909e+01   2.392e+01   2.453e+01   2.860e+01   2.847e+01
   4.848e+01   7.108e+01   2.144e+01   9.347e+01   1.756e+02   1.910e+01
   7.255e+02   1.411e+02   6.203e+02   4.011e+02   8.176e+01   2.238e+02
   5.878e+02   2.829e+02   6.125e-01   7.327e+00   7.185e+02   1.513e+02
   2.026e+02   1.789e+02   1.606e+02   6.173e+01   

In [180]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
rfe = RFE(model, 3)
fit = rfe.fit(X, y)
print("Num Features: %d") % fit.n_features_
print("Selected Features: %s") % fit.support_
print("Feature Ranking: %s") % fit.ranking_

ValueError: Unknown label type: 'continuous'