# Lesson 04
# Peter Lorenz

## 0. Preliminaries
Import the required libraries:

In [2]:
import matplotlib as mpl
import numpy as np
import pandas as pd

Set some global options:

In [19]:
# Display plots inline
%matplotlib inline

# Display multiple cell outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Suppress scientific notation
np.set_printoptions(suppress=True)
np.set_printoptions(precision=3)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

Declare utility functions:

## 1. Read data
We begin by importing our data set:

In [29]:
# Internet location of the data set
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/communities/communities.data'

# Download the data into a dataframe object
crime_data = pd.read_csv(url)

# Add columns
crime_data.columns = ['state', 'county', 'community', 'communityname', 'fold', 'population', 
                      'householdsize', 'racepctblack', 'racePctWhite', 'racePctAsian', 'racePctHisp', 
                      'agePct12t21', 'agePct12t29', 'agePct16t24', 'agePct65up', 'numbUrban', 
                      'pctUrban', 'medIncome', 'pctWWage', 'pctWFarmSelf', 'pctWInvInc', 
                      'pctWSocSec', 'pctWPubAsst', 'pctWRetire', 'medFamInc', 'perCapInc', 
                      'whitePerCap', 'blackPerCap', 'indianPerCap', 'AsianPerCap', 'OtherPerCap', 
                      'HispPerCap', 'NumUnderPov', 'PctPopUnderPov', 'PctLess9thGrade', 
                      'PctNotHSGrad', 'PctBSorMore', 'PctUnemployed', 'PctEmploy', 'PctEmplManu', 
                      'PctEmplProfServ', 'PctOccupManu', 'PctOccupMgmtProf', 'MalePctDivorce', 
                      'MalePctNevMarr', 'FemalePctDiv', 'TotalPctDiv', 'PersPerFam', 'PctFam2Par', 
                      'PctKids2Par', 'PctYoungKids2Par', 'PctTeen2Par', 'PctWorkMomYoungKids', 
                      'PctWorkMom', 'NumIlleg', 'PctIlleg', 'NumImmig', 'PctImmigRecent', 
                      'PctImmigRec5', 'PctImmigRec8', 'PctImmigRec10', 'PctRecentImmig', 
                      'PctRecImmig5', 'PctRecImmig8', 'PctRecImmig10', 'PctSpeakEnglOnly', 
                      'PctNotSpeakEnglWell', 'PctLargHouseFam', 'PctLargHouseOccup', 
                      'PersPerOccupHous', 'PersPerOwnOccHous', 'PersPerRentOccHous', 
                      'PctPersOwnOccup', 'PctPersDenseHous', 'PctHousLess3BR', 'MedNumBR', 
                      'HousVacant', 'PctHousOccup', 'PctHousOwnOcc', 'PctVacantBoarded', 
                      'PctVacMore6Mos', 'MedYrHousBuilt', 'PctHousNoPhone', 'PctWOFullPlumb', 
                      'OwnOccLowQuart', 'OwnOccMedVal', 'OwnOccHiQuart', 'RentLowQ', 'RentMedian', 
                      'RentHighQ', 'MedRent', 'MedRentPctHousInc', 'MedOwnCostPctInc', 
                      'MedOwnCostPctIncNoMtg', 'NumInShelters', 'NumStreet', 'PctForeignBorn', 
                      'PctBornSameState', 'PctSameHouse85', 'PctSameCity85', 'PctSameState85', 
                      'LemasSwornFT', 'LemasSwFTPerPop', 'LemasSwFTFieldOps', 'LemasSwFTFieldPerPop', 
                      'LemasTotalReq', 'LemasTotReqPerPop', 'PolicReqPerOffic', 'PolicPerPop', 
                      'RacialMatchCommPol', 'PctPolicWhite', 'PctPolicBlack', 'PctPolicHisp', 
                      'PctPolicAsian', 'PctPolicMinor', 'OfficAssgnDrugUnits', 'NumKindsDrugsSeiz', 
                      'PolicAveOTWorked', 'LandArea', 'PopDens', 'PctUsePubTrans', 'PolicCars', 
                      'PolicOperBudg', 'LemasPctPolicOnPatr', 'LemasGangUnitDeploy', 
                      'LemasPctOfficDrugUn', 'PolicBudgPerPop', 'ViolentCrimesPerPop']

# Display shape
crime_data.shape

# Display initial rows, all columns vertically
with pd.option_context('display.max_rows', None, 'display.max_columns', 128):
    crime_data.head().T

(1993, 128)

Unnamed: 0,0,1,2,3,4
state,53,24,34,42,6
county,?,?,5,95,?
community,?,?,81440,6096,?
communityname,Tukwilacity,Aberdeentown,Willingborotownship,Bethlehemtownship,SouthPasadenacity
fold,1,1,1,1,1
population,0.000,0.000,0.040,0.010,0.020
householdsize,0.160,0.420,0.770,0.550,0.280
racepctblack,0.120,0.490,1.000,0.020,0.060
racePctWhite,0.740,0.560,0.080,0.950,0.540
racePctAsian,0.450,0.170,0.120,0.090,1.000


### Data Cleaning
Next we clean the data to render it suitable for analysis. First we remove the non-numeric columns used for labeling and set them aside:

In [42]:
# Extract and reserve identifying columns

# Reserve the original data frame
crime_data_orig = crime_data

# Set aside state in an array and drop from data set
state_arr = np.array(crime_data['state'].values)
crime_data = crime_data.drop(['state'], axis=1)

# Set aside county in an array and drop from data set
county_arr = np.array(crime_data['county'].values)
crime_data = crime_data.drop(['county'], axis=1)

# Set aside community in an array and drop from data set
community_arr = np.array(crime_data['community'].values)
crime_data = crime_data.drop(['community'], axis=1)

# Set aside communityname in an array and drop from data set
communityname_arr = np.array(crime_data['communityname'].values)
crime_data = crime_data.drop(['communityname'], axis=1)

# Set aside fold in an array and drop from data set
fold_arr = np.array(crime_data['fold'].values)
crime_data = crime_data.drop(['fold'], axis=1)

Then we impute missing values in the remaining columns:

In [43]:
# Impute missing values using column median
crime_data = crime_data.replace('?', np.NaN)
crime_data.apply(lambda x: x.fillna(x.median()))

Unnamed: 0,population,householdsize,racepctblack,racePctWhite,racePctAsian,racePctHisp,agePct12t21,agePct12t29,agePct16t24,agePct65up,numbUrban,pctUrban,medIncome,pctWWage,pctWFarmSelf,pctWInvInc,pctWSocSec,pctWPubAsst,pctWRetire,medFamInc,perCapInc,whitePerCap,blackPerCap,indianPerCap,AsianPerCap,OtherPerCap,HispPerCap,NumUnderPov,PctPopUnderPov,PctLess9thGrade,PctNotHSGrad,PctBSorMore,PctUnemployed,PctEmploy,PctEmplManu,PctEmplProfServ,PctOccupManu,PctOccupMgmtProf,MalePctDivorce,MalePctNevMarr,FemalePctDiv,TotalPctDiv,PersPerFam,PctFam2Par,PctKids2Par,PctYoungKids2Par,PctTeen2Par,PctWorkMomYoungKids,PctWorkMom,NumIlleg,PctIlleg,NumImmig,PctImmigRecent,PctImmigRec5,PctImmigRec8,PctImmigRec10,PctRecentImmig,PctRecImmig5,PctRecImmig8,PctRecImmig10,PctSpeakEnglOnly,PctNotSpeakEnglWell,PctLargHouseFam,PctLargHouseOccup,PersPerOccupHous,PersPerOwnOccHous,PersPerRentOccHous,PctPersOwnOccup,PctPersDenseHous,PctHousLess3BR,MedNumBR,HousVacant,PctHousOccup,PctHousOwnOcc,PctVacantBoarded,PctVacMore6Mos,MedYrHousBuilt,PctHousNoPhone,PctWOFullPlumb,OwnOccLowQuart,OwnOccMedVal,OwnOccHiQuart,RentLowQ,RentMedian,RentHighQ,MedRent,MedRentPctHousInc,MedOwnCostPctInc,MedOwnCostPctIncNoMtg,NumInShelters,NumStreet,PctForeignBorn,PctBornSameState,PctSameHouse85,PctSameCity85,PctSameState85,LemasSwornFT,LemasSwFTPerPop,LemasSwFTFieldOps,LemasSwFTFieldPerPop,LemasTotalReq,LemasTotReqPerPop,PolicReqPerOffic,PolicPerPop,RacialMatchCommPol,PctPolicWhite,PctPolicBlack,PctPolicHisp,PctPolicAsian,PctPolicMinor,OfficAssgnDrugUnits,NumKindsDrugsSeiz,PolicAveOTWorked,LandArea,PopDens,PctUsePubTrans,PolicCars,PolicOperBudg,LemasPctPolicOnPatr,LemasGangUnitDeploy,LemasPctOfficDrugUn,PolicBudgPerPop,ViolentCrimesPerPop
0,0.000,0.160,0.120,0.740,0.450,0.070,0.260,0.590,0.350,0.270,0.020,1.000,0.310,0.720,0.110,0.450,0.250,0.290,0.390,0.290,0.370,0.380,0.330,0.160,0.300,0.22,0.350,0.010,0.240,0.140,0.240,0.300,0.270,0.730,0.570,0.150,0.420,0.360,1.000,0.630,0.910,1.000,0.290,0.430,0.470,0.600,0.390,0.460,0.530,0.000,0.240,0.010,0.520,0.620,0.640,0.630,0.250,0.270,0.250,0.230,0.840,0.100,0.160,0.100,0.170,0.290,0.170,0.260,0.200,0.820,0.000,0.020,0.790,0.240,0.020,0.250,0.650,0.160,0.000,0.210,0.200,0.210,0.420,0.380,0.400,0.370,0.290,0.320,0.180,0.000,0.000,0.210,0.500,0.340,0.600,0.520,0.020,0.180,0.970,0.210,0.040,0.170,0.290,0.180,0.740,0.780,0.120,0.060,0.000,0.200,0.040,0.570,0.260,0.020,0.120,0.450,0.080,0.030,0.750,0.500,0.000,0.155,0.670
1,0.000,0.420,0.490,0.560,0.170,0.040,0.390,0.470,0.280,0.320,0.000,0.000,0.300,0.580,0.190,0.390,0.380,0.400,0.840,0.280,0.270,0.290,0.270,0.070,0.290,0.28,0.390,0.010,0.270,0.270,0.430,0.190,0.360,0.580,0.320,0.290,0.490,0.320,0.630,0.410,0.710,0.700,0.450,0.420,0.440,0.430,0.430,0.710,0.670,0.010,0.460,0.000,0.070,0.060,0.150,0.190,0.020,0.020,0.040,0.050,0.880,0.040,0.200,0.200,0.460,0.520,0.430,0.420,0.150,0.510,0.500,0.010,0.860,0.410,0.290,0.300,0.520,0.470,0.450,0.180,0.170,0.160,0.270,0.290,0.270,0.310,0.480,0.390,0.280,0.000,0.000,0.140,0.490,0.540,0.670,0.560,0.020,0.180,0.970,0.210,0.040,0.170,0.290,0.180,0.740,0.780,0.120,0.060,0.000,0.200,0.040,0.570,0.260,0.010,0.210,0.020,0.080,0.030,0.750,0.500,0.000,0.155,0.430
2,0.040,0.770,1.000,0.080,0.120,0.100,0.510,0.500,0.340,0.210,0.060,1.000,0.580,0.890,0.210,0.430,0.360,0.200,0.820,0.510,0.360,0.400,0.390,0.160,0.250,0.36,0.440,0.010,0.100,0.090,0.250,0.310,0.330,0.710,0.360,0.450,0.370,0.390,0.340,0.450,0.490,0.440,0.750,0.650,0.540,0.830,0.650,0.850,0.860,0.030,0.330,0.020,0.110,0.200,0.300,0.310,0.050,0.080,0.110,0.110,0.810,0.080,0.560,0.620,0.850,0.770,1.000,0.940,0.120,0.010,0.500,0.010,0.970,0.960,0.600,0.470,0.520,0.110,0.110,0.240,0.210,0.190,0.750,0.700,0.770,0.890,0.630,0.510,0.470,0.000,0.000,0.190,0.300,0.730,0.640,0.650,0.020,0.180,0.970,0.210,0.040,0.170,0.290,0.180,0.740,0.780,0.120,0.060,0.000,0.200,0.040,0.570,0.260,0.020,0.390,0.280,0.080,0.030,0.750,0.500,0.000,0.155,0.120
3,0.010,0.550,0.020,0.950,0.090,0.050,0.380,0.380,0.230,0.360,0.020,0.900,0.500,0.720,0.160,0.680,0.440,0.110,0.710,0.460,0.430,0.410,0.280,0.000,0.740,0.51,0.480,0.000,0.060,0.250,0.300,0.330,0.120,0.650,0.670,0.380,0.420,0.460,0.220,0.270,0.200,0.210,0.510,0.910,0.910,0.890,0.850,0.400,0.600,0.000,0.060,0.000,0.030,0.070,0.200,0.270,0.010,0.020,0.040,0.050,0.880,0.050,0.160,0.190,0.590,0.600,0.370,0.890,0.020,0.190,0.500,0.010,0.890,0.870,0.040,0.550,0.730,0.050,0.140,0.310,0.310,0.300,0.400,0.360,0.380,0.380,0.220,0.510,0.210,0.000,0.000,0.110,0.720,0.640,0.610,0.530,0.020,0.180,0.970,0.210,0.040,0.170,0.290,0.180,0.740,0.780,0.120,0.060,0.000,0.200,0.040,0.570,0.260,0.040,0.090,0.020,0.080,0.030,0.750,0.500,0.000,0.155,0.030
4,0.020,0.280,0.060,0.540,1.000,0.250,0.310,0.480,0.270,0.370,0.040,1.000,0.520,0.680,0.200,0.610,0.280,0.150,0.250,0.620,0.720,0.760,0.770,0.280,0.520,0.48,0.600,0.010,0.120,0.130,0.120,0.800,0.100,0.650,0.190,0.770,0.060,0.910,0.490,0.570,0.610,0.580,0.440,0.620,0.690,0.870,0.530,0.300,0.430,0.000,0.110,0.040,0.300,0.350,0.430,0.470,0.500,0.500,0.560,0.570,0.450,0.280,0.250,0.190,0.290,0.530,0.180,0.390,0.260,0.730,0.000,0.020,0.840,0.300,0.160,0.280,0.250,0.020,0.050,0.940,1.000,1.000,0.670,0.630,0.680,0.620,0.470,0.590,0.110,0.000,0.000,0.700,0.420,0.490,0.730,0.640,0.020,0.180,0.970,0.210,0.040,0.170,0.290,0.180,0.740,0.780,0.120,0.060,0.000,0.200,0.040,0.570,0.260,0.010,0.580,0.100,0.080,0.030,0.750,0.500,0.000,0.155,0.140
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1988,0.010,0.400,0.100,0.870,0.120,0.160,0.430,0.510,0.350,0.300,0.030,1.000,0.420,0.740,0.140,0.640,0.300,0.060,0.460,0.410,0.490,0.500,0.270,0.000,0.160,0.36,0.520,0.010,0.160,0.080,0.110,0.650,0.120,0.710,0.170,0.680,0.140,0.700,0.390,0.410,0.590,0.530,0.350,0.630,0.680,0.670,0.590,0.780,0.730,0.000,0.110,0.010,0.420,0.410,0.420,0.470,0.220,0.190,0.180,0.180,0.780,0.090,0.090,0.100,0.410,0.430,0.400,0.550,0.100,0.490,0.500,0.020,0.750,0.550,0.000,0.210,0.790,0.070,0.090,0.200,0.220,0.270,0.450,0.390,0.410,0.410,0.500,0.480,0.390,0.000,0.000,0.220,0.280,0.340,0.480,0.390,0.020,0.180,0.970,0.210,0.040,0.170,0.290,0.180,0.740,0.780,0.120,0.060,0.000,0.200,0.040,0.570,0.260,0.010,0.280,0.050,0.080,0.030,0.750,0.500,0.000,0.155,0.090
1989,0.050,0.960,0.460,0.280,0.830,0.320,0.690,0.860,0.730,0.140,0.060,1.000,0.280,0.760,0.100,0.300,0.260,0.370,0.480,0.240,0.180,0.200,0.260,0.180,0.200,0.23,0.260,0.040,0.320,0.330,0.370,0.220,0.240,0.770,0.100,0.410,0.220,0.240,0.420,0.750,0.560,0.480,0.710,0.660,0.630,0.690,0.700,0.320,0.400,0.030,0.420,0.050,0.560,0.620,0.630,0.670,0.700,0.680,0.630,0.610,0.550,0.360,0.570,0.570,0.740,0.520,0.870,0.120,0.670,0.600,0.000,0.030,0.800,0.220,0.170,0.130,0.500,0.170,0.370,0.360,0.370,0.360,0.540,0.540,0.610,0.540,0.690,0.670,0.310,0.010,0.000,0.530,0.250,0.170,0.100,0.000,0.020,0.180,0.970,0.210,0.040,0.170,0.290,0.180,0.740,0.780,0.120,0.060,0.000,0.200,0.040,0.570,0.260,0.020,0.370,0.200,0.080,0.030,0.750,0.500,0.000,0.155,0.450
1990,0.160,0.370,0.250,0.690,0.040,0.250,0.350,0.500,0.310,0.540,0.170,1.000,0.310,0.470,0.170,0.450,0.580,0.480,0.570,0.350,0.310,0.340,0.250,0.190,0.280,0.22,0.280,0.120,0.310,0.520,0.580,0.210,0.500,0.460,0.630,0.400,0.540,0.320,0.500,0.530,0.500,0.520,0.470,0.420,0.420,0.470,0.470,0.380,0.460,0.130,0.540,0.060,0.120,0.170,0.240,0.260,0.070,0.090,0.110,0.120,0.600,0.280,0.260,0.220,0.390,0.450,0.370,0.380,0.150,0.660,0.000,0.210,0.650,0.370,0.530,0.450,0.290,0.310,0.290,0.330,0.320,0.290,0.280,0.340,0.390,0.370,0.520,0.580,0.530,0.060,0.020,0.250,0.680,0.610,0.790,0.760,0.06,0.3,0.93,0.36,0.04,0.17,0.19,0.3,0.97,0.7,0.15,0.34,0,0.34,0.1,0.36,0.25,0.080,0.320,0.180,0.08,0.06,0.78,0,0.910,0.28,0.230
1991,0.080,0.510,0.060,0.870,0.220,0.100,0.580,0.740,0.630,0.410,0.090,1.000,0.440,0.640,0.210,0.550,0.420,0.240,0.450,0.470,0.400,0.400,0.290,0.200,0.310,0.27,0.370,0.030,0.160,0.300,0.350,0.420,0.370,0.570,0.440,0.570,0.270,0.480,0.320,0.870,0.300,0.320,0.460,0.640,0.650,0.750,0.710,0.570,0.630,0.020,0.170,0.060,0.400,0.460,0.480,0.490,0.420,0.430,0.410,0.380,0.680,0.200,0.260,0.210,0.360,0.540,0.260,0.390,0.130,0.640,0.000,0.050,0.840,0.330,0.170,0.340,0.230,0.100,0.340,0.540,0.490,0.470,0.580,0.620,0.750,0.640,0.430,0.440,0.400,0.040,0.010,0.450,0.640,0.540,0.590,0.520,0.02,0.25,0.97,0.3,0.01,0.11,0.14,0.25,0.91,0.91,0.11,0.04,0,0.09,0.01,0.57,0.19,0.030,0.380,0.330,0.02,0.02,0.79,0,0.220,0.18,0.190


In [39]:
crime_data.head().T

Unnamed: 0,0,1,2,3,4
state,53,24,34,42,6
county,,,5,95,
community,,,81440,6096,
communityname,Tukwilacity,Aberdeentown,Willingborotownship,Bethlehemtownship,SouthPasadenacity
fold,1,1,1,1,1
...,...,...,...,...,...
LemasPctPolicOnPatr,,,,,
LemasGangUnitDeploy,,,,,
LemasPctOfficDrugUn,0.000,0.000,0.000,0.000,0.000
PolicBudgPerPop,,,,,


## 2. Apply three techniques for feature selection
Here we apply three techniques for feature selection.

### Filter methods
Our first method for feature selection is *Filter methods*.

### Wrapper methods
Our second method for feature selection is *Wrapper methods*.

### Embedded methods
Our third method for feature selection is *Embedded methods*.

## 3. Describe your findings
In this assignment we found that ...