In [1]:
import glob
import os
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from sklearn import linear_model
from sklearn.feature_selection import RFE
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

%matplotlib inline

In [2]:
# Function to help clean up data.
# Converts strings to numeric series.
# Leaves NaN as 0

def make_num(old_series):
    translator = str.maketrans('', '', string.punctuation)
    
    new_series = [str(num).translate(translator) for num in old_series]
    new_series = pd.to_numeric(new_series, errors='coerce')
    return pd.Series(new_series)

In [3]:
# Function to import a csv file into a clean dataframe

def clean_csv(csv_filename):

    crime_df = pd.read_csv(csv_filename)
    crime_df = crime_df.drop(crime_df.index[crime_df['Population'].isnull()]).reset_index(drop=True)

    clean_df = pd.DataFrame()
    clean_df['City'] = crime_df['City']
    clean_df['State'] = csv_filename.split('/')[1].split('_')[0].capitalize()
    clean_df['Population'] = make_num(crime_df['Population'])
    
    clean_df['ViolentCrime'] = make_num(crime_df['Violent\ncrime'])
    clean_df['Robbery'] = make_num(crime_df['Robbery'])
    clean_df['Murder'] = make_num(crime_df['Murder and\nnonnegligent\nmanslaughter'])

    clean_df['Rape'] = pd.concat([make_num(crime_df['Rape\n(legacy\ndefinition)2']), 
                            make_num(crime_df['Rape\n(revised\ndefinition)1'])], axis=1).max(axis=1)

    clean_df['Assault'] = make_num(crime_df['Aggravated\nassault'])
    
    clean_df['PropertyCrime'] = make_num(crime_df['Property\ncrime'])
    clean_df['Burglary'] = make_num(crime_df['Burglary'])
    clean_df['Larceny'] = make_num(crime_df['Larceny-\ntheft'])
    clean_df['MVTheft'] = make_num(crime_df['Motor\nvehicle\ntheft'])

    clean_df.fillna(0)
    
    return clean_df

In [4]:
df_all = pd.DataFrame()

filenames = !ls crime_data/*.csv

for state in filenames:
    df = clean_csv(state)
    df_all = df_all.append(df, ignore_index=True)

In [5]:
df_all['SqrtPop'] = np.sqrt(df_all['Population'])
df_all['SqrtRobbery'] = np.sqrt(df_all['Robbery'])
df_all['Robbery_x_Assault'] = df_all['Robbery'] * df_all['Assault']
df_all['Murder'] = [1 if x > 0 else 0 for x in df_all['Murder']]
df_all['Rape'] = [1 if x > 0 else 0 for x in df_all['Rape']]

In [6]:
X = df_all.loc[:,['Population', 'Robbery', 'Murder', 'Rape', 'Assault', 'SqrtPop', 'SqrtRobbery', 'Robbery_x_Assault']].fillna(0)
Y = np.array(df_all.loc[:,['PropertyCrime']].fillna(0))

corrmat = X.corr()

X = StandardScaler().fit_transform(X)
Y = StandardScaler().fit_transform(Y)

In [7]:
# Instantiate and fit our model.
regr = linear_model.LinearRegression()
regr.fit(X, Y)

# Inspect the results.
print('\nCoefficients: \n', regr.coef_)
print('\nIntercept: \n', regr.intercept_)
print('\nR-squared:')
print(regr.score(X, Y))


Coefficients: 
 [[ 0.61593831  0.33855028  0.00329489 -0.00155739  0.48452106  0.01618368
   0.01348234 -0.60720136]]

Intercept: 
 [ -2.59908720e-17]

R-squared:
0.946366487125


## With Recursive Feature Elimination (RFE)

In [37]:
regr_rfe = linear_model.LinearRegression()
rfe_all = RFE(regr_rfe, n_features_to_select=3)

rfe_all.fit(X, Y)

print('Ranking: ' + str(rfe_all.ranking_))
rank_ind = np.where(rfe_all.ranking_==1)

X_rfe = X[:,rank_ind].squeeze()

# Instantiate and fit our model.
regr_rfe = linear_model.LinearRegression()
regr_rfe.fit(X_rfe, Y)

# Inspect the results.
print('\nCoefficients: \n', regr_rfe.coef_)
print('\nIntercept: \n', regr_rfe.intercept_)
print('\nR-squared:')
print(regr_rfe.score(X_rfe, Y))

Ranking: [1 2 5 6 1 4 3 1]

Coefficients: 
 [[ 0.88020999  0.62733937 -0.68367742]]

Intercept: 
 [ -2.64670221e-17]

R-squared:
0.929534028119


  y = column_or_1d(y, warn=True)


## With Cross-Validation

In [38]:
n_folds = 8
score_regr_rfe = cross_val_score(regr_rfe, X_rfe, Y, cv=n_folds)
print("Regr w/ RFE Accuracy: %0.2f (+/- %0.2f)" % (score_regr_rfe.mean(), score_regr_rfe.std() * 2))

score_regr = cross_val_score(regr, X, Y, cv=n_folds)
print("Regr w/o RFE Accuracy: %0.2f (+/- %0.2f)" % (score_regr.mean(), score_regr.std() * 2))

Regr w/ RFE Accuracy: 0.82 (+/- 0.41)
Regr w/o RFE Accuracy: 0.91 (+/- 0.08)


Interestingly, if I allow 4 variables to pass through RFE instead of 3, the cross-validated score is nearly identical to that of the full feature set. Restricting RFE to 3 variables, however, greatly increases the variance across folds and lowers the mean score (the "accuracy" printed above is the regression's R², not a classification accuracy).