In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
from sklearn import linear_model
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
%matplotlib inline
pd.options.display.float_format = '{:.3f}'.format

# Suppress annoying harmless error.
import warnings
warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")

In [52]:
# Importing our data
data = pd.read_excel('table_8_offenses_known_to_law_enforcement_new_york_by_city_2013.xls', 
                     encoding='latin1', skiprows=list(range(4)))

# Condense and clean our dataset
data = data.rename(index = str, columns={'Murder and\nnonnegligent\nmanslaughter': 'Murder', 'Property\ncrime': "Property_crime"})
propertycrime = data[['City', 'Population', 'Murder', 'Robbery', 'Property_crime', 'Burglary']]
propertycrime = propertycrime.dropna()

# Drop outlier, New York
propertycrime = propertycrime.drop(propertycrime.index[216])

# Create the new rows for our regression model
propertycrime['Pop_squared'] = propertycrime['Population']**2

# Preview the data
propertycrime.head()

Unnamed: 0,City,Population,Murder,Robbery,Property_crime,Burglary,Pop_squared
0,Adams Village,1861.0,0.0,0.0,12.0,2.0,3463321.0
1,Addison Town and Village,2577.0,0.0,0.0,24.0,3.0,6640929.0
2,Akron Village,2846.0,0.0,0.0,16.0,1.0,8099716.0
3,Albany,97956.0,8.0,227.0,4090.0,705.0,9595377936.0
4,Albion Village,6388.0,0.0,4.0,223.0,53.0,40806544.0


In [46]:
# View the column statistics
propertycrime.describe()

Unnamed: 0,Population,Murder,Robbery,Property_crime,Pop_squared
count,347.0,347.0,347.0,347.0,347.0
mean,15956.686,0.605,17.867,385.752,985840709.758
std,27080.219,3.707,94.972,1034.369,5067232380.434
min,526.0,0.0,0.0,0.0,276676.0
25%,2997.0,0.0,0.0,40.0,8982153.0
50%,7187.0,0.0,1.0,112.0,51652969.0
75%,18160.5,0.0,5.0,340.5,329804222.5
max,258789.0,47.0,1322.0,12491.0,66971746521.0


In [54]:
# Construct our linear regression model with our X variables 
# and predict Y
regr = linear_model.LinearRegression()
X = propertycrime[['Population', 'Murder', 'Robbery', 'Pop_squared']]
Y = propertycrime['Property_crime'].values.reshape(-1, 1)
regr.fit(X, Y)

# Inspect the results.
print('\nCoefficients: \n', regr.coef_)
print('\nIntercept: \n', regr.intercept_)
print('\nR-squared:')
print(regr.score(X, Y))


Coefficients: 
 [[ 2.05519766e-02  1.02643381e+02  5.13001388e+00 -7.19468539e-08]]

Intercept: 
 [-25.03979961]

R-squared:
0.939283140822424


In [51]:
propertycrime.columns

Index(['City', 'Population', 'Murder', 'Robbery', 'Property_crime',
       'Pop_squared'],
      dtype='object')

In [53]:
X = propertycrime[['Population', 'Murder', 'Robbery', 'Pop_squared','Burglary']]
regr.fit(X, Y)

# Inspect the results.
print('\nCoefficients: \n', regr.coef_)
print('\nIntercept: \n', regr.intercept_)
print('\nR-squared:')
print(regr.score(X, Y))


Coefficients: 
 [[ 1.20229958e-02  1.58813353e+00 -1.62410781e+00 -1.64297690e-08
   3.62617711e+00]]

Intercept: 
 [-23.55210522]

R-squared:
0.9805504756378403


In [55]:
# Review the correlation matrix
correlation_matrix = X.corr()
display(correlation_matrix)

Unnamed: 0,Population,Murder,Robbery,Pop_squared
Population,1.0,0.756,0.816,0.889
Murder,0.756,1.0,0.963,0.884
Robbery,0.816,0.963,1.0,0.94
Pop_squared,0.889,0.884,0.94,1.0


# PCA

In [6]:
pca = PCA(n_components=1)
X = StandardScaler().fit_transform(X)
principalComponents = pca.fit_transform(X)
principalDf = pd.DataFrame(data = principalComponents
             , columns = ['principal component 1'])
principalDf.head()

Unnamed: 0,principal component 1
0,-0.526
1,-0.513
2,-0.508
3,4.441
4,-0.42


In [7]:
# How much of our variance do we retain after limiting our data to 1 component?
pca.explained_variance_ratio_

array([0.90671722])

In [8]:
# Create a new linear regression model with our 1 variable
X = principalDf
Y = propertycrime['Property_crime'].values.reshape(-1, 1)
regr.fit(X, Y)

# Inspect the results.
print('\nCoefficients: \n', regr.coef_)
print('\nIntercept: \n', regr.intercept_)
print('\nR-squared:')
print(regr.score(X, Y))


Coefficients: 
 [[516.93434211]]

Intercept: 
 [385.75216138]

R-squared:
0.9084581990068199


In [24]:
data = principalDf['principal component 1'].apply(lambda x: x*regr.coef_+regr.intercept_)
target = propertycrime['Property_crime']
data.head()

0     [[114.0539789126928]]
1    [[120.73433640097852]]
2    [[123.25799790403505]]
3    [[2681.3815741998123]]
4    [[168.38930500239923]]
Name: principal component 1, dtype: object

In [25]:
target.head()

0     12.000
1     24.000
2     16.000
3   4090.000
4    223.000
Name: Property_crime, dtype: float64

In [20]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.20)
print('With 20% Holdout: ' + str(bnb.fit(X_train, y_train).score(X_test, y_test)))
print('Testing on Sample: ' + str(bnb.fit(data, target).score(data, target)))

ValueError: Expected 2D array, got 1D array instead:
array=[ 369.21753535  916.5592435   249.12852175  113.98893294 2061.57318113
  128.98047322  276.16262008  112.44791651  122.0280571   259.38073181
  248.3848657   131.15886341  699.77580291  111.83602433  160.25597475
  228.04841019  137.72128201  115.8400961   204.98028175  188.52754389
  345.39203664  159.13955896  154.18726401  361.98500623  312.2028729
  177.66209342  547.26318797 1219.53092364  170.03852821  439.63644351
  110.12330137  121.54033417  153.82089084  111.89163206  120.75307175
  137.58794604  284.26709309  175.24639931  451.5187258   156.00992734
  256.23853881  205.72359099  132.85804949  298.9833584   187.18556615
  120.98729887  237.11169104  126.67717257  200.80041263  156.16635707
  123.94544027  221.26897068  484.98849799  162.83675291  377.54551438
  790.92059416  152.74169842  260.58721057  169.93902958  143.89305833
  223.12138654  136.88351088  118.37703171  121.04509477  109.31909673
  154.54007593  226.63183665  127.25118645  120.61557767  188.2650713
  159.21597216  263.5069842   252.7975658   189.23455242  113.06026449
  176.95373633  119.73855734  102.95063739  112.88513867  480.89383664
  104.91472254 1166.84763003  131.90228163  105.88016157  119.19011207
  192.48754193  366.74977388  135.11543367  138.03772089  163.96657176
  168.389305    185.46795528  112.47574062  153.66542958  112.86537706
  109.64253211  214.44791347 1633.22445217  125.30875008  121.27783133
  723.6220264   119.41006461  189.5588261   130.64494876  126.67553623
  145.69168529  307.35753435  108.51568407  114.06327161  122.7695622
  257.82736701  114.65822212  591.43769165  534.19418827  120.7343364
  722.7665901   111.65069244  129.02166889  122.18756634  125.11098412
  113.31090185  129.24537263  163.67239228 1215.35798693  120.48145021
  178.69079691  736.07329519  358.19355053  118.90030296  170.62834658
  598.6351664   114.85353364  272.17142125  181.06830201  478.22516361
  417.32286105 9820.00244261  114.9744639   193.48933377  117.12625038
  472.60831425  144.05666061  385.12259235  147.55823528  147.72212374
  313.24141404  107.69462167  196.66156009  123.2579979   120.21927843
  497.24895819  152.73197995  170.65087866 1452.33411699  326.75785421
  515.00998247 1121.52873882 5491.74506411  679.37267381  202.43155252
  140.94883043  131.63315894  104.51968522 2050.01989838  234.36679881
  119.92911251 1192.8131304   120.43823305  130.68482946  115.84940889
  473.56943455 1365.42734007  944.11930906  119.63904715  157.85356262
  295.91572117  111.1968062   186.86989909  261.7177092   123.22041588
  145.66542004  203.87189801  508.22018396  184.47722188  124.19933477
  124.79088944  203.56439866  242.38385829  340.42590213  216.35794679
  138.22871635  213.66073358  105.90776249  704.49894381  106.98493303
  117.3874524   143.56594449  101.74153507  800.51109255  216.80175999
  113.41303524  220.42913657  385.05487751 1156.48894768  180.38746213
  206.98094479  107.04942455  164.29223632  302.9532804   300.76048525
  118.6479705   219.5828011   212.03778303  168.74601372  265.39836043
  296.10596899  237.77988394  129.01831097  478.27859924  140.34453944
  103.90454754 2681.3815742   309.56880674  127.76110335  119.54549913
  202.55148968  102.93230405  297.59108133  113.53375466  419.54695047
  203.2442036   316.91108496  106.06418551  481.00183018  164.17413877
  140.18214497  177.86411009  251.26055811  392.32958846  461.94337266
  941.79456079  217.96194299  112.85609786  447.19987324  142.48442625
  116.47189786  108.61722124  115.37660002  133.55000806  144.14328582
  307.50377096  109.87363586  261.85040392  191.31236331  388.49414254
  117.33616771  218.08148938  107.21528341  285.56361993  313.40728164
  159.04169263  183.54305608  394.00872867  306.21358505  347.34367716
  130.77013627  420.66677192  110.04007106  236.73432693  118.68534827
  590.30939116  176.97582955  167.70386288  132.71553089  119.99462506
  199.48057737  250.6483158 ].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

# Writeup

The original features used to predict property crime were population, population squared, murder, and robbery rates. With the first regression model, our model turns out to have a r-squared value of 0.939, a very good score for our model, and an indication that we have picked the correct variables to predict with. Reducing the complexity of the murder and robbery column as we did with the previous exercise (changing values greater than 0 to 1) will decrease our r-squared value, so we're better off leaving the data as it is. Removing features will also reduce the accuracy of our model, while adding more variables (such as burglary) will increase it. This makes sense since adding more variables into the equation should provide more insight into the variables that cause these actions.