In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
from sklearn import linear_model
%matplotlib inline
# Render floats in pandas output with three decimal places.
pd.options.display.float_format = '{:.3f}'.format

# Suppress an annoying warning (not an error) that the original author
# considered harmless: the filter ignores warnings attributed to modules whose
# name starts with "scipy" and whose message starts with "internal gelsd".
import warnings
warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")

In [2]:
# Importing our data
# The first four spreadsheet rows are skipped (presumably title text above the
# real header row -- confirm against the workbook).
# `encoding` is deliberately not passed: current pandas `read_excel` has no such
# parameter and raises TypeError when it is supplied; Excel workbooks carry
# their own text encoding.
data = pd.read_excel('table_8_offenses_known_to_law_enforcement_new_york_by_city_2013.xls',
                     skiprows=4)

# Condense and clean our dataset
# Shorten the multi-line column headers and turn the index labels into strings.
data = data.rename(index=str,
                   columns={'Murder and\nnonnegligent\nmanslaughter': 'Murder',
                            'Property\ncrime': "Property_crime"})
propertycrime = data[['City', 'Population', 'Murder', 'Robbery', 'Property_crime']]
propertycrime = propertycrime.dropna()

# Drop outlier, New York
# The row is located by its value (the largest population in the table) rather
# than by a hard-coded position, so the right row is removed even if the sheet
# or the dropna() result shifts the row order.
# NOTE(review): assumes the most populous row is New York -- confirm.
propertycrime = propertycrime.drop(index=propertycrime['Population'].idxmax())

# Create the new column for our regression model
propertycrime['Pop_squared'] = propertycrime['Population']**2

# Preview the data
propertycrime.head()

Unnamed: 0,City,Population,Murder,Robbery,Property_crime,Pop_squared
0,Adams Village,1861.0,0.0,0.0,12.0,3463321.0
1,Addison Town and Village,2577.0,0.0,0.0,24.0,6640929.0
2,Akron Village,2846.0,0.0,0.0,16.0,8099716.0
3,Albany,97956.0,8.0,227.0,4090.0,9595377936.0
4,Albion Village,6388.0,0.0,4.0,223.0,40806544.0


In [3]:
# Ordinary least squares: property crime explained by population, murder,
# robbery and squared population.
regr = linear_model.LinearRegression()
X = propertycrime[['Population', 'Murder', 'Robbery', 'Pop_squared']]
# Selecting with a one-element list keeps the target two-dimensional (n, 1).
Y = propertycrime[['Property_crime']].values
regr.fit(X, Y)

# Inspect the results.
for heading, fitted_value in (('\nCoefficients: \n', regr.coef_),
                              ('\nIntercept: \n', regr.intercept_)):
    print(heading, fitted_value)
# In-sample R-squared: the model is scored on the same rows it was fitted on.
print('\nR-squared:', regr.score(X, Y), sep='\n')


Coefficients: 
 [[ 2.05519766e-02  1.02643381e+02  5.13001388e+00 -7.19468539e-08]]

Intercept: 
 [-25.03979961]

R-squared:
0.939283140822424


In [4]:
# Pairwise Pearson correlations between the feature columns held in X.
correlation_matrix = X.corr(method='pearson')
display(correlation_matrix)

Unnamed: 0,Population,Murder,Robbery,Pop_squared
Population,1.0,0.756,0.816,0.889
Murder,0.756,1.0,0.963,0.884
Robbery,0.816,0.963,1.0,0.94
Pop_squared,0.889,0.884,0.94,1.0


In [5]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [6]:
# Standardise the four regression features and project them onto a single
# principal component.
#
# The features are selected from `propertycrime` by name instead of being read
# from (and written back to) the shared `X` variable. `X` is rebound by other
# cells, so depending on it made this cell's result depend on which cell ran
# last, and overwriting it with an ndarray broke re-running cells that expect
# `X` to be a DataFrame.
feature_columns = ['Population', 'Murder', 'Robbery', 'Pop_squared']
X_scaled = StandardScaler().fit_transform(propertycrime[feature_columns])

pca = PCA(n_components=1)
principalComponents = pca.fit_transform(X_scaled)
principalDf = pd.DataFrame(data=principalComponents,
                           columns=['principal component 1'])
principalDf.head()

Unnamed: 0,principal component 1
0,-0.526
1,-0.513
2,-0.508
3,4.441
4,-0.42


In [7]:
# Share of the total variance captured by each retained component
# (a single value here, because the PCA was built with n_components=1).
pca.explained_variance_ratio_

array([0.90671722])

In [8]:
# Refit the same estimator with the single principal component as the only
# predictor; the target is the property-crime column as an (n, 1) array.
X, Y = principalDf, propertycrime['Property_crime'].values[:, np.newaxis]
regr.fit(X, Y)

# Inspect the results.
in_sample_r2 = regr.score(X, Y)  # scored on the rows used for fitting
print('\nCoefficients: \n', regr.coef_)
print('\nIntercept: \n', regr.intercept_)
print('\nR-squared:')
print(in_sample_r2)


Coefficients: 
 [[516.93434211]]

Intercept: 
 [385.75216138]

R-squared:
0.9084581990068199
