In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import linear_model
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
%matplotlib inline
pd.options.display.float_format = '{:.3f}'.format

# Package for handling logistic regression
from sklearn.linear_model import LogisticRegression

import statsmodels.api as sm

# Suppress annoying harmless error.
import warnings
warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")

  from pandas.core import datetools


# About the data

Once again, we'll be going back to our crime data, this time for the state of Virginia. As a reminder, our dataset contains data about each city, along with its 
1. population, 
2. violent crime rate, 
3. murder, 
4. rape, 
5. robbery, 
6. aggravated assault, 
7. property crime, 
8. burglary,
9. larceny,
10. motor vehicle theft,
11. arson

# Objective

Our objective for this project is to determine simply whether a city has a severe problem with property crime or not. For convenience sake, I have defined severe property crime as having over 100 incidents per year.

In [2]:
# Importing our data
data = pd.read_excel('Data\\table_8_offenses_known_to_law_enforcement_virginia_by_city_2013.xls', 
                     encoding='latin1', skiprows=list(range(4)))

# This column contains no data, so we're going to remove it
data.drop(['Rape\n(legacy\ndefinition)2'], axis = 1, inplace = True)

# Cleaning up our column names
data.columns = data.columns.str.replace('\n', ' ')
data = data.rename(index = str, columns={'Rape (revised definition)1': 'Rape'})
data.drop(['City'], axis = 1, inplace = True)
data = data.dropna()
data = data[:-1]

In [3]:
data.head()

Unnamed: 0,Population,Violent crime,Murder and nonnegligent manslaughter,Rape,Robbery,Aggravated assault,Property crime,Burglary,Larceny- theft,Motor vehicle theft,Arson
0,8186.0,10.0,0.0,3.0,1.0,6.0,233.0,20.0,198.0,15.0,4.0
1,148519.0,258.0,5.0,21.0,118.0,114.0,2967.0,249.0,2427.0,291.0,13.0
2,3486.0,8.0,0.0,0.0,2.0,6.0,56.0,4.0,52.0,0.0,0.0
3,2223.0,2.0,0.0,2.0,0.0,0.0,27.0,6.0,19.0,2.0,0.0
4,1728.0,12.0,0.0,2.0,2.0,8.0,77.0,25.0,51.0,1.0,0.0


In [4]:
data.describe()

Unnamed: 0,Population,Violent crime,Murder and nonnegligent manslaughter,Rape,Robbery,Aggravated assault,Property crime,Burglary,Larceny- theft,Motor vehicle theft,Arson
count,148.0,148.0,148.0,148.0,148.0,148.0,148.0,148.0,148.0,148.0,148.0
mean,20170.378,65.014,1.304,6.953,20.365,36.392,612.554,100.926,478.953,32.676,3.277
std,54170.809,201.049,4.796,18.482,72.99,111.979,1729.381,312.329,1325.7,115.662,11.514
min,112.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1438.5,1.0,0.0,0.0,0.0,0.0,19.0,3.0,16.75,0.0,0.0
50%,3347.5,5.0,0.0,1.0,0.0,4.0,62.5,9.0,54.5,2.0,0.0
75%,12460.25,25.25,0.0,4.0,6.0,13.25,272.0,27.25,251.5,11.0,1.0
max,450687.0,1418.0,37.0,140.0,624.0,842.0,11226.0,2039.0,9374.0,938.0,99.0


In [5]:
# Create a dataframe where property crimes exceed 100
y = pd.DataFrame()
y['Property crime'] = np.where(data['Property crime'] < 100, 0, 1)

# Drop property crime from the dataframe since that's the 
# variable that we want to predict
data.drop(['Property crime'], axis = 1, inplace = True)

# Construct the test and training sets
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.5)

In [14]:
# Builds the regression model
def regressionModelBuilder(X, y):
    lr = LogisticRegression(C=1e9)
    fit = lr.fit(X, y)

    # Display
    print('Coefficients')
    print(fit.coef_)
    print(fit.intercept_)
    pred_y_sklearn = lr.predict(X)

    print('\n Accuracy by severe crime prediction')
    print(pd.crosstab(pred_y_sklearn, y))

    print('\n Percentage accuracy')
    print(lr.score(X, y))

In [15]:
regressionModelBuilder(X_test, y_test)

Coefficients
[[-3.08024216e-04 -1.39131541e-01  4.07236932e-01 -5.62481085e-01
  -1.87465582e-01  2.03578193e-01 -1.19568315e-01  8.51612163e-02
   2.52449988e-01  4.43015594e-01]]
[-4.02204402]

 Accuracy by severe crime prediction
col_0  (P, r, o, p, e, r, t, y,  , c, r, i, m, e)
row_0                                            
0                                              42
1                                              32

 Percentage accuracy
0.9864864864864865


  y = column_or_1d(y, warn=True)


In [16]:
regressionModelBuilder(X_train, y_test)

Coefficients
[[ 3.98198775e-05  1.57611850e-01  5.75140929e-01 -7.36192220e-02
  -2.10648672e-01 -1.33261185e-01 -1.38372113e-03 -5.20306212e-03
   7.70072077e-03  3.58005452e-02]]
[-0.32902036]

 Accuracy by severe crime prediction
col_0  (P, r, o, p, e, r, t, y,  , c, r, i, m, e)
row_0                                            
0                                              64
1                                              10

 Percentage accuracy
0.6351351351351351


  y = column_or_1d(y, warn=True)


In [23]:
# Ridge regression model
ridgeregr = linear_model.Ridge(alpha=10, fit_intercept=False) 
ridgeregr.fit(X_train, y_train)
print('R² for the model with few features:')
print(ridgeregr.score(X_train, y_train))
origparams = ridgeregr.coef_[0]
print('\nParameter estimates for the model with few features:')
print(origparams)

R² for the model with few features:
0.15809708398626787

Parameter estimates for the model with few features:
[ 2.80005177e-05  3.22625109e-02  6.74854527e-02  1.05510401e-02
 -8.49576113e-03 -3.72782207e-02 -1.85517693e-03 -2.12019224e-04
 -1.62817585e-02 -4.87080938e-02]


In [18]:
# Lasso Regression Model
lass = linear_model.Lasso(alpha=.35)
lassfit = lass.fit(X_train, y_train)
print('R² for the model with few features:')
print(lass.score(X_train, y_train))
origparams = np.append(lassfit.coef_, lassfit.intercept_)
print('\nParameter estimates for the model with few features:')
print(origparams)

R² for the model with few features:
0.2784206081950641

Parameter estimates for the model with few features:
[ 6.59563030e-06  0.00000000e+00  0.00000000e+00  0.00000000e+00
  6.66037952e-04 -4.65265293e-03  1.16415237e-04  5.07058503e-04
 -1.83025585e-03 -0.00000000e+00  3.16046805e-01]
