In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import math
import seaborn as sns
import sklearn
from sklearn import linear_model
from sklearn import preprocessing
from IPython.display import display
import statsmodels.formula.api as smf
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from statsmodels.iolib.table import (SimpleTable, default_txt_fmt)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
%matplotlib inline
sns.set_style('white')

In [2]:
df = pd.read_csv(r"C:\Users\nstow\Desktop\Python\Table_8_Offenses_Known_to_Law_Enforcement_by_New_York_by_City_2014.csv")

In [3]:
## remove unusable rows

df = df.iloc[4:]
df = df.iloc[:, :-1]
df = df[:-3]
df.columns = ['city', 'population', 'violentCrime', 'murder', 'revisedRape', 'legacyRape', 'robbery', 'assault',\
              'propertyCrime', 'burglary', 'theft', 'motorVehicleTheft', 'arson']

In [4]:
## check for null values and data types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 373 entries, 4 to 376
Data columns (total 13 columns):
city                 373 non-null object
population           369 non-null object
violentCrime         369 non-null object
murder               369 non-null object
revisedRape          227 non-null object
legacyRape           142 non-null object
robbery              369 non-null object
assault              369 non-null object
propertyCrime        368 non-null object
burglary             369 non-null object
theft                368 non-null object
motorVehicleTheft    369 non-null object
arson                365 non-null object
dtypes: object(13)
memory usage: 38.0+ KB


In [5]:
## remove commas so object values can be converted to numeric values
df['population'] = df['population'].str.replace(',', '')
df['violentCrime'] = df['violentCrime'].str.replace(',', '')
df['murder'] = df['murder'].str.replace(',', '')
df['revisedRape'] = df['revisedRape'].str.replace(',','')
df['legacyRape'] = df['legacyRape'].str.replace(',', '')
df['robbery'] = df['robbery'].str.replace(',', '')
df['assault'] = df['assault'].str.replace(',', '')
df['propertyCrime'] = df['propertyCrime'].str.replace(',', '')
df['burglary'] = df['burglary'].str.replace(',', '')
df['theft'] = df['theft'].str.replace(',', '')
df['motorVehicleTheft'] = df['motorVehicleTheft'].str.replace(',', '')
df['arson'] = df['arson'].str.replace(',', '')

In [6]:
## covert to numeric values
df['population'] = pd.to_numeric(df['population'])
df['violentCrime'] = pd.to_numeric(df['violentCrime'])
df['murder'] = pd.to_numeric(df['murder'])
df['revisedRape'] = pd.to_numeric(df['revisedRape'])
df['legacyRape'] = pd.to_numeric(df['legacyRape'])
df['robbery'] = pd.to_numeric(df['robbery'])
df['assault'] = pd.to_numeric(df['assault'])
df['propertyCrime'] = pd.to_numeric(df['propertyCrime'])
df['burglary'] = pd.to_numeric(df['burglary'])
df['theft'] = pd.to_numeric(df['theft'])
df['motorVehicleTheft'] = pd.to_numeric(df['motorVehicleTheft'])
df['arson'] = pd.to_numeric(df['arson'])

In [7]:
## Fill empty values with 0
df = df.fillna(0)

In [8]:
## Check data for accuracy of corrections
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 373 entries, 4 to 376
Data columns (total 13 columns):
city                 373 non-null object
population           373 non-null float64
violentCrime         373 non-null float64
murder               373 non-null float64
revisedRape          373 non-null float64
legacyRape           373 non-null float64
robbery              373 non-null float64
assault              373 non-null float64
propertyCrime        373 non-null float64
burglary             373 non-null float64
theft                373 non-null float64
motorVehicleTheft    373 non-null float64
arson                373 non-null float64
dtypes: float64(12), object(1)
memory usage: 38.0+ KB


In [9]:
df = df.drop(columns = 'city')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 373 entries, 4 to 376
Data columns (total 12 columns):
population           373 non-null float64
violentCrime         373 non-null float64
murder               373 non-null float64
revisedRape          373 non-null float64
legacyRape           373 non-null float64
robbery              373 non-null float64
assault              373 non-null float64
propertyCrime        373 non-null float64
burglary             373 non-null float64
theft                373 non-null float64
motorVehicleTheft    373 non-null float64
arson                373 non-null float64
dtypes: float64(12)
memory usage: 35.1 KB


## Vanilla Logistic Regression

In [10]:
## Create dataframe with binary columns for use with Logistic Regression
dfLog = pd.DataFrame()
dfLog['violentCrime'] = np.where(df['violentCrime'] > 0, 1, 0)
dfLog['murder'] = np.where(df['murder'] > 0, 1, 0)
dfLog['revisedRape'] = np.where(df['revisedRape'] > 0, 1, 0)
dfLog['legacyRape'] = np.where(df['legacyRape'] > 0, 1, 0)
dfLog['robbery'] = np.where(df['robbery'] > 0, 1, 0)
dfLog['assault'] = np.where(df['assault'] > 0, 1, 0)
dfLog['propertyCrime'] = np.where(df['propertyCrime'] > 0, 1, 0)
dfLog['burglary'] = np.where(df['burglary'] > 0, 1, 0)
dfLog['theft'] = np.where(df['theft'] > 0, 1, 0)
dfLog['motorVehicleTheft'] = np.where(df['motorVehicleTheft'] > 0, 1, 0)
dfLog['arson'] = np.where(df['arson'] > 0, 1, 0)

dfLogXTrain = dfLog
dfLogXTrain = dfLogXTrain.drop(columns = 'propertyCrime')

In [68]:
## set up Logistic Regression Model
logRegModel = LogisticRegression(penalty = 'l2')

X_trainLog, X_testLog, y_trainLog, y_testLog = train_test_split(
    dfLogXTrain, dfLog['propertyCrime'], test_size=0.4, random_state=0)

y_trainLog = y_trainLog.values.reshape(-1,1)
y_testLog = y_testLog.values.reshape(-1,1)

## fit the model
fitLog = logRegModel.fit(X_trainLog, y_trainLog)

  y = column_or_1d(y, warn=True)


In [69]:
coefLog = logRegModel.coef_
print('Coefficients of Log Regression with L2 penalty:\n', coefLog)

Coefficients of Log Regression with L2 penalty:
 [[0.71728937 0.07105799 0.22255019 0.         0.32679451 0.60249071
  1.52218478 1.98644483 0.6000096  0.15166862]]


#### All features were used except for Legacy Rape which was a column with only zeros, so not any usable data. Essentially, all of the columns were used.

In [70]:
print('R-Squared Log Train L2 penalty:\n', logRegModel.score(X_trainLog, y_trainLog))
print('R-Squared Log Test L2 penalty:\n', logRegModel.score(X_testLog, y_testLog))

R-Squared Log Train L2 penalty:
 0.9820627802690582
R-Squared Log Test L2 penalty:
 0.9533333333333334


#### The R-Squared value is quite high but it was cross validated with a test dataset and that value is still decently high.

In [65]:
## set up Logistic Regression Model
logRegModel1 = LogisticRegression(penalty = 'l1')

X_trainLog1, X_testLog1, y_trainLog1, y_testLog1 = train_test_split(
    dfLogXTrain, dfLog['propertyCrime'], test_size=0.4, random_state=0)

y_trainLog1 = y_trainLog1.values.reshape(-1,1)
y_testLog1 = y_testLog1.values.reshape(-1,1)

## fit the model
fitLog1 = logRegModel1.fit(X_trainLog1, y_trainLog1)

  y = column_or_1d(y, warn=True)


In [66]:
coefLog1 = logRegModel1.coef_
print('Coefficients of Log Regression with L1 penalty:\n',coefLog1)

Coefficients of Log Regression with L1 penalty:
 [[0.         0.         0.         0.         0.         0.
  2.01388386 3.7229693  0.         0.        ]]


#### Buglary and Theft were the only features that were used which makes some sense because Property Crime is composed of Burlary, Theft, Motor Vehicle Theft and Arson. 

In [67]:
print('R-Squared Lasso Train L1 penalty', logRegModel1.score(X_trainLog1, y_trainLog1))
print('R-Squared Lasso Test L1 penalty', logRegModel1.score(X_testLog1, y_testLog1))

R-Squared Lasso Train L1 penalty 1.0
R-Squared Lasso Test L1 penalty 0.9933333333333333


#### The R-Squared value is 100%, meaning that the model perfectly explains variance in the data.  This would most likely indicate severe overfitting, but the test data has a very R-Squared value as well.

## Ridge Logistic Regression

In [23]:
## Create dataframe for use with Ridge Regression
dfRidge = df
dfRidge = dfRidge.drop(columns = 'propertyCrime')

In [24]:
## set up Ridge Regression Model
ridgeRegModel = linear_model.Ridge() 

X_trainRidge, X_testRidge, y_trainRidge, y_testRidge = train_test_split(
    dfRidge, df['propertyCrime'], test_size=0.4, random_state=0)

y_trainRidge = y_trainRidge.values.reshape(-1,1)
y_testRidge = y_testRidge.values.reshape(-1,1)

## fit the model
fitRidge = ridgeRegModel.fit(X_trainRidge, y_trainRidge)

In [25]:
coefRidge = ridgeRegModel.coef_
print(coefRidge)

[[ 1.88787161e-08  1.11903085e-04  3.69987406e-04 -1.11081387e-04
   0.00000000e+00 -7.82548290e-06 -1.44005427e-04  1.00001264e+00
   9.99999700e-01  9.99843935e-01  1.30182429e-04]]


#### All features were used except Robbery. This is perhaps not the most optimal selection because we know that Legacy Rape doesn't have any useable data, since all data is zeros. It may make sense to remove Robbery as it is not used in the original dataset to calculate Property Crime.

In [26]:
print('R-Squared Ridge Train', ridgeRegModel.score(X_trainRidge, y_trainRidge))
print('R-Squared Ridge Test', ridgeRegModel.score(X_testRidge, y_testRidge))

R-Squared Ridge Train 0.9999999999999922
R-Squared Ridge Test 0.9999990981000586


#### The R-Squared value for the trainin set is very high, which may indicate overfitting, however it was cross validated wih a test set.  

## Lasso Logistic Regression

In [27]:
## Create dataframe for use with Ridge Regression
dfLasso = df
dfLasso = dfLasso.drop(columns = 'propertyCrime')

In [28]:
## set up Ridge Regression Model
lassoRegModel = linear_model.Lasso(0.2) 

X_trainLasso, X_testLasso, y_trainLasso, y_testLasso = train_test_split(
    dfLasso, df['propertyCrime'], test_size=0.4, random_state=0)

y_trainLasso = y_trainLasso.values.reshape(-1,1)
y_testLasso = y_testLasso.values.reshape(-1,1)

## fit the model
fitLasso = lassoRegModel.fit(X_trainLasso, y_trainLasso)



In [29]:
coefLasso = lassoRegModel.coef_
print(coefLasso)

[ 1.29687155e-02  2.57143194e-01  7.88330860e+01  7.60774381e+00
  0.00000000e+00 -2.65824256e+00 -5.86285119e-02  3.68938663e+00
  6.57614715e-02 -6.49795285e+00 -8.33237738e-02]


#### Only Robbery was removed from this feature set as well.

In [30]:
print('R-Squared Lasso Train', lassoRegModel.score(X_trainLasso, y_trainLasso))
print('R-Squared Lasso Test', lassoRegModel.score(X_testLasso, y_testLasso))

R-Squared Lasso Train 0.9997563994313323
R-Squared Lasso Test 0.8834918491034294


#### The R-Squared value is high for the training set, but there is a noticable drop in the test set R-Squared value which would show a clear case of overfitting

## Conclusion

#### The best choice seems to be the Vanilla Logistic Regression with L1 penalty (regularization). This model uses less features than the others. This isn't always ideal, but we know that some of the columns (which are used to create the features) in the original dataset are used to create other columns. This can be seen where Violent Crime is made of Murder, Revised Rape, Legacy Rape, Robbery and Assault. This creates a lot of redundant data for the other Regression models which could make them overly complex and not efficient. Additionally, the two columns that are remaining in the L1 penalty Logistic Regression are Burglary and Theft which are used in the original data to make the Property Crime column. The R-Squared value for both the training and test data sets are nearly ideal which alone may indicate that there is overfitting in the model, but since both have good R-Squared values, the model seems to work well.

#### Some of the limitations of the Regression Models are that the Lasso and Ridge Models only work for continous variables and the Log Model only works for binary data. It would be useful to be able to combine both of these features in one model.