In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
from sklearn import linear_model
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
%matplotlib inline
pd.options.display.float_format = '{:.3f}'.format

# Package for handling logistic regression
from sklearn.linear_model import LogisticRegression

import statsmodels.api as sm

# Suppress annoying harmless error.
import warnings
warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")

  from pandas.core import datetools


In [2]:
# Importing our data
data = pd.read_excel('Data\\table_8_offenses_known_to_law_enforcement_virginia_by_city_2013.xls', 
                     encoding='latin1', skiprows=list(range(4)))

# This column contains no data, so we're going to remove it
data.drop(['Rape\n(legacy\ndefinition)2'], axis = 1, inplace = True)

# Cleaning up our column names
data.columns = data.columns.str.replace('\n', ' ')
data = data.rename(index = str, columns={'Rape (revised definition)1': 'Rape'})
data.drop(['City'], axis = 1, inplace = True)
data = data.dropna()
data = data[:-1]

In [3]:
data.head()

Unnamed: 0,Population,Violent crime,Murder and nonnegligent manslaughter,Rape,Robbery,Aggravated assault,Property crime,Burglary,Larceny- theft,Motor vehicle theft,Arson
0,8186.0,10.0,0.0,3.0,1.0,6.0,233.0,20.0,198.0,15.0,4.0
1,148519.0,258.0,5.0,21.0,118.0,114.0,2967.0,249.0,2427.0,291.0,13.0
2,3486.0,8.0,0.0,0.0,2.0,6.0,56.0,4.0,52.0,0.0,0.0
3,2223.0,2.0,0.0,2.0,0.0,0.0,27.0,6.0,19.0,2.0,0.0
4,1728.0,12.0,0.0,2.0,2.0,8.0,77.0,25.0,51.0,1.0,0.0


In [4]:
data.describe()

Unnamed: 0,Population,Violent crime,Murder and nonnegligent manslaughter,Rape,Robbery,Aggravated assault,Property crime,Burglary,Larceny- theft,Motor vehicle theft,Arson
count,148.0,148.0,148.0,148.0,148.0,148.0,148.0,148.0,148.0,148.0,148.0
mean,20170.378,65.014,1.304,6.953,20.365,36.392,612.554,100.926,478.953,32.676,3.277
std,54170.809,201.049,4.796,18.482,72.99,111.979,1729.381,312.329,1325.7,115.662,11.514
min,112.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1438.5,1.0,0.0,0.0,0.0,0.0,19.0,3.0,16.75,0.0,0.0
50%,3347.5,5.0,0.0,1.0,0.0,4.0,62.5,9.0,54.5,2.0,0.0
75%,12460.25,25.25,0.0,4.0,6.0,13.25,272.0,27.25,251.5,11.0,1.0
max,450687.0,1418.0,37.0,140.0,624.0,842.0,11226.0,2039.0,9374.0,938.0,99.0


In [5]:
# Define the training and test sizes.
trainsize = int(data.shape[0] / 2)

# Create a dataframe where property crimes exceed 100
y = pd.DataFrame()
y['Property crime'] = np.where(data['Property crime'] < 100, 0, 1)

# Drop property crime from the dataframe since that's the 
# variable that we want to predict
data.drop(['Property crime'], axis = 1, inplace = True)

In [6]:
# Construct the test and training sets
df_test = data.iloc[trainsize:, :].copy()
y_test = y.iloc[trainsize:, :].copy()
df_train = data.iloc[:trainsize, :].copy()
y_train = y.iloc[:trainsize, :].copy()

In [7]:
# Builds the regression model
def regressionModelBuilder(X, y):
    lr = LogisticRegression(C=1e9)
    fit = lr.fit(X, y)

    # Display
    print('Coefficients')
    print(fit.coef_)
    print(fit.intercept_)
    pred_y_sklearn = lr.predict(X)

    print('\n Accuracy by admission status')
    print(pd.crosstab(pred_y_sklearn, y))

    print('\n Percentage accuracy')
    print(lr.score(X, y))

In [8]:
regressionModelBuilder(df_test, y_test)

Coefficients
[[-1.93761902e-04  3.27444050e-03  7.55980026e-05  3.75424145e-04
   1.06023788e-03  1.76318048e-03  2.51948699e-04  1.96495106e-02
   1.07589737e-03 -3.41016019e-04]]
[-0.00137817]

 Accuracy by admission status
col_0  (P, r, o, p, e, r, t, y,  , c, r, i, m, e)
row_0                                            
0                                              19
1                                              55

 Percentage accuracy
0.7432432432432432


  y = column_or_1d(y, warn=True)


In [9]:
regressionModelBuilder(df_train, y_test)

Coefficients
[[ 1.40544758e-04 -5.90381538e-02  6.99888136e-02 -2.34005951e-01
   1.99728766e-01 -9.47497821e-02  4.72857597e-03  1.15471906e-02
  -1.41360155e-01 -3.57753928e-01]]
[-0.16655995]

 Accuracy by admission status
col_0  (P, r, o, p, e, r, t, y,  , c, r, i, m, e)
row_0                                            
0                                              42
1                                              32

 Percentage accuracy
0.6216216216216216


  y = column_or_1d(y, warn=True)


In [10]:
ridgeregr = linear_model.Ridge(alpha=10, fit_intercept=False) 
ridgeregr.fit(df_train, y_train)
print(ridgeregr.score(df_train, y_train))
origparams = ridgeregr.coef_[0]
print(origparams)

0.3762206850794799
[ 1.06013834e-05 -3.42599430e-02 -1.44434599e-01  4.39060684e-02
  4.65222375e-02  1.97463506e-02 -2.07694190e-04  2.12858458e-03
 -1.64975855e-02  7.81198966e-03]


In [11]:
# Plot a line for each parameter.
for var in varstoplot:
    plt.plot(estimates_df['lambda'], estimates_df[var])
    labels.append(var)
plt.xlabel('lambda')
plt.ylabel('Parameter estimate size')
plt.legend(labels)
plt.show()

NameError: name 'varstoplot' is not defined