In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf

from statsmodels.sandbox.regression.predstd import wls_prediction_std
from matplotlib import pyplot as plt
from itertools import combinations
from sklearn import linear_model, feature_selection
from sklearn.decomposition import PCA
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import roc_auc_score, mean_squared_error, r2_score
from sklearn import svm
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import Imputer
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

%matplotlib inline
pd.options.display.float_format = '{:.3f}'.format

# Suppress a harmless scipy warning (not an error) whose message starts with "internal gelsd".
import warnings
warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")



In [2]:
# Replacement labels for the CSV columns; with header=0 the file's own header
# row is consumed and these names are used instead.
column_names = [
    'City', 'Population', 'ViolentCrime', 'Manslaughter', 'RapeCurrent',
    'RapeLegacy', 'Robbery', 'AggravatedAssault', 'PropertyCrime',
    'Burglary', 'LarcenyTheft', 'MotorVehicleTheft', 'Arson3',
]
# Columns removed before modelling.
unused_columns = ['City', 'ViolentCrime', 'Manslaughter', 'RapeCurrent', 'RapeLegacy']

crime = pd.read_csv('offenses_by_city_2013.csv', header=0, names=column_names)
crime = crime.drop(unused_columns, axis=1)
crime.head(10)

Unnamed: 0,Population,Robbery,AggravatedAssault,PropertyCrime,Burglary,LarcenyTheft,MotorVehicleTheft,Arson3
0,1861,0,0,12,2,10,0,0.0
1,2577,0,3,24,3,20,1,0.0
2,2846,0,3,16,1,15,0,0.0
3,97956,227,526,4090,705,3243,142,
4,6388,4,16,223,53,165,5,
5,4089,3,2,46,10,36,0,
6,1781,0,3,10,0,10,0,0.0
7,118296,31,68,2118,204,1882,32,3.0
8,9519,4,3,210,16,188,6,1.0
9,18182,12,18,405,99,291,15,0.0


In [3]:
# Convert every column to float64 and mean-impute missing values.
# Done with pandas only so that (a) the cell can be re-run safely -- calling
# .str on a column that is already numeric raises AttributeError -- and
# (b) the cell no longer needs sklearn's Imputer class.
for col in crime.columns:
    if crime[col].dtype == object:
        # Text columns: remove commas (presumably thousands separators)
        # so the values can be parsed as numbers.
        crime[col] = crime[col].str.replace(",", "")
    crime[col] = pd.to_numeric(crime[col]).astype('float64')

# Replace each missing value with the mean of its own column.
crime = crime.fillna(crime.mean())

In [4]:
# Check the column dtypes after cleaning and imputation.
crime.dtypes

Population           float64
Robbery              float64
AggravatedAssault    float64
PropertyCrime        float64
Burglary             float64
LarcenyTheft         float64
MotorVehicleTheft    float64
Arson3               float64
dtype: object

In [5]:
# Cast every column to int64; astype truncates the fractional part of any
# mean-imputed value.
# DataFrame.apply hands pd.to_numeric one whole column at a time, so the
# conversion is vectorised instead of being invoked once per cell value.
crime = crime.apply(pd.to_numeric).astype('int64')

In [6]:
# Verify that the integer cast took effect.
crime.dtypes

Population           int64
Robbery              int64
AggravatedAssault    int64
PropertyCrime        int64
Burglary             int64
LarcenyTheft         int64
MotorVehicleTheft    int64
Arson3               int64
dtype: object

In [7]:
# Summary statistics for every column.
crime.describe()

Unnamed: 0,Population,Robbery,AggravatedAssault,PropertyCrime,Burglary,LarcenyTheft,MotorVehicleTheft,Arson3
count,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0
mean,40037.627,72.895,121.259,792.601,119.678,637.017,35.897,1.464
std,448104.485,1026.605,1698.804,7626.827,920.976,6318.799,401.691,7.808
min,526.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3010.5,0.0,1.0,41.0,6.0,31.0,0.0,0.0
50%,7411.0,1.0,5.0,114.0,18.0,95.0,2.0,1.0
75%,19324.5,5.0,14.5,343.0,52.5,290.0,7.0,1.0
max,8396126.0,19170.0,31767.0,141971.0,16606.0,117931.0,7434.0,132.0


In [8]:
# Count missing values per column and show the five largest counts.
crime.isnull().sum().sort_values(ascending=False).head()

Arson3               0
MotorVehicleTheft    0
LarcenyTheft         0
Burglary             0
PropertyCrime        0
dtype: int64

In [9]:
# Everything above should be copied for use in other regressions

In [10]:
# Features: every column except the target. `axis` is passed by keyword
# because pandas >= 2.0 no longer accepts it positionally.
# NOTE(review): PropertyCrime looks like the sum of Burglary, LarcenyTheft and
# MotorVehicleTheft, all of which remain in X -- confirm this is intended.
X = crime.drop('PropertyCrime', axis=1)
# Target: the PropertyCrime column.
y = crime['PropertyCrime']

In [11]:
def add_interactions(X):
    """Return ``X`` extended with every pairwise product of its columns.

    The result holds the original columns (cast to float64) followed by one
    column per unordered pair ``(a, b)``, named ``"a_b"`` and equal to
    ``X[a] * X[b]``, in ``itertools.combinations`` order. Any column --
    original or interaction -- whose values are all zero is then dropped.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric feature table with string column labels.

    Returns
    -------
    pandas.DataFrame
        New float64 frame that keeps ``X``'s index; ``X`` is not modified.
    """
    names = list(X.columns)
    values = np.asarray(X, dtype='float64')

    # Labels and values are built from the same pair list so that a name can
    # never end up attached to the wrong product.
    pairs = list(combinations(range(len(names)), 2))
    labels = names + ['_'.join((names[i], names[j])) for i, j in pairs]
    columns = [values[:, i] for i in range(len(names))]
    columns += [values[:, i] * values[:, j] for i, j in pairs]

    # Keep the caller's index so rows stay aligned with the target series.
    result = pd.DataFrame(np.column_stack(columns), index=X.index, columns=labels)

    # Drop columns that are zero everywhere. A positional boolean mask is used
    # so that duplicate labels cannot cause extra columns to be removed.
    keep = np.asarray((result != 0).any(axis=0))
    return result.loc[:, keep]

In [12]:
# Build the interaction features from `crime` rather than from `X` itself, so
# re-running this cell does not stack interactions on top of interactions.
X = add_interactions(crime.drop('PropertyCrime', axis=1))
print(X.head(5))

   Population  Robbery  AggravatedAssault  Burglary  LarcenyTheft  \
0    1861.000    0.000              0.000     2.000        10.000   
1    2577.000    0.000              3.000     3.000        20.000   
2    2846.000    0.000              3.000     1.000        15.000   
3   97956.000  227.000            526.000   705.000      3243.000   
4    6388.000    4.000             16.000    53.000       165.000   

   MotorVehicleTheft  Arson3  Population_Robbery  \
0              0.000   0.000               0.000   
1              1.000   0.000               0.000   
2              0.000   0.000               0.000   
3            142.000   1.000        22236012.000   
4              5.000   1.000           25552.000   

   Population_AggravatedAssault  Population_Burglary  \
0                         0.000             3722.000   
1                      7731.000             7731.000   
2                      8538.000             2846.000   
3                  51524856.000         69058980

In [13]:
# Reduce the interaction feature matrix to 10 principal components.
# NOTE(review): X is not standardised first, so the largest-magnitude columns
# will presumably dominate the components -- confirm this is intended.
pca = PCA(n_components=10)
principal_components = pca.fit_transform(X)
X_pca = pd.DataFrame(principal_components)

In [14]:
# Preview the first five rows of the principal-component scores.
print(X_pca.head(5))

                0            1            2             3           4  \
0 -3012318183.210 -4484135.945 -1730978.710    168983.521  106593.485   
1 -3012284454.572 -4484069.527 -1727585.395    165260.928  106693.401   
2 -3012293404.019 -4488773.545 -1729494.921    165325.139  108005.481   
3 -2686014457.694 28482332.533 34840481.663 -10897451.717 1284703.623   
4 -3011264711.313 -4254515.000 -1556538.726    115384.755  126183.582   

             5           6          7         8          9  
0   -49444.581  -13940.915  -5387.518 -4005.956  -3140.881  
1   -47501.406  -14257.182  -5040.506 -3713.226  -2780.759  
2   -48865.012  -14155.095  -4940.118 -3575.827  -2604.208  
3 -1088510.661 -606225.143 162919.376 60970.081 -57216.793  
4   -63422.361   -8229.998  -3332.662 -1938.278  -1320.973  


In [15]:
# Split into training (70%) and held-out rows; random_state fixes the shuffle.
split_parts = train_test_split(X_pca, y, train_size=0.70, random_state=1)
X_train, X_test, y_train, y_test = split_parts

In [16]:
# Report the shape at each stage: cleaned data, expanded features,
# PCA scores, and target.
for stage in (crime, X, X_pca, y):
    print(stage.shape)

(351, 8)
(351, 28)
(351, 10)
(351,)


In [17]:
# Inspect the first training rows; the index shows the shuffled original row labels.
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
41,-3012099771.078,-4469811.275,-1680979.222,159463.079,108551.103,-45399.372,-14008.552,-3756.931,-3068.885,-1744.33
38,-3012244286.015,-4486825.038,-1716288.345,163655.002,107450.43,-49177.828,-14184.915,-5211.407,-3889.376,-2914.305
213,-3009779294.127,-4522104.757,-1273398.779,32920.978,100637.312,-18556.03,-20101.979,619.332,-458.11,3946.715
159,-3011396548.527,-4255420.903,-1501828.401,142508.51,72534.601,-28853.03,-7019.37,2329.104,-301.222,277.381
251,-3012302922.725,-4474025.92,-1728228.674,168828.427,109606.313,-49558.622,-13536.799,-4870.809,-3843.312,-2847.148


In [18]:
# Score each feature against the target and keep the k best.
# The target is continuous, so the univariate F-test for regression is used;
# SelectKBest's default score function (f_classif) is meant for class labels.
# NOTE(review): X_train has only 10 columns, so k=10 keeps all of them, and the
# *_selected frames are not what the model cell fits on -- confirm intent.
select = feature_selection.SelectKBest(score_func=feature_selection.f_regression, k=10)
selected_features = select.fit(X_train, y_train)
indices_selected = selected_features.get_support(indices=True)
colnames_selected = [X_pca.columns[i] for i in indices_selected]

X_train_selected = X_train[colnames_selected]
X_test_selected = X_test[colnames_selected]

In [19]:
# Show which columns were kept by feature selection.
print(colnames_selected)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [20]:
# Fit a linear regression on the training split.
# NOTE(review): this fits on X_train, not X_train_selected; the printed
# selection lists all 10 columns, so the two should hold the same data here.
regression_model = LinearRegression()
regression_model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [21]:
# Predict the target for the held-out rows.
y_pred = regression_model.predict(X_test)

In [22]:
# Shapes of the training target and training features, in that order.
for part in (y_train, X_train):
    print(part.shape)

(245,)
(245, 10)


In [23]:
# Score the predictions against the held-out targets.
test_mse = mean_squared_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)

# Fitted weight for each input column.
print('Coefficients: \n', regression_model.coef_)
# Average squared prediction error.
print("Mean squared error: %.2f" % test_mse)
# r2_score is the coefficient of determination (R^2); 1 is a perfect fit.
print('Variance score: %.2f' % test_r2)

Coefficients: 
 [ 1.34983703e-07  2.48421886e-05  4.37404734e-05 -2.80344846e-05
  7.52224763e-05  1.78601488e-04  8.39643403e-04  7.67140884e-03
  1.80762523e-02  7.70716440e-03]
Mean squared error: 129367.75
Variance score: 0.88
