In [1]:
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import pyodbc

server='localhost'
database='CrimeTimeDW'
username='awdemo'
password='Atlanta2022'

cnxn = pyodbc.connect('DRIVER={ODBC Driver 17 for SQL Server};SERVER='+server+';DATABASE='+database+';UID='+username+';PWD='+ password)

df = pd.read_sql_query("""select d.weekdayNo, d.Month_no, w.description as weatherdesc, w.temperature_C, f.neighborhoodKey, f.lat, f.long, f.crimeTypeKey as crime
from dimDate d JOIN weather w on d.datekey = w.DateKey
JOIN factCrime f ON f.dateKey = d.datekey 
JOIN dimCrime c ON f.crimeTypeKey = c.CrimeTypeKey
""", cnxn)
df.to_csv('datasets/MLcrimetype_all.csv')

In [2]:

# load the dataset
print("Loading Data")
crimes = pd.read_csv('datasets\\MLcrimetype_all.csv')

del crimes['Unnamed: 0']
crimes

Loading Data


Unnamed: 0,weekdayNo,Month_no,weatherdesc,temperature_C,neighborhoodKey,lat,long,crime
0,6,10,sky is clear,2.00,173,33.74574,-84.42509,3
1,6,10,sky is clear,2.00,229,33.80901,-84.37487,3
2,6,10,sky is clear,2.00,79,33.75664,-84.41175,3
3,6,10,sky is clear,2.00,148,33.74173,-84.40283,3
4,6,10,sky is clear,2.00,79,33.75611,-84.41453,3
...,...,...,...,...,...,...,...,...
129181,4,9,sky is clear,22.35,192,33.73166,-84.37352,4
129182,4,9,sky is clear,22.35,16,33.85209,-84.36450,4
129183,4,9,sky is clear,22.35,231,33.75276,-84.34968,4
129184,4,9,sky is clear,22.35,67,33.76686,-84.45461,4


In [3]:
# classification of categories
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

le.fit(crimes['weatherdesc'])
crimes['weatherdesc'] = le.transform(crimes['weatherdesc'])

crimes

Unnamed: 0,weekdayNo,Month_no,weatherdesc,temperature_C,neighborhoodKey,lat,long,crime
0,6,10,17,2.00,173,33.74574,-84.42509,3
1,6,10,17,2.00,229,33.80901,-84.37487,3
2,6,10,17,2.00,79,33.75664,-84.41175,3
3,6,10,17,2.00,148,33.74173,-84.40283,3
4,6,10,17,2.00,79,33.75611,-84.41453,3
...,...,...,...,...,...,...,...,...
129181,4,9,17,22.35,192,33.73166,-84.37352,4
129182,4,9,17,22.35,16,33.85209,-84.36450,4
129183,4,9,17,22.35,231,33.75276,-84.34968,4
129184,4,9,17,22.35,67,33.76686,-84.45461,4


In [4]:
# auto theft as one, others 0
condition = crimes['crime'] != 5
#crimes[condition] = crimes['crime']
crimes.loc[condition, 'crime'] = 0
crimes

Unnamed: 0,weekdayNo,Month_no,weatherdesc,temperature_C,neighborhoodKey,lat,long,crime
0,6,10,17,2.00,173,33.74574,-84.42509,0
1,6,10,17,2.00,229,33.80901,-84.37487,0
2,6,10,17,2.00,79,33.75664,-84.41175,0
3,6,10,17,2.00,148,33.74173,-84.40283,0
4,6,10,17,2.00,79,33.75611,-84.41453,0
...,...,...,...,...,...,...,...,...
129181,4,9,17,22.35,192,33.73166,-84.37352,0
129182,4,9,17,22.35,16,33.85209,-84.36450,0
129183,4,9,17,22.35,231,33.75276,-84.34968,0
129184,4,9,17,22.35,67,33.76686,-84.45461,0


In [5]:
crimes.loc[condition == False, 'crime'] = 1

In [7]:
# Separate features and labels
X, y = crimes[['weekdayNo', 'Month_no', 'weatherdesc','temperature_C','neighborhoodKey', 'lat', 'long']].values, crimes['crime'].values
print(X)
print(y)

[[  6.       10.       17.      ... 173.       33.74574 -84.42509]
 [  6.       10.       17.      ... 229.       33.80901 -84.37487]
 [  6.       10.       17.      ...  79.       33.75664 -84.41175]
 ...
 [  4.        9.       17.      ... 231.       33.75276 -84.34968]
 [  4.        9.       17.      ...  67.       33.76686 -84.45461]
 [  4.        9.       17.      ... 188.       33.71664 -84.37816]]
[0 0 0 ... 0 0 0]


In [8]:
# Scaling data with Scaler, you need Import for scaler as below
from sklearn.preprocessing import StandardScaler
#-----------------------------------------------------------------------------

trans = StandardScaler()
X = trans.fit_transform(X)
X
# <<<<<<<<<<<<< your code here


array([[ 1.02959607,  0.91499101,  0.80361776, ...,  0.19770567,
        -0.24614546, -0.40808779],
       [ 1.02959607,  0.91499101,  0.80361776, ...,  0.91139406,
         1.16451767,  0.67536772],
       [ 1.02959607,  0.91499101,  0.80361776, ..., -1.00027127,
        -0.00311989, -0.12028818],
       ...,
       [ 0.03554577,  0.63166943,  0.80361776, ...,  0.93688293,
        -0.08962807,  1.21882141],
       [ 0.03554577,  0.63166943,  0.80361776, ..., -1.1532045 ,
         0.22474446, -1.0449577 ],
       [ 0.03554577,  0.63166943,  0.80361776, ...,  0.3888722 ,
        -0.89495686,  0.60438865]])

In [9]:
# Split data into training set and test set:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)


In [10]:
# Set regularization hyperparameter
reg = 0.1
# Use this model for prediction: LogisticRegression(C=1/reg, solver="liblinear")

model = LogisticRegression(C=1/reg, solver="liblinear").fit(X_train, y_train)




In [11]:
# calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print('Accuracy:', acc)


Accuracy: 0.864253276911962


In [12]:
# calculate AUC
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test,y_scores[:,1])
#auc = roc_auc_score(y_test,y_scores, multi_class='ovr')

print('AUC: ' + str(auc))


AUC: 0.5950099846953074


In [14]:
# load dataset with 5 new rows
print("Loading Data")
predict = pd.read_csv('datasets\\MLcrimetype_predict.csv')

del predict['Unnamed: 0']
del predict['crime']

# print rows to predict

predict_fitted = predict.copy()
predict = predict[-5:]
predict

Loading Data


Unnamed: 0,weekdayNo,Month_no,weatherdesc,temperature_C,neighborhoodKey,lat,long
129186,1,7,scattered clouds,21.35,73,33.71893,-84.3394
129187,4,10,sky is clear,22.35,67,33.78558,-84.4556
129188,1,4,sky is clear,12.35,200,33.73686,-84.37883
129189,4,8,sky is clear,22.35,197,32.70432,-84.57693
129190,2,10,mist,17.35,200,33.72687,-84.37883


In [16]:
# classify categories and fit the data
le.fit(predict_fitted['weatherdesc'])
predict_fitted['weatherdesc'] = le.transform(predict_fitted['weatherdesc'])

predict_fitted = trans.fit_transform(predict_fitted)
predict_fitted = predict_fitted[-5:]
predict_fitted

array([[ -1.4554994 ,   0.06501564,   0.62081986,   0.97073009,
         -1.07673987,  -0.84192562,   1.44055093],
       [  0.03557515,   0.91498887,   0.80360343,   1.09105604,
         -1.15320698,   0.64095949,  -1.06624816],
       [ -1.4554994 ,  -0.78495759,   0.80360343,  -0.11220341,
          0.54181391,  -0.44300394,   0.58992194],
       [  0.03557515,   0.34834005,   0.80360343,   1.09105604,
          0.50358035, -23.41582004,  -3.68371747],
       [ -0.95847455,   0.91498887,  -0.11031446,   0.48942631,
          0.54181391,  -0.66526984,   0.58992194]])

In [17]:
#### predict case

y_pred = model.predict(predict_fitted)
print(y_pred)
print('Car stolen:')
for x in y_pred:
    if x == 1:
        print(predict.iloc[np.where(y_pred==x)])

[0 0 0 1 0]
Car stolen:
        weekdayNo  Month_no   weatherdesc  temperature_C  neighborhoodKey  \
129189          4         8  sky is clear          22.35              197   

             lat      long  
129189  32.70432 -84.57693  
