In [139]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import warnings
warnings.filterwarnings('ignore')

In [140]:
train = pd.read_csv('train.csv')
tune = pd.read_csv('tune.csv')

In [141]:
train.arrest.value_counts()

False    250616
True     101965
Name: arrest, dtype: int64

In [142]:
tune.arrest.value_counts()

False    71495
True     23695
Name: arrest, dtype: int64

In [129]:
train.isna().sum()

unique_key              0
case_number             0
date                    0
block                   0
iucr                    0
primary_type            0
description             0
location_description    0
arrest                  0
domestic                0
beat                    0
district                0
ward                    0
community_area          0
fbi_code                0
x_coordinate            0
y_coordinate            0
year                    0
updated_on              0
latitude                0
longitude               0
location                0
DateTime                0
time                    0
day                     0
month                   0
season                  0
crimes_per_year         0
prev_crimes_per_year    0
primary_type_grouped    0
dtype: int64

In [130]:
tune.isna().sum()

unique_key              0
case_number             0
date                    0
block                   0
iucr                    0
primary_type            0
description             0
location_description    0
arrest                  0
domestic                0
beat                    0
district                0
ward                    0
community_area          0
fbi_code                0
x_coordinate            0
y_coordinate            0
year                    0
updated_on              0
latitude                0
longitude               0
location                0
DateTime                0
time                    0
day                     0
month                   0
season                  0
crimes_per_year         0
prev_crimes_per_year    0
primary_type_grouped    0
dtype: int64

In [131]:
train.dtypes

unique_key                int64
case_number              object
date                     object
block                    object
iucr                     object
primary_type             object
description              object
location_description     object
arrest                     bool
domestic                   bool
beat                      int64
district                float64
ward                    float64
community_area          float64
fbi_code                 object
x_coordinate            float64
y_coordinate            float64
year                      int64
updated_on               object
latitude                float64
longitude               float64
location                 object
DateTime                 object
time                     object
day                       int64
month                     int64
season                   object
crimes_per_year           int64
prev_crimes_per_year      int64
primary_type_grouped     object
dtype: object

In [29]:
data = pd.read_csv('data.csv')

In [None]:
train.columns

In [31]:
zzz = pd.DataFrame(data.district.unique())

In [35]:
zzz.to_csv('districts.csv', index=False)

In [3]:
train.columns

Index(['unique_key', 'case_number', 'date', 'block', 'iucr', 'primary_type',
       'description', 'location_description', 'arrest', 'domestic', 'beat',
       'district', 'ward', 'community_area', 'fbi_code', 'x_coordinate',
       'y_coordinate', 'year', 'updated_on', 'latitude', 'longitude',
       'location', 'DateTime', 'time', 'day', 'month', 'season',
       'crimes_per_year'],
      dtype='object')

In [4]:
train['DateTime'] = pd.to_datetime(train['date'])
tune['DateTime'] = pd.to_datetime(tune['date'])


train['hour'] = train['DateTime'].dt.hour
tune['hour'] = tune['DateTime'].dt.hour

In [113]:
train = train[['primary_type_grouped','description','location_description','domestic','year','district','crimes_per_year','season','ward','community_area','month','x_coordinate','y_coordinate']]
tune = tune[['primary_type_grouped','description','location_description','domestic','year','district','crimes_per_year','season','ward','community_area','month','x_coordinate','y_coordinate']]

In [84]:
train_labels = train['district']
tune_labels = tune['district']

In [85]:
train.drop('district', inplace=True,axis=1)
tune.drop('district', inplace=True,axis=1)

In [86]:
#train['primary_type'] = pd.factorize(train['primary_type'])[0]
train['location_description'] = pd.factorize(train['location_description'])[0]
train['description'] = pd.factorize(train['description'])[0]

#tune['primary_type'] = pd.factorize(tune['primary_type'])[0]
tune['location_description'] = pd.factorize(tune['location_description'])[0]
tune['description'] = pd.factorize(tune['description'])[0]

In [87]:
train = pd.get_dummies(train)
tune = pd.get_dummies(tune)

In [89]:
model = LogisticRegression().fit(train, train_labels)
y_pred = model.predict(tune)
score = model.score(tune, tune_labels)
print(f"Score: {score}")

Score: 0.24568969996778317


In [90]:
model = DecisionTreeClassifier().fit(train, train_labels)
y_pred = model.predict(tune)
score = model.score(tune, tune_labels)
print(f"Score: {score}")

Score: 0.8154910988017418


In [91]:
model = RandomForestClassifier().fit(train, train_labels)
y_pred = model.predict(tune)
score = model.score(tune, tune_labels)
print(f"Score: {score}")

Score: 0.8656869979111024


In [100]:
def fill(val, fill_in):
    
    if pd.isnull(val):
        return fill_in
    else:
        return val

In [118]:
train['primary_type_grouped'].isna().sum()

KeyError: 'primary_type_grouped'

In [120]:
train_labels = pd.factorize(train['primary_type_grouped'])[0]


In [122]:
train = train[~pd.isnull(train['x_coordinate'])]

In [125]:
tune = tune[~pd.isnull(tune['x_coordinate'])]

In [137]:
train.isna().sum()

unique_key              0
case_number             0
date                    0
block                   0
iucr                    0
primary_type            0
description             0
location_description    0
arrest                  0
domestic                0
beat                    0
district                0
ward                    0
community_area          0
fbi_code                0
x_coordinate            0
y_coordinate            0
year                    0
updated_on              0
latitude                0
longitude               0
location                0
DateTime                0
time                    0
day                     0
month                   0
season                  0
crimes_per_year         0
prev_crimes_per_year    0
primary_type_grouped    0
dtype: int64

In [127]:
tune.isna().sum()

unique_key              0
case_number             0
date                    0
block                   0
iucr                    0
primary_type            0
description             0
location_description    0
arrest                  0
domestic                0
beat                    0
district                0
ward                    0
community_area          0
fbi_code                0
x_coordinate            0
y_coordinate            0
year                    0
updated_on              0
latitude                0
longitude               0
location                0
DateTime                0
time                    0
day                     0
month                   0
season                  0
crimes_per_year         0
prev_crimes_per_year    0
primary_type_grouped    4
dtype: int64

In [135]:
len(train.block.unique())

40421

In [143]:
train = train[['arrest','primary_type_grouped','description','location_description','domestic','year','crimes_per_year','season','month','beat','district','community_area','ward','block','x_coordinate','y_coordinate']]
tune = tune[['arrest','primary_type_grouped','description','location_description','domestic','year','crimes_per_year','season','month','beat','district','community_area','ward','block','x_coordinate','y_coordinate']]

train_labels = pd.factorize(train['arrest'])[0]
tune_labels = pd.factorize(tune['arrest'])[0]

train.drop('arrest', inplace=True,axis=1)
tune.drop('arrest', inplace=True,axis=1)

#train['primary_type'] = pd.factorize(train['primary_type'])[0]
train['location_description'] = pd.factorize(train['location_description'])[0]
train['description'] = pd.factorize(train['description'])[0]
train['block'] = pd.factorize(train['block'])[0]


#tune['primary_type'] = pd.factorize(tune['primary_type'])[0]
tune['location_description'] = pd.factorize(tune['location_description'])[0]
tune['description'] = pd.factorize(tune['description'])[0]
tune['block'] = pd.factorize(tune['block'])[0]

train = pd.get_dummies(train)
tune = pd.get_dummies(tune)

model = LogisticRegression().fit(train, train_labels)
y_pred = model.predict(tune)
score = model.score(tune, tune_labels)
print(f"Logistic Regression Score: {score}")

model = DecisionTreeClassifier().fit(train, train_labels)
y_pred = model.predict(tune)
score = model.score(tune, tune_labels)
print(f"Decision Tree Score: {score}")

model = RandomForestClassifier().fit(train, train_labels)
y_pred = model.predict(tune)
score = model.score(tune, tune_labels)
print(f"Random Forest Score: {score}")

Logistic Regression Score: 0.7510767937808593
Decision Tree Score: 0.694453198865427
Random Forest Score: 0.8203907973526631


In [99]:
train_labels.isna().sum()

18