In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the labelled training set from the notebook's working directory.
all_data = pd.read_csv(filepath_or_buffer='./train.csv')

In [3]:
# Show the loaded frame; the rendered table is truncated to the first and last rows.
all_data

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541
...,...,...,...,...,...,...,...,...,...
878044,2003-01-06 00:15:00,ROBBERY,ROBBERY ON THE STREET WITH A GUN,Monday,TARAVAL,NONE,FARALLONES ST / CAPITOL AV,-122.459033,37.714056
878045,2003-01-06 00:01:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Monday,INGLESIDE,NONE,600 Block of EDNA ST,-122.447364,37.731948
878046,2003-01-06 00:01:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Monday,SOUTHERN,NONE,5TH ST / FOLSOM ST,-122.403390,37.780266
878047,2003-01-06 00:01:00,VANDALISM,"MALICIOUS MISCHIEF, VANDALISM OF VEHICLES",Monday,SOUTHERN,NONE,TOWNSEND ST / 2ND ST,-122.390531,37.780607


## Data fields
- Dates - timestamp of the crime incident
- Category - category of the crime incident (only in train.csv). This is the target variable you are going to predict.
- Descript - detailed description of the crime incident (only in train.csv)
- DayOfWeek - the day of the week
- PdDistrict - name of the Police Department District
- Resolution - how the crime incident was resolved (only in train.csv)
- Address - the approximate street address of the crime incident 
- X - Longitude
- Y - Latitude

In [4]:
# Summary statistics; the output covers the numeric columns X and Y.
# NOTE(review): the maxima (X = -120.5, Y = 90.0) lie far outside the quartiles
# shown in the output -- presumably placeholder coordinates; confirm before
# relying on X/Y as features.
all_data.describe()

Unnamed: 0,X,Y
count,878049.0,878049.0
mean,-122.422616,37.77102
std,0.030354,0.456893
min,-122.513642,37.707879
25%,-122.432952,37.752427
50%,-122.41642,37.775421
75%,-122.406959,37.784369
max,-120.5,90.0


In [5]:
# Target vector: the crime category of every training row.
y = all_data.loc[:, "Category"]

In [6]:
# Feature frame: drops the target (Category) and the columns that the data
# description marks as train-only (Descript, Resolution), plus Address.
X_train = all_data.loc[:, ["Dates", "DayOfWeek", "PdDistrict", "X", "Y"]]

In [7]:
# Inspect the selected feature columns.
X_train

Unnamed: 0,Dates,DayOfWeek,PdDistrict,X,Y
0,2015-05-13 23:53:00,Wednesday,NORTHERN,-122.425892,37.774599
1,2015-05-13 23:53:00,Wednesday,NORTHERN,-122.425892,37.774599
2,2015-05-13 23:33:00,Wednesday,NORTHERN,-122.424363,37.800414
3,2015-05-13 23:30:00,Wednesday,NORTHERN,-122.426995,37.800873
4,2015-05-13 23:30:00,Wednesday,PARK,-122.438738,37.771541
...,...,...,...,...,...
878044,2003-01-06 00:15:00,Monday,TARAVAL,-122.459033,37.714056
878045,2003-01-06 00:01:00,Monday,INGLESIDE,-122.447364,37.731948
878046,2003-01-06 00:01:00,Monday,SOUTHERN,-122.403390,37.780266
878047,2003-01-06 00:01:00,Monday,SOUTHERN,-122.390531,37.780607


In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# Feature groups handled by the preprocessing step. Columns of the input frame
# that appear in neither list are not passed to any transformer.
num_attributes = ["X", "Y"]
cat_attributes = ["DayOfWeek"]

# NOTE(review): X/Y are forwarded without scaling, while the SVC and
# SGDClassifier fitted later in this notebook are scale-sensitive -- consider
# adding a scaler to the numeric branch.
pipeline = ColumnTransformer([
  # Fill missing coordinates using SimpleImputer's default strategy.
  ("num", SimpleImputer(), num_attributes),
  # One-hot encode the weekday. handle_unknown="ignore" makes transform() emit
  # an all-zero block for a category not seen during fit instead of raising,
  # so transforming the test set cannot fail on an unexpected value.
  ("cat", OneHotEncoder(handle_unknown="ignore"), cat_attributes),
])

In [9]:
# Fit the preprocessing transformer on the training features and transform them.
X_prepared = pipeline.fit_transform(X_train)

In [10]:
# Preview the first ten transformed rows: the numeric columns (X, Y) come first,
# followed by the one-hot weekday columns.
pd.DataFrame(X_prepared[:10])

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,-122.425892,37.774599,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-122.425892,37.774599,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,-122.424363,37.800414,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-122.426995,37.800873,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,-122.438738,37.771541,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5,-122.403252,37.713431,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6,-122.423327,37.725138,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7,-122.371274,37.727564,0.0,0.0,0.0,0.0,0.0,0.0,1.0
8,-122.508194,37.776601,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9,-122.419088,37.807802,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [11]:
from sklearn.svm import SVC

# Quick baseline fitted on the first 1000 rows only.
# NOTE(review): the rows are taken in file order rather than sampled, so the
# model only learns the categories that occur in this slice.
# probability=True fits the probability estimates with an internal
# cross-validation that shuffles the data; fixing random_state makes the
# resulting probabilities reproducible from run to run.
svm_clf = SVC(probability=True, random_state=42)
svm_clf.fit(X_prepared[:1000], y[:1000])

SVC(probability=True)

In [12]:
# Sanity check: predicted class probabilities for the first ten training rows,
# displayed together with their true labels.
(svm_clf.predict_proba(X_prepared[:10]), y[:10])

(array([[0.012529  , 0.03536406, 0.03718789, 0.01252636, 0.01377033,
         0.05465288, 0.01693507, 0.01249575, 0.0224037 , 0.03050757,
         0.01700603, 0.0489194 , 0.04009707, 0.04614098, 0.0608947 ,
         0.01408343, 0.04271541, 0.01677318, 0.02947705, 0.02140976,
         0.02247999, 0.05975648, 0.02364472, 0.08089807, 0.1151976 ,
         0.0805967 , 0.03153682],
        [0.012529  , 0.03536406, 0.03718789, 0.01252636, 0.01377033,
         0.05465288, 0.01693507, 0.01249575, 0.0224037 , 0.03050757,
         0.01700603, 0.0489194 , 0.04009707, 0.04614098, 0.0608947 ,
         0.01408343, 0.04271541, 0.01677318, 0.02947705, 0.02140976,
         0.02247999, 0.05975648, 0.02364472, 0.08089807, 0.1151976 ,
         0.0805967 , 0.03153682],
        [0.01272645, 0.03561523, 0.0374248 , 0.01272354, 0.01393663,
         0.05509596, 0.0171764 , 0.01268763, 0.02261069, 0.03082308,
         0.01724423, 0.04807805, 0.04039398, 0.04641535, 0.0575024 ,
         0.01422804, 0.04256092, 0.

In [13]:
# Labels the classifier was fitted on; the output lists 27 categories, one per
# probability column returned by predict_proba.
svm_clf.classes_

array(['ARSON', 'ASSAULT', 'BURGLARY', 'DISORDERLY CONDUCT',
       'DRIVING UNDER THE INFLUENCE', 'DRUG/NARCOTIC', 'DRUNKENNESS',
       'FAMILY OFFENSES', 'FORGERY/COUNTERFEITING', 'FRAUD', 'KIDNAPPING',
       'LARCENY/THEFT', 'MISSING PERSON', 'NON-CRIMINAL',
       'OTHER OFFENSES', 'PROSTITUTION', 'ROBBERY', 'RUNAWAY',
       'SECONDARY CODES', 'SEX OFFENSES FORCIBLE', 'STOLEN PROPERTY',
       'SUSPICIOUS OCC', 'TRESPASS', 'VANDALISM', 'VEHICLE THEFT',
       'WARRANTS', 'WEAPON LAWS'], dtype=object)

In [14]:
from sklearn import set_config

# Switch scikit-learn's estimator display to diagram mode, then show the
# preprocessing transformer.
set_config(display='diagram')
pipeline

In [15]:
# Read the unlabelled test set from the notebook's working directory.
X_test = pd.read_csv(filepath_or_buffer='./test.csv')

In [17]:
# Apply the already-fitted preprocessing to the same feature columns that were
# used for training (transform only -- nothing is re-fitted on the test data).
X_test_prepared = pipeline.transform(
    X_test.loc[:, ["Dates", "DayOfWeek", "PdDistrict", "X", "Y"]]
)

In [18]:
# Spot-check the predicted class probabilities for the first ten test rows.
svm_clf.predict_proba(X_test_prepared[:10])

array([[0.00257175, 0.08070218, 0.05117942, 0.00256987, 0.00252001,
        0.03209895, 0.0042135 , 0.00247404, 0.00736427, 0.01592617,
        0.00513752, 0.16861419, 0.03558004, 0.1225665 , 0.14221278,
        0.0032569 , 0.03280025, 0.00318769, 0.01470964, 0.00718406,
        0.00570467, 0.03838352, 0.00940063, 0.04913824, 0.09399329,
        0.04550031, 0.0210096 ],
       [0.00277487, 0.0814455 , 0.05042376, 0.00277279, 0.00271542,
        0.03303957, 0.00446551, 0.00266776, 0.00773175, 0.01654388,
        0.00540845, 0.15217047, 0.0360759 , 0.123716  , 0.14339595,
        0.00347866, 0.03365165, 0.00339528, 0.01501219, 0.00776429,
        0.00624793, 0.03955229, 0.00986501, 0.0494703 , 0.09690693,
        0.04588512, 0.02342275],
       [0.00194326, 0.07741314, 0.04755256, 0.00194395, 0.00191198,
        0.02905365, 0.00337094, 0.00188753, 0.00576467, 0.01410724,
        0.00412609, 0.30100405, 0.02977834, 0.11690247, 0.09746707,
        0.00251176, 0.03094417, 0.00239199, 0.0148

In [19]:
# Summary statistics of the test set; the output covers Id, X and Y.
# NOTE(review): the same extreme maxima as in the training data appear here
# (X = -120.5, Y = 90.0) -- presumably placeholder coordinates; confirm.
X_test.describe()

Unnamed: 0,Id,X,Y
count,884262.0,884262.0,884262.0
mean,442130.5,-122.422693,37.771476
std,255264.596205,0.030985,0.484824
min,0.0,-122.513642,37.707879
25%,221065.25,-122.433069,37.752374
50%,442130.5,-122.416517,37.775421
75%,663195.75,-122.406959,37.784353
max,884261.0,-120.5,90.0


In [20]:
# Class probabilities for every test row; columns follow svm_clf.classes_.
predict = svm_clf.predict_proba(X_test_prepared)

In [21]:
# Build the submission frame: one row per test Id, one probability column per
# class known to the classifier.
# The ids are kept in `test_ids` rather than `id` so the builtin id() is not
# shadowed for the rest of the session, and the index is passed by keyword so
# it cannot be confused with the `columns` argument.
test_ids = np.array(X_test["Id"]).astype(int)
df = pd.DataFrame(predict, index=test_ids, columns=svm_clf.classes_)

In [22]:
# Persist the predictions; the index is written as the leading "Id" column.
df.to_csv(
    path_or_buf="./predict.csv",
    index_label=["Id"],
)

In [23]:
from sklearn.svm import SVC

# Refit the SVC on the complete training set (this rebinds svm_clf).
# NOTE(review): SVC fit time grows at least quadratically with the number of
# samples, and the training frame has 878049 rows -- expect this cell to run
# for a very long time; consider a subsample or a linear model instead.
# random_state pins the internal shuffling used for the probability estimates
# so the fitted probabilities are reproducible.
svm_clf = SVC(probability=True, random_state=42)
svm_clf.fit(X_prepared, y)

In [None]:
from pandas.plotting import scatter_matrix

# Pairwise scatter plots of the feature columns.
# NOTE(review): Dates, DayOfWeek and PdDistrict hold strings in the displayed
# frame, so presumably only X and Y can actually be plotted -- confirm, and
# consider plotting a sample since the frame has several hundred thousand rows.
attributes = ['Dates', 'DayOfWeek', 'PdDistrict', 'X', 'Y']
scatter_matrix(X_train[attributes], figsize=(12, 8))

In [19]:
from sklearn.linear_model import SGDClassifier

# Logistic-regression baseline trained with SGD on the first 1000 rows
# (taken in file order, not sampled).
# The logistic loss is spelled "log_loss": the older alias "log" was deprecated
# in scikit-learn 1.1 and removed in 1.3, where it raises at fit time.
sgd_clf = SGDClassifier(random_state=42, loss="log_loss")
sgd_clf.fit(X_prepared[:1000], y[:1000])

In [20]:
# Sanity check: SGD class probabilities for the first ten training rows next to
# their true labels. In the recorded output the probabilities are saturated
# (a single 1.0, the rest ~0).
(sgd_clf.predict_proba(X_prepared[:10]), y[:10])

(array([[2.56009342e-133, 0.00000000e+000, 0.00000000e+000,
         4.28565239e-094, 2.09600299e-050, 0.00000000e+000,
         0.00000000e+000, 2.37160538e-114, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 1.00000000e+000,
         0.00000000e+000, 4.47834072e-103, 0.00000000e+000,
         1.39255463e-091, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000],
        [2.56009342e-133, 0.00000000e+000, 0.00000000e+000,
         4.28565239e-094, 2.09600299e-050, 0.00000000e+000,
         0.00000000e+000, 2.37160538e-114, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 1.00000000e+000,
         0.00000000e+000, 4.47834072e-103, 0.00000000e+000,
         1.39255463e-091, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+0

In [21]:
from sklearn.linear_model import SGDClassifier

# Refit the SGD logistic-regression model on the complete training set
# (this rebinds sgd_clf).
# The logistic loss is spelled "log_loss": the older alias "log" was deprecated
# in scikit-learn 1.1 and removed in 1.3, where it raises at fit time.
sgd_clf = SGDClassifier(random_state=42, loss="log_loss")
sgd_clf.fit(X_prepared, y)

In [22]:
# Same sanity check after the full-data fit: probabilities for the first ten
# training rows next to their true labels.
(sgd_clf.predict_proba(X_prepared[:10]), y[:10])

(array([[1.04369113e-04, 1.22722297e-01, 1.78668186e-05, 5.89884268e-05,
         4.14466648e-02, 2.27084612e-02, 6.84987053e-05, 5.13064404e-02,
         5.69610189e-04, 3.39249416e-05, 5.31094101e-06, 2.34789409e-05,
         7.06229293e-04, 1.14422692e-02, 3.20438335e-06, 2.10011612e-04,
         3.67979886e-02, 6.69597447e-05, 8.13592441e-05, 4.37614351e-03,
         1.00408062e-01, 1.84604586e-01, 3.50204825e-08, 2.47943709e-03,
         3.07813240e-04, 4.52562449e-02, 1.34569539e-04, 1.66437204e-03,
         7.10170412e-04, 1.33307639e-06, 3.67522825e-04, 1.07614598e-04,
         4.29549522e-02, 6.50023776e-09, 9.54253062e-04, 3.38660206e-02,
         2.23498945e-01, 6.91688877e-02, 7.65096910e-04],
        [1.04369113e-04, 1.22722297e-01, 1.78668186e-05, 5.89884268e-05,
         4.14466648e-02, 2.27084612e-02, 6.84987053e-05, 5.13064404e-02,
         5.69610189e-04, 3.39249416e-05, 5.31094101e-06, 2.34789409e-05,
         7.06229293e-04, 1.14422692e-02, 3.20438335e-06, 2.1001161

In [23]:
# Labels seen by the full-data fit; these become the submission column names.
sgd_clf.classes_

array(['ARSON', 'ASSAULT', 'BAD CHECKS', 'BRIBERY', 'BURGLARY',
       'DISORDERLY CONDUCT', 'DRIVING UNDER THE INFLUENCE',
       'DRUG/NARCOTIC', 'DRUNKENNESS', 'EMBEZZLEMENT', 'EXTORTION',
       'FAMILY OFFENSES', 'FORGERY/COUNTERFEITING', 'FRAUD', 'GAMBLING',
       'KIDNAPPING', 'LARCENY/THEFT', 'LIQUOR LAWS', 'LOITERING',
       'MISSING PERSON', 'NON-CRIMINAL', 'OTHER OFFENSES',
       'PORNOGRAPHY/OBSCENE MAT', 'PROSTITUTION', 'RECOVERED VEHICLE',
       'ROBBERY', 'RUNAWAY', 'SECONDARY CODES', 'SEX OFFENSES FORCIBLE',
       'SEX OFFENSES NON FORCIBLE', 'STOLEN PROPERTY', 'SUICIDE',
       'SUSPICIOUS OCC', 'TREA', 'TRESPASS', 'VANDALISM', 'VEHICLE THEFT',
       'WARRANTS', 'WEAPON LAWS'], dtype='<U27')

In [24]:
# Number of distinct categories the model was fitted on (39 in the recorded output).
len(sgd_clf.classes_)

39

In [26]:
# Read the unlabelled test set from the notebook's working directory.
X_test = pd.read_csv(filepath_or_buffer='./test.csv')

In [27]:
# Apply the already-fitted preprocessing to the same feature columns that were
# used for training (transform only -- nothing is re-fitted on the test data).
X_test_prepared = pipeline.transform(
    X_test.loc[:, ["Dates", "DayOfWeek", "PdDistrict", "X", "Y"]]
)

In [31]:
# Class probabilities for every test row; columns follow sgd_clf.classes_.
predict = sgd_clf.predict_proba(X_test_prepared)

In [32]:
# Build the submission frame: one row per test Id, one probability column per
# class known to the classifier.
# The ids are kept in `test_ids` rather than `id` so the builtin id() is not
# shadowed for the rest of the session, and the index is passed by keyword so
# it cannot be confused with the `columns` argument.
test_ids = np.array(X_test["Id"]).astype(int)
df = pd.DataFrame(predict, index=test_ids, columns=sgd_clf.classes_)

In [None]:
# Persist the SGD predictions; the index is written as the leading "Id" column.
df.to_csv(
    path_or_buf="./predict-3.csv",
    index_label=["Id"],
)