In [1]:
import pandas as pd
import numpy as np

# Read the training and test tables (in that order) from CSV files that are
# expected to sit in the current working directory.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv')
)

In [2]:
# Display the training frame as loaded from train.csv (the bare last
# expression of a cell is rendered as its output).
train_data

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541
...,...,...,...,...,...,...,...,...,...
878044,2003-01-06 00:15:00,ROBBERY,ROBBERY ON THE STREET WITH A GUN,Monday,TARAVAL,NONE,FARALLONES ST / CAPITOL AV,-122.459033,37.714056
878045,2003-01-06 00:01:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Monday,INGLESIDE,NONE,600 Block of EDNA ST,-122.447364,37.731948
878046,2003-01-06 00:01:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Monday,SOUTHERN,NONE,5TH ST / FOLSOM ST,-122.403390,37.780266
878047,2003-01-06 00:01:00,VANDALISM,"MALICIOUS MISCHIEF, VANDALISM OF VEHICLES",Monday,SOUTHERN,NONE,TOWNSEND ST / 2ND ST,-122.390531,37.780607


In [3]:
# Display the test frame as loaded from test.csv (the bare last expression
# of a cell is rendered as its output).
test_data

Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y
0,0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051
1,1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432
2,2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426002,37.792212
3,3,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412
4,4,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412
...,...,...,...,...,...,...,...
884257,884257,2003-01-01 00:01:00,Wednesday,MISSION,2600 Block of BRYANT ST,-122.408983,37.751987
884258,884258,2003-01-01 00:01:00,Wednesday,NORTHERN,1900 Block of WASHINGTON ST,-122.425342,37.792681
884259,884259,2003-01-01 00:01:00,Wednesday,INGLESIDE,5500 Block of MISSION ST,-122.445418,37.712075
884260,884260,2003-01-01 00:01:00,Wednesday,BAYVIEW,1500 Block of HUDSON AV,-122.387394,37.739479


In [4]:
# Count the training rows per crime category, most frequent first, to see
# how the target classes are distributed.
train_data['Category'].value_counts()

LARCENY/THEFT                  174900
OTHER OFFENSES                 126182
NON-CRIMINAL                    92304
ASSAULT                         76876
DRUG/NARCOTIC                   53971
VEHICLE THEFT                   53781
VANDALISM                       44725
WARRANTS                        42214
BURGLARY                        36755
SUSPICIOUS OCC                  31414
MISSING PERSON                  25989
ROBBERY                         23000
FRAUD                           16679
FORGERY/COUNTERFEITING          10609
SECONDARY CODES                  9985
WEAPON LAWS                      8555
PROSTITUTION                     7484
TRESPASS                         7326
STOLEN PROPERTY                  4540
SEX OFFENSES FORCIBLE            4388
DISORDERLY CONDUCT               4320
DRUNKENNESS                      4280
RECOVERED VEHICLE                3138
KIDNAPPING                       2341
DRIVING UNDER THE INFLUENCE      2268
RUNAWAY                          1946
LIQUOR LAWS 

In [5]:
# Count the training rows per resolution value, most frequent first.
train_data['Resolution'].value_counts()

NONE                                      526790
ARREST, BOOKED                            206403
ARREST, CITED                              77004
LOCATED                                    17101
PSYCHOPATHIC CASE                          14534
UNFOUNDED                                   9585
JUVENILE BOOKED                             5564
COMPLAINANT REFUSES TO PROSECUTE            3976
DISTRICT ATTORNEY REFUSES TO PROSECUTE      3934
NOT PROSECUTED                              3714
JUVENILE CITED                              3332
PROSECUTED BY OUTSIDE AGENCY                2504
EXCEPTIONAL CLEARANCE                       1530
JUVENILE ADMONISHED                         1455
JUVENILE DIVERTED                            355
CLEARED-CONTACT JUVENILE FOR MORE INFO       217
PROSECUTED FOR LESSER OFFENSE                 51
Name: Resolution, dtype: int64

In [6]:
# Build ONE weekday vocabulary from both frames so a given day always maps to
# the same integer code in train and test. Encoding each frame on its own
# derives the codes from whichever labels that frame happens to contain, which
# can silently shift the mapping if a label is missing from one of them.
# Sorting reproduces the ordering a per-frame `astype('category')` would use,
# so the codes are unchanged whenever both frames contain the same labels.
weekday_labels = sorted(
    set(train_data['DayOfWeek'].dropna()) | set(test_data['DayOfWeek'].dropna())
)
weekday_dtype = pd.CategoricalDtype(weekday_labels)

for df in [train_data, test_data]:
    # Parse the timestamp column once per frame and reuse it for every
    # calendar component instead of re-parsing it for each new column.
    timestamps = pd.to_datetime(df['Dates'])
    df['Year'] = timestamps.dt.year
    df['Month'] = timestamps.dt.month
    df['Day'] = timestamps.dt.day
    df['Hour'] = timestamps.dt.hour
    df['Minute'] = timestamps.dt.minute
    # Replace the weekday label with its integer code; missing values map to -1.
    df['DayOfWeek'] = df['DayOfWeek'].astype(weekday_dtype).cat.codes

In [7]:
# One-hot encode the police district using a category list shared by both
# frames. Calling `get_dummies` on each frame independently derives the dummy
# columns (and the level removed by `drop_first`) from the districts that
# frame happens to contain, so train and test could end up with different
# feature columns. With a shared categorical dtype every district gets a
# column in both frames and the same baseline level is dropped from each.
district_labels = sorted(
    set(train_data['PdDistrict'].dropna()) | set(test_data['PdDistrict'].dropna())
)
district_dtype = pd.CategoricalDtype(district_labels)

train_data = pd.get_dummies(
    train_data.astype({'PdDistrict': district_dtype}),
    columns=['PdDistrict'],
    drop_first=True,
)
test_data = pd.get_dummies(
    test_data.astype({'PdDistrict': district_dtype}),
    columns=['PdDistrict'],
    drop_first=True,
)

In [8]:
# Columns removed before modelling. `Dates` has already been expanded into
# calendar components; `Descript` and `Resolution` appear only in the training
# frame and `Id` only in the test frame.
train_drop_cols = ['Dates', 'Descript', 'Resolution', 'Address']
test_drop_cols = ['Dates', 'Address', 'Id']

train_data = train_data.drop(train_drop_cols, axis='columns')
test_data = test_data.drop(test_drop_cols, axis='columns')

In [9]:
from sklearn.preprocessing import LabelEncoder

# Turn the crime-category names into integer class ids. The fitted encoder is
# kept around so predicted ids can be mapped back to names afterwards.
# NOTE(review): this overwrites `train_data['Category']` in place, so running
# the cell a second time would fit the encoder on the integer ids rather than
# on the names — confirm the cell is only executed once per kernel session.
encoder = LabelEncoder()
category_ids = encoder.fit_transform(train_data['Category'])
train_data['Category'] = category_ids

# Separate the target from the features; the test features are a copy so that
# later changes to one frame do not leak into the other.
y_train = train_data['Category']
X_train = train_data.drop('Category', axis='columns')
X_test = test_data.copy()

In [10]:
from xgboost import XGBClassifier

# Hyper-parameters for the multi-class gradient-boosted model. The number of
# classes is taken from the fitted label encoder.
xgb_params = {
    'objective': 'multi:softmax',
    'num_class': len(encoder.classes_),
    'eval_metric': 'mlogloss',
}
model = XGBClassifier(**xgb_params)

# Fit on the full training set; being the last expression, the fitted
# estimator is also shown as the cell output.
model.fit(X_train, y_train)

In [11]:
# Predict an encoded class id for every row of the test feature matrix.
y_test_pred = model.predict(X_test)

# Map the encoded ids back to the original category names using the encoder
# that was fitted on the training labels.
predicted_categories = encoder.inverse_transform(y_test_pred)

In [12]:
# Attach the predicted category names to a fresh copy of the test features,
# leaving `test_data` itself untouched.
# NOTE(review): `Id` was dropped from `test_data` earlier in the notebook, so
# this frame carries the engineered features but no row identifier — confirm
# that is the intended layout for the exported file.
output = test_data.assign(Category=predicted_categories)

In [13]:
# Count the predicted category names over the test rows, most frequent first.
output['Category'].value_counts()

LARCENY/THEFT                  380658
OTHER OFFENSES                 241149
DRUG/NARCOTIC                   66302
NON-CRIMINAL                    56141
VEHICLE THEFT                   53627
ASSAULT                         37748
MISSING PERSON                  15105
PROSTITUTION                    10101
BURGLARY                         5712
VANDALISM                        4431
WARRANTS                         3739
FORGERY/COUNTERFEITING           3147
FRAUD                            1461
TRESPASS                          853
RUNAWAY                           834
DISORDERLY CONDUCT                813
ROBBERY                           696
LOITERING                         389
SUSPICIOUS OCC                    325
SECONDARY CODES                   243
SEX OFFENSES FORCIBLE             117
WEAPON LAWS                       111
DRUNKENNESS                       103
ARSON                              97
LIQUOR LAWS                        84
EMBEZZLEMENT                       73
RECOVERED VE

In [14]:
# Write the output frame to a CSV file in the working directory; the frame's
# index is not written as a column.
output.to_csv('predicted_categories.csv', index=False)