In [17]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the read-only Kaggle input directory and
# print its full path, one per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/sf-crime/train.csv.zip
/kaggle/input/sf-crime/sampleSubmission.csv.zip
/kaggle/input/sf-crime/test.csv.zip


# Description
From 1934 to 1963, San Francisco was infamous for housing some of the world's most notorious criminals on the inescapable island of Alcatraz.

Today, the city is known more for its tech scene than its criminal past. But, with rising wealth inequality, housing shortages, and a proliferation of expensive digital toys riding BART to work, there is no scarcity of crime in the city by the bay.

From Sunset to SOMA, and Marina to Excelsior, this competition's dataset provides nearly 12 years of crime reports from across all of San Francisco's neighborhoods. Given time and location, you must predict the category of crime that occurred.

Reference - Wendy Kan. (2015). San Francisco Crime Classification. Kaggle. https://kaggle.com/competitions/sf-crime


# Dataset Description
This dataset contains incidents derived from the SFPD Crime Incident Reporting system. The data ranges from 1/1/2003 to 5/13/2015. The training set and test set rotate every week: weeks 1, 3, 5, 7, … belong to the test set, and weeks 2, 4, 6, 8, … belong to the training set.

# Importing datasets and Data Pre-processing

In [18]:
# Show up to 99 columns when a DataFrame is rendered, so wide frames are not elided.
pd.set_option("display.max_columns", 99)

In [19]:
# Labelled training incidents, read straight from the zipped CSV.
TRAIN_CSV = "/kaggle/input/sf-crime/train.csv.zip"
train = pd.read_csv(TRAIN_CSV)
train

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541
...,...,...,...,...,...,...,...,...,...
878044,2003-01-06 00:15:00,ROBBERY,ROBBERY ON THE STREET WITH A GUN,Monday,TARAVAL,NONE,FARALLONES ST / CAPITOL AV,-122.459033,37.714056
878045,2003-01-06 00:01:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Monday,INGLESIDE,NONE,600 Block of EDNA ST,-122.447364,37.731948
878046,2003-01-06 00:01:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Monday,SOUTHERN,NONE,5TH ST / FOLSOM ST,-122.403390,37.780266
878047,2003-01-06 00:01:00,VANDALISM,"MALICIOUS MISCHIEF, VANDALISM OF VEHICLES",Monday,SOUTHERN,NONE,TOWNSEND ST / 2ND ST,-122.390531,37.780607


In [20]:
# Unlabelled incidents to predict, read straight from the zipped CSV.
TEST_CSV = "/kaggle/input/sf-crime/test.csv.zip"
test = pd.read_csv(TEST_CSV)
test

Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y
0,0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051
1,1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432
2,2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426002,37.792212
3,3,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412
4,4,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412
...,...,...,...,...,...,...,...
884257,884257,2003-01-01 00:01:00,Wednesday,MISSION,2600 Block of BRYANT ST,-122.408983,37.751987
884258,884258,2003-01-01 00:01:00,Wednesday,NORTHERN,1900 Block of WASHINGTON ST,-122.425342,37.792681
884259,884259,2003-01-01 00:01:00,Wednesday,INGLESIDE,5500 Block of MISSION ST,-122.445418,37.712075
884260,884260,2003-01-01 00:01:00,Wednesday,BAYVIEW,1500 Block of HUDSON AV,-122.387394,37.739479


In [21]:
# Stack train on top of test so both receive identical feature engineering,
# parse the timestamp column, and derive calendar/clock parts from it.
# The original row labels are kept (no index reset); later cells split the
# frame back apart by position.
all_data = (
    pd.concat([train, test])
    .assign(Dates=lambda frame: pd.to_datetime(frame["Dates"]))
    .assign(
        Year=lambda frame: frame["Dates"].dt.year,
        Months=lambda frame: frame["Dates"].dt.month,
        Hours=lambda frame: frame["Dates"].dt.hour,
        Minutes=lambda frame: frame["Dates"].dt.minute,
    )
)

#all_data.groupby("Category")["Hours"].mean()

In [22]:
# Remove everything that must not be a model input: the test-only row Id, the
# raw timestamp (already split into parts), the target Category, and the
# train-only Descript / Resolution columns.
NON_FEATURE_COLUMNS = ["Id", "Dates", "Category", "Descript", "Resolution"]
all_data2 = all_data.drop(columns=NON_FEATURE_COLUMNS)

# Names of the remaining object-dtype (string) columns, which still need encoding.
c_features = all_data2.select_dtypes(include=object).columns
c_features

Index(['DayOfWeek', 'PdDistrict', 'Address'], dtype='object')

In [23]:
from sklearn.preprocessing import LabelEncoder 
# Replace each string column with integer codes. fit_transform refits the
# encoder on every call, so one LabelEncoder instance is reused column by column.
le = LabelEncoder()
encoded_columns = {name: le.fit_transform(all_data2[name]) for name in c_features}
all_data2 = all_data2.assign(**encoded_columns)

In [24]:
# Undo the earlier concatenation by position: the first len(train) rows are the
# training features, everything after them belongs to the test set.
n_train_rows = len(train)
train_2 = all_data2.iloc[:n_train_rows]
test_2 = all_data2.iloc[n_train_rows:]

In [25]:
# Preview the encoded training feature matrix (first len(train) rows of all_data2).
train_2

Unnamed: 0,DayOfWeek,PdDistrict,Address,X,Y,Year,Months,Hours,Minutes
0,6,4,20895,-122.425892,37.774599,2015,5,23,53
1,6,4,20895,-122.425892,37.774599,2015,5,23,53
2,6,4,24169,-122.424363,37.800414,2015,5,23,33
3,6,4,4418,-122.426995,37.800873,2015,5,23,30
4,6,5,1923,-122.438738,37.771541,2015,5,23,30
...,...,...,...,...,...,...,...,...,...
878044,1,8,16536,-122.459033,37.714056,2003,1,0,15
878045,1,2,11840,-122.447364,37.731948,2003,1,0,1
878046,1,7,11662,-122.403390,37.780266,2003,1,0,1
878047,1,7,23738,-122.390531,37.780607,2003,1,0,1


In [26]:
# Preview the encoded test feature matrix (rows of all_data2 after the training rows).
test_2

Unnamed: 0,DayOfWeek,PdDistrict,Address,X,Y,Year,Months,Hours,Minutes
0,3,0,6626,-122.399588,37.735051,2015,5,23,59
1,3,0,10069,-122.391523,37.732432,2015,5,23,51
2,3,4,6553,-122.426002,37.792212,2015,5,23,50
3,3,2,10985,-122.437394,37.721412,2015,5,23,45
4,3,2,10985,-122.437394,37.721412,2015,5,23,45
...,...,...,...,...,...,...,...,...,...
884257,6,3,7919,-122.408983,37.751987,2003,1,0,1
884258,6,4,5596,-122.425342,37.792681,2003,1,0,1
884259,6,2,11612,-122.445418,37.712075,2003,1,0,1
884260,6,0,4393,-122.387394,37.739479,2003,1,0,1


# Modelling

In [27]:
# Split the labelled data: 20% is held out as the validation set and the
# remaining 80% is used for training. The seed is fixed so the split is repeatable.
from sklearn.model_selection import train_test_split

VALID_FRACTION = 0.2
SPLIT_SEED = 42

x_train, x_valid, y_train, y_valid = train_test_split(
    train_2,
    train["Category"],
    test_size=VALID_FRACTION,
    random_state=SPLIT_SEED,
)
x_train



Unnamed: 0,DayOfWeek,PdDistrict,Address,X,Y,Year,Months,Hours,Minutes
81381,6,3,4852,-122.419672,37.765050,2014,4,6,10
238545,3,3,8112,-122.408831,37.750390,2012,2,10,8
823641,2,8,5359,-122.475536,37.756153,2003,9,19,45
497355,5,9,17380,-122.415508,37.781654,2008,5,13,40
484193,1,9,21046,-122.412971,37.785788,2008,7,18,39
...,...,...,...,...,...,...,...,...,...
259178,1,9,1161,-122.405895,37.786734,2011,11,17,15
365838,0,7,8614,-122.396759,37.773173,2010,4,22,16
131932,2,3,6587,-122.426956,37.769247,2013,8,22,0
671155,0,0,7608,-122.386942,37.754168,2005,11,5,3


In [28]:
# Gradient-boosting model
from catboost import CatBoostClassifier

# Hand-tuned hyper-parameters: iteration budget and learning rate.
# verbose=100 prints one progress line every 100 iterations.
cbc_params = {
    "task_type": "GPU",
    "verbose": 100,
    "iterations": 2000,
    "learning_rate": 0.203863,
}
cbc = CatBoostClassifier(**cbc_params)

# Monitor the held-out split while training and stop early once it has gone
# 50 rounds without improving.
cbc.fit(
    x_train,
    y_train,
    eval_set=(x_valid, y_valid),
    early_stopping_rounds=50,
)

# Per-class probabilities for every test row.
result = cbc.predict_proba(test_2)
result


0:	learn: 3.0215065	test: 3.0218172	best: 3.0218172 (0)	total: 206ms	remaining: 6m 52s
100:	learn: 2.2986715	test: 2.3129360	best: 2.3129360 (100)	total: 14.7s	remaining: 4m 36s
200:	learn: 2.2532309	test: 2.2851806	best: 2.2851806 (200)	total: 29.1s	remaining: 4m 20s
300:	learn: 2.2226414	test: 2.2723651	best: 2.2723651 (300)	total: 42.7s	remaining: 4m 1s
400:	learn: 2.1977431	test: 2.2647205	best: 2.2647205 (400)	total: 57.1s	remaining: 3m 47s
500:	learn: 2.1756597	test: 2.2590168	best: 2.2590168 (500)	total: 1m 11s	remaining: 3m 34s
600:	learn: 2.1553249	test: 2.2549399	best: 2.2549237 (598)	total: 1m 25s	remaining: 3m 18s
700:	learn: 2.1370254	test: 2.2525824	best: 2.2525824 (700)	total: 1m 39s	remaining: 3m 5s
800:	learn: 2.1196535	test: 2.2507075	best: 2.2507075 (800)	total: 1m 53s	remaining: 2m 50s
900:	learn: 2.1026466	test: 2.2499149	best: 2.2498544 (873)	total: 2m 8s	remaining: 2m 37s
1000:	learn: 2.0868666	test: 2.2491264	best: 2.2491264 (1000)	total: 2m 22s	remaining: 2m 22

array([[6.69152455e-04, 5.19639282e-02, 8.41250688e-08, ...,
        3.99990239e-01, 9.70483795e-03, 8.58736278e-03],
       [2.55873190e-04, 6.33654938e-02, 1.69601718e-08, ...,
        9.85841282e-03, 5.27742097e-02, 2.68551321e-02],
       [5.34823879e-03, 1.96597137e-01, 3.05510172e-06, ...,
        3.14673885e-02, 2.67075625e-02, 5.34920875e-03],
       ...,
       [1.03518920e-03, 1.69083965e-01, 4.00921474e-04, ...,
        1.67937284e-01, 3.84375090e-02, 1.39468632e-03],
       [1.98455547e-04, 8.04402801e-02, 2.44264886e-03, ...,
        5.46414109e-02, 2.59907859e-02, 3.99205466e-03],
       [8.17923486e-05, 2.49068520e-02, 8.41380276e-04, ...,
        1.48267291e-01, 1.13242523e-02, 9.46838110e-04]])

In [29]:
# Sample submission: supplies the Id column and the expected class-column header.
SAMPLE_SUBMISSION_CSV = "/kaggle/input/sf-crime/sampleSubmission.csv.zip"
sub = pd.read_csv(SAMPLE_SUBMISSION_CSV)
sub

Unnamed: 0,Id,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,EMBEZZLEMENT,EXTORTION,FAMILY OFFENSES,FORGERY/COUNTERFEITING,FRAUD,GAMBLING,KIDNAPPING,LARCENY/THEFT,LIQUOR LAWS,LOITERING,MISSING PERSON,NON-CRIMINAL,OTHER OFFENSES,PORNOGRAPHY/OBSCENE MAT,PROSTITUTION,RECOVERED VEHICLE,ROBBERY,RUNAWAY,SECONDARY CODES,SEX OFFENSES FORCIBLE,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
884257,884257,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
884258,884258,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
884259,884259,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
884260,884260,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [30]:
# Fill the submission with the predicted probabilities, matching columns by
# class name instead of by position. The probability matrix is labelled with
# the model's own class labels (cbc.classes_), so the mapping onto the sample
# submission's header is explicit rather than relying on both happening to be
# in the same order.
class_columns = sub.columns[1:]  # every column after the leading "Id"
proba = pd.DataFrame(result, columns=cbc.classes_, index=sub.index)

# Fail loudly if the submission expects a category the model has no column for.
missing_classes = class_columns.difference(proba.columns)
if len(missing_classes) > 0:
    raise ValueError(
        f"No predicted probabilities for categories: {list(missing_classes)}"
    )

# Replace the integer placeholder columns wholesale with the float
# probabilities (instead of writing floats into them in place through .iloc).
sub[class_columns] = proba[class_columns]
sub

Unnamed: 0,Id,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,EMBEZZLEMENT,EXTORTION,FAMILY OFFENSES,FORGERY/COUNTERFEITING,FRAUD,GAMBLING,KIDNAPPING,LARCENY/THEFT,LIQUOR LAWS,LOITERING,MISSING PERSON,NON-CRIMINAL,OTHER OFFENSES,PORNOGRAPHY/OBSCENE MAT,PROSTITUTION,RECOVERED VEHICLE,ROBBERY,RUNAWAY,SECONDARY CODES,SEX OFFENSES FORCIBLE,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS
0,0,0.000669,0.051964,8.412507e-08,1.050890e-04,0.081648,0.001171,0.002606,0.014938,0.000784,0.002209,0.000007,0.000052,0.000255,0.003723,5.023356e-06,0.000446,0.080793,0.000024,0.000014,0.030569,0.079164,0.132011,1.982307e-07,0.000004,2.077846e-05,0.012660,0.000011,0.004608,0.000150,2.460533e-07,0.001618,0.000217,0.045669,9.456130e-07,0.004923,0.028679,0.399990,0.009705,0.008587
1,1,0.000256,0.063365,1.696017e-08,1.001086e-03,0.006552,0.001483,0.007013,0.038760,0.003089,0.000018,0.000004,0.000009,0.000629,0.001381,1.993045e-05,0.000251,0.007384,0.000115,0.000118,0.002173,0.025823,0.645090,1.148490e-07,0.000001,3.154642e-06,0.053560,0.000001,0.004105,0.000193,5.762112e-08,0.004989,0.000012,0.032561,1.029371e-06,0.002612,0.007938,0.009858,0.052774,0.026855
2,2,0.005348,0.196597,3.055102e-06,2.816922e-04,0.056505,0.000367,0.000413,0.018765,0.012374,0.000018,0.000007,0.000037,0.002855,0.009068,8.745903e-07,0.012249,0.265296,0.000033,0.000061,0.007798,0.062559,0.051550,2.099880e-07,0.007657,2.362093e-06,0.014552,0.000034,0.010033,0.019568,2.635853e-06,0.007253,0.000039,0.057206,1.776334e-07,0.013394,0.104550,0.031467,0.026708,0.005349
3,3,0.000419,0.222151,2.018363e-07,1.104827e-03,0.021123,0.001642,0.001018,0.020354,0.015125,0.000009,0.000078,0.001343,0.001272,0.018062,2.588844e-04,0.002715,0.115081,0.001257,0.000057,0.014708,0.054910,0.122333,9.306824e-08,0.000013,7.756531e-06,0.058577,0.000072,0.006376,0.002660,2.372182e-06,0.004329,0.000003,0.071142,7.739724e-07,0.001774,0.040736,0.152451,0.028499,0.018336
4,4,0.000419,0.222151,2.018363e-07,1.104827e-03,0.021123,0.001642,0.001018,0.020354,0.015125,0.000009,0.000078,0.001343,0.001272,0.018062,2.588844e-04,0.002715,0.115081,0.001257,0.000057,0.014708,0.054910,0.122333,9.306824e-08,0.000013,7.756531e-06,0.058577,0.000072,0.006376,0.002660,2.372182e-06,0.004329,0.000003,0.071142,7.739724e-07,0.001774,0.040736,0.152451,0.028499,0.018336
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
884257,884257,0.000398,0.074348,5.905100e-03,9.080895e-05,0.032361,0.001657,0.009058,0.024783,0.001207,0.004181,0.000011,0.000357,0.212750,0.032175,4.509593e-06,0.000366,0.085349,0.000646,0.000157,0.007141,0.074956,0.107162,3.459790e-08,0.000443,3.075494e-07,0.022216,0.000935,0.006663,0.038835,3.793782e-04,0.003457,0.000260,0.068104,6.942547e-08,0.006035,0.066123,0.094528,0.014995,0.001960
884258,884258,0.000066,0.038665,3.730233e-03,2.086366e-07,0.092838,0.013426,0.000458,0.005639,0.000657,0.013365,0.000033,0.000505,0.078912,0.076667,2.225867e-05,0.000618,0.138283,0.000062,0.000846,0.018550,0.072408,0.085146,2.448166e-07,0.005692,1.416425e-07,0.003154,0.000092,0.008252,0.013653,1.210652e-05,0.018038,0.000019,0.103408,6.237108e-08,0.009609,0.088027,0.098393,0.010556,0.000201
884259,884259,0.001035,0.169084,4.009215e-04,3.886957e-04,0.029781,0.002995,0.000056,0.022669,0.000709,0.012106,0.000710,0.000671,0.104928,0.057565,5.715790e-06,0.000462,0.088197,0.000086,0.000012,0.009133,0.032921,0.120017,3.450350e-08,0.002915,8.558950e-07,0.007980,0.000272,0.003082,0.047496,8.476437e-03,0.001398,0.001678,0.021699,3.327100e-07,0.002408,0.040891,0.167937,0.038438,0.001395
884260,884260,0.000198,0.080440,2.442649e-03,3.463426e-05,0.034232,0.049812,0.000024,0.007368,0.000415,0.042078,0.000013,0.001297,0.364432,0.011265,8.975639e-06,0.000220,0.059095,0.000054,0.003245,0.004068,0.042011,0.106663,6.538766e-08,0.000496,2.661933e-06,0.001035,0.000094,0.004370,0.007062,3.295483e-03,0.000933,0.000171,0.041870,2.005452e-07,0.001599,0.045030,0.054641,0.025991,0.003992


In [32]:
# Write the submission file without the DataFrame's row index, so the CSV
# contains only the Id column followed by the class-probability columns.
sub.to_csv("sub12345.csv", index=False)

Private Score: 2.27518
Public Score: 2.27518
Approximately top 7% of all submissions