**<San Francisco Crime Classification - Predict the category of crimes that occurred in the city by the bay>**

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _subdirs, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Load the training set straight from its zip archive.
train = pd.read_csv(
    '/kaggle/input/sf-crime/train.csv.zip',
    compression='zip',  # same codec pandas infers from the ".zip" suffix
)

In [3]:
# Load the test set straight from its zip archive.
test = pd.read_csv(
    '/kaggle/input/sf-crime/test.csv.zip',
    compression='zip',  # same codec pandas infers from the ".zip" suffix
)

In [4]:
# Render the raw train and test frames for a first look at their columns.
display(train,test)

In [5]:
# Stack train and test so every feature transformation below is applied to both
# at once. ignore_index=True gives each row a unique label; without it the test
# rows would reuse the train rows' labels (0, 1, 2, ...), which makes any
# label-based lookup or index-aligned assignment on this frame ambiguous.
# Row order is unchanged, so the frame can still be split back by position.
alldata = pd.concat([train, test], ignore_index=True)
alldata

In [6]:
# Parse the timestamp column once, then derive calendar features from it.
alldata['Dates'] = pd.to_datetime(alldata['Dates'])
alldata['year'] = alldata['Dates'].dt.year
alldata['month'] = alldata['Dates'].dt.month
alldata['day'] = alldata['Dates'].dt.day
# `Series.dt.week` was deprecated and then removed in pandas 2.0;
# `dt.isocalendar().week` is its replacement. It returns a nullable UInt32
# column, so cast to int64 to keep the dtype the old accessor produced.
# NOTE(review): the cast assumes 'Dates' has no missing values - confirm.
alldata['week'] = alldata['Dates'].dt.isocalendar().week.astype('int64')
alldata['hour'] = alldata['Dates'].dt.hour
alldata['minute'] = alldata['Dates'].dt.minute
# Whole days elapsed since the earliest calendar date in the data. Truncating
# to midnight and subtracting stays vectorized, instead of building Python
# `date` objects and calling a lambda once per row.
crime_day = alldata['Dates'].dt.normalize()
alldata['time'] = (crime_day - crime_day.min()).dt.days
alldata

In [7]:
# Addresses have far too many distinct values. Label encoding maps strings that
# differ by even one word to unrelated integers, so it cannot express that two
# addresses are near each other.
# Instead, extract every word via text mining and weight it with TF-IDF.

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
text = tfidf.fit_transform(alldata['Address'])
text

In [12]:
# Inspect the TF-IDF entries stored for the first address.
print(text[0])

In [13]:
# The TF-IDF matrix has too many columns, so dimensionality reduction is needed
# (PCA, SVD, etc.). Some information is lost, but all ~2000 columns cannot be
# added as features; compress them to a manageable number instead.

from sklearn.decomposition import TruncatedSVD
# n_components=10: reduce the matrix to 10 columns.
# random_state pins the randomized solver so the derived features are the same
# on every Restart & Run All; without it each run yields different components.
svd = TruncatedSVD(n_components=10, random_state=42)
text_svd = svd.fit_transform(text)
text_svd

In [14]:
# Check the (rows, components) shape of the reduced address matrix.
text_svd.shape

In [15]:
# Append each SVD component as its own feature column (text_svd0, text_svd1, ...).
# The count comes from the array itself rather than a hard-coded 10, so this
# cell stays correct if n_components is changed where the SVD is fitted.
for i in range(text_svd.shape[1]):
    alldata['text_svd'+str(i)] = text_svd[:,i]
alldata

In [16]:
# Build the model feature frame by removing the columns that are not used as
# features: 'Dates' was expanded into numeric parts earlier and 'Category' is
# the prediction target.
# NOTE(review): 'Descript', 'Resolution' and 'Id' are presumably not present in
# both train and test - confirm against the raw files.
alldata2 = alldata.drop(
    columns=[
        'Dates',
        'Category',
        'Descript',
        'Resolution',
        'Id',
    ]
)
alldata2

In [17]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# Names of the columns still held as object dtype.
c = alldata2.select_dtypes(include=object).columns

# Replace each object column with integer codes. The single encoder is refit
# for every column, so afterwards `le` only remembers the last column's classes.
for i in c:
    encoded = le.fit_transform(alldata2[i])
    alldata2[i] = encoded

alldata2

In [18]:
# Split the combined feature frame back into its train and test parts by
# position: the first len(train) rows are train, the remainder is test.
train2 = alldata2.iloc[:len(train)]
test2 = alldata2.iloc[len(train):]

In [19]:
# Hold out 20% of the training data (80/20 split) so results can be checked
# locally without making a submission.

from sklearn.model_selection import train_test_split
x_train, x_valid, y_train, y_valid = train_test_split(
    train2,
    train['Category'],
    test_size=0.2,
    random_state=42,
    stratify=train['Category'],  # keep each category's share equal in both parts
)

In [20]:
# Count rows per crime category. value_counts() skips missing values by default,
# so rows without a Category (presumably the test rows) are not counted.
alldata['Category'].value_counts()

In [None]:
from catboost import CatBoostClassifier  # used when there are many categories

# iterations is set high on purpose; the early-stopping option passed to fit()
# below ends training sooner.
cbc = CatBoostClassifier(
    task_type='GPU',
    verbose=30,
    learning_rate=0.1,
    iterations=10000,
)
# eval_set: watch the validation score live while training.
# early_stopping_rounds=30: stop once the score has failed to improve 30 times.
cbc.fit(
    x_train,
    y_train,
    eval_set=(x_valid, y_valid),
    early_stopping_rounds=30,
)
# Per-class probabilities for the test rows.
result = cbc.predict_proba(test2)

In [None]:
# Load the sample submission; it is used as the template for the output file.
sub = pd.read_csv(
    '/kaggle/input/sf-crime/sampleSubmission.csv.zip',
    compression='zip',  # same codec pandas infers from the ".zip" suffix
)
sub

In [None]:
# Overwrite every column after the first with the predicted probabilities.
# Assigning through the column labels replaces the columns outright, so the
# float predictions are not forced into the template's existing column dtypes.
# An in-place `.iloc[:, 1:] = ...` write of floats into integer columns is
# deprecated in pandas 2.1+ and slated to raise.
# NOTE(review): assumes the columns of `result` are in the same order as the
# template's class columns - confirm against `cbc.classes_`.
proba_cols = sub.columns[1:]
sub[proba_cols] = result
sub

In [None]:
# Write the submission file without the DataFrame's row index.
sub.to_csv('sub.csv', index=False)