# 데이터 전처리 - 피처 엔지니어링 - 모델훈련 - 결과 예측

In [43]:
# Dataset location (shared Google Drive folder)
# https://drive.google.com/drive/folders/1bnxQ55Y0IeuS0wHXyPd_r7Q6-8R5qBUN?usp=sharing

import pandas as pd
# NOTE: despite its name, `url` holds a local filesystem path, not a web URL.
url = 'C:/python_src2/data/San-Francisco-Crime.csv'
df = pd.read_csv(url)
# Show (rows, columns) of the loaded frame.
df.shape

(878049, 9)

In [44]:
# Summary statistics (count / unique / top / freq) for the object-dtype columns only.
df.describe(include='object')

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address
count,878049,878049,878049,878049,878049,878049,878049
unique,389257,39,879,7,10,17,23228
top,2011-01-01 00:01:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Friday,SOUTHERN,NONE,800 Block of BRYANT ST
freq,185,174900,60022,133734,157182,526790,26533


In [45]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878049 entries, 0 to 878048
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Dates       878049 non-null  object 
 1   Category    878049 non-null  object 
 2   Descript    878049 non-null  object 
 3   DayOfWeek   878049 non-null  object 
 4   PdDistrict  878049 non-null  object 
 5   Resolution  878049 non-null  object 
 6   Address     878049 non-null  object 
 7   X           878049 non-null  float64
 8   Y           878049 non-null  float64
dtypes: float64(2), object(7)
memory usage: 60.3+ MB


In [46]:
# Remove the columns that are not needed for the rest of the pipeline.
train = df.drop(['Descript', 'Resolution', 'Address'], axis=1)
train.head()

Unnamed: 0,Dates,Category,DayOfWeek,PdDistrict,X,Y
0,2015-05-13 23:53:00,WARRANTS,Wednesday,NORTHERN,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,Wednesday,NORTHERN,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,Wednesday,NORTHERN,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,Wednesday,NORTHERN,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,Wednesday,PARK,-122.438738,37.771541


In [47]:
# Convert categorical data.
# Encode the target column `Category` as integer class labels.
from sklearn.preprocessing import LabelEncoder

# Keep the fitted encoder around: its `classes_` / `inverse_transform` are the
# only way to map the integer labels back to the original category names.
label_encoder = LabelEncoder()
train['Category'] = label_encoder.fit_transform(train['Category'])
train.head()

Unnamed: 0,Dates,Category,DayOfWeek,PdDistrict,X,Y
0,2015-05-13 23:53:00,37,Wednesday,NORTHERN,-122.425892,37.774599
1,2015-05-13 23:53:00,21,Wednesday,NORTHERN,-122.425892,37.774599
2,2015-05-13 23:33:00,21,Wednesday,NORTHERN,-122.424363,37.800414
3,2015-05-13 23:30:00,16,Wednesday,NORTHERN,-122.426995,37.800873
4,2015-05-13 23:30:00,16,Wednesday,PARK,-122.438738,37.771541


In [48]:
# One-hot encode the police district column (PdDistrict) as 0/1 integers.
# NOTE: the 'Distirict' prefix spelling is kept as-is because it determines
# the names of the generated columns.
district_dummies = pd.get_dummies(train['PdDistrict'], prefix='Distirict').astype(int)
# Swap PdDistrict for its indicator columns, which are appended on the right.
train = pd.concat([train.drop(columns=['PdDistrict']), district_dummies], axis=1)
train.head()

Unnamed: 0,Dates,Category,DayOfWeek,X,Y,Distirict_BAYVIEW,Distirict_CENTRAL,Distirict_INGLESIDE,Distirict_MISSION,Distirict_NORTHERN,Distirict_PARK,Distirict_RICHMOND,Distirict_SOUTHERN,Distirict_TARAVAL,Distirict_TENDERLOIN
0,2015-05-13 23:53:00,37,Wednesday,-122.425892,37.774599,0,0,0,0,1,0,0,0,0,0
1,2015-05-13 23:53:00,21,Wednesday,-122.425892,37.774599,0,0,0,0,1,0,0,0,0,0
2,2015-05-13 23:33:00,21,Wednesday,-122.424363,37.800414,0,0,0,0,1,0,0,0,0,0
3,2015-05-13 23:30:00,16,Wednesday,-122.426995,37.800873,0,0,0,0,1,0,0,0,0,0
4,2015-05-13 23:30:00,16,Wednesday,-122.438738,37.771541,0,0,0,0,0,1,0,0,0,0


In [49]:
# Parse the timestamp strings once and reuse the result for every derived
# feature; parsing is the expensive step on a column of this size.
parsed_dates = pd.to_datetime(train['Dates'])
train['Year'] = parsed_dates.dt.year
train['Month'] = parsed_dates.dt.month
train['Day'] = parsed_dates.dt.day
train['Hour'] = parsed_dates.dt.hour
train['Minute'] = parsed_dates.dt.minute

# The raw timestamp column has been replaced by the numeric parts above.
train = train.drop(columns=['Dates'])

# One-hot encode the day of the week as 0/1 integer columns.
dayofweek_onehot = pd.get_dummies(train['DayOfWeek'], prefix='Day').astype(int)
train = pd.concat([train, dayofweek_onehot], axis=1)
train = train.drop(columns=['DayOfWeek'])
train.head()

Unnamed: 0,Category,X,Y,Distirict_BAYVIEW,Distirict_CENTRAL,Distirict_INGLESIDE,Distirict_MISSION,Distirict_NORTHERN,Distirict_PARK,Distirict_RICHMOND,...,Day,Hour,Minute,Day_Friday,Day_Monday,Day_Saturday,Day_Sunday,Day_Thursday,Day_Tuesday,Day_Wednesday
0,37,-122.425892,37.774599,0,0,0,0,1,0,0,...,13,23,53,0,0,0,0,0,0,1
1,21,-122.425892,37.774599,0,0,0,0,1,0,0,...,13,23,53,0,0,0,0,0,0,1
2,21,-122.424363,37.800414,0,0,0,0,1,0,0,...,13,23,33,0,0,0,0,0,0,1
3,16,-122.426995,37.800873,0,0,0,0,1,0,0,...,13,23,30,0,0,0,0,0,0,1
4,16,-122.438738,37.771541,0,0,0,0,0,1,0,...,13,23,30,0,0,0,0,0,0,1


In [50]:
from sklearn.model_selection import train_test_split

# Separate the target column from the feature matrix.
y = train['Category']
X = train.drop(columns=['Category']).to_numpy()

# Hold out 20% of the rows, stratified on the target so both splits keep the
# same class proportions.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
x_train.shape, y_train.shape

((702439, 24), (702439,))

In [51]:
# 모델 생성 및 학습

In [53]:
from sklearn.tree import DecisionTreeClassifier

# Baseline: a tree with no depth or leaf-size limits, which is very likely to
# overfit. random_state is fixed (same seed as the train/test split) so that
# repeated runs build the same tree.
model = DecisionTreeClassifier(random_state=42)
# Train the model.
model.fit(x_train, y_train)
# Accuracy on the training split vs. the held-out split.
model.score(x_train, y_train), model.score(x_test, y_test)

(0.8778071832571939, 0.23970161152553954)

In [54]:
# Hyperparameter tuning
from sklearn.model_selection import GridSearchCV
param_grid = {
    'max_depth': [3, 5, 7, 10, 15],
    # Minimum number of samples a node must hold before it may be split
    # (controls model complexity).
    'min_samples_split': [2, 10, 20],
    # Minimum number of samples that must end up in each leaf.
    'min_samples_leaf': [1, 5, 10],
    'criterion': ['gini', 'entropy'],
}
grid_search = GridSearchCV(
    # Fixed seed so every candidate is fitted reproducibly.
    DecisionTreeClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    return_train_score=True,
    n_jobs=-1,  # use every available CPU core
)
grid_search.fit(x_train, y_train)

0,1,2
,estimator,DecisionTreeClassifier()
,param_grid,"{'criterion': ['gini', 'entropy'], 'max_depth': [3, 5, ...], 'min_samples_leaf': [1, 5, ...], 'min_samples_split': [2, 10, ...]}"
,scoring,
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,True

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,15
,min_samples_split,20
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [58]:
# Best hyperparameter combination and its mean cross-validated score.
grid_search.best_params_, grid_search.best_score_

({'criterion': 'gini',
  'max_depth': 15,
  'min_samples_leaf': 1,
  'min_samples_split': 20},
 np.float64(0.27907761329165426))

In [60]:
# Tabulate the grid-search results, best rank first, keeping only the columns
# needed to compare training and validation scores per candidate.
results = (
    pd.DataFrame(grid_search.cv_results_)
    .sort_values(by='rank_test_score')
    .loc[:, ['params', 'mean_train_score', 'mean_test_score', 'rank_test_score']]
)
print('GridSearchCV 결과')
results.head()

GridSearchCV 결과


Unnamed: 0,params,mean_train_score,mean_test_score,rank_test_score
38,"{'criterion': 'gini', 'max_depth': 15, 'min_sa...",0.332747,0.279078,1
41,"{'criterion': 'gini', 'max_depth': 15, 'min_sa...",0.331615,0.279002,2
37,"{'criterion': 'gini', 'max_depth': 15, 'min_sa...",0.339185,0.278427,3
42,"{'criterion': 'gini', 'max_depth': 15, 'min_sa...",0.329468,0.278421,4
44,"{'criterion': 'gini', 'max_depth': 15, 'min_sa...",0.329467,0.278417,5


In [None]:
from sklearn.metrics import classification_report

# Evaluate the estimator selected by the grid search on the held-out split and
# print per-class precision / recall / F1.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(x_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Class re-weighting.
# When the classes are heavily imbalanced, give minority classes more weight.
# Most scikit-learn estimators accept a `class_weight` argument for this.
#
# Build a new dict instead of adding the key to grid_search.best_params_
# itself, so the search result stays exactly as GridSearchCV produced it.
best_params = {**grid_search.best_params_, 'class_weight': 'balanced'}
# random_state is passed separately (not stored in best_params) and fixed so
# the fitted tree is reproducible.
balanced_tree = DecisionTreeClassifier(**best_params, random_state=42)
balanced_tree.fit(x_train, y_train)
y_pred = balanced_tree.predict(x_test)
print(classification_report(y_test, y_pred))

In [None]:
# Data augmentation (oversampling)
# %pip install imbalanced-learn
from imblearn.over_sampling import SMOTE

smote = SMOTE(k_neighbors=4, random_state=42)
# Resample the training split only; the test split is not passed in.
x_train_over, y_train_over = smote.fit_resample(x_train, y_train)
# Per-class sample counts after oversampling.
y_train_over.value_counts()

In [73]:
# Reuse best_params from the class-weighting step; note that it still carries
# class_weight='balanced' in addition to the grid-search parameters.
# random_state is fixed so the fitted tree is reproducible.
over_tree = DecisionTreeClassifier(**best_params, random_state=42)
over_tree.fit(x_train_over, y_train_over)
# Accuracy on the original (non-resampled) test split.
over_tree.score(x_test, y_test)

0.18771710039291611

In [None]:
# EDA
