In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

import warnings
warnings.filterwarnings('ignore')

In [2]:
## Read the one-hot encoded CSV file.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
DATA = pd.read_csv(
    r"C:/Users/dong/Desktop/R/data/Onehot.csv",
    sep=",",
    encoding="CP949",
)

In [3]:
## Split the data into train and test sets; the 'DLY' column is the target.
from sklearn.model_selection import train_test_split

# random_state is fixed so the same partition is produced on every run.
X_train , X_test , y_train , y_test = train_test_split(
    DATA.drop('DLY', axis=1) , DATA['DLY'] , random_state = 0)

# Print the shape of every split so the partition sizes can be checked.
print("X_train : " , X_train.shape)
print("y_train : " , y_train.shape)
print("X_test : " , X_test.shape)
# Fixed label: this line reports y_test but was previously printed as "y_train : ".
print("y_test : " , y_test.shape)

X_train :  (740781, 117)
y_train :  (740781,)
X_test :  (246928, 117)
y_train :  (246928,)


In [4]:
## Try out models

# Logistic regression, fitted on the original (non-resampled) training split.
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(C=1)
logreg.fit(X_train, y_train)
pred_log = logreg.predict(X_test)

# Score on each split, then the per-class report for the test predictions.
train_acc = logreg.score(X_train, y_train)
print("훈련 : ", train_acc)
test_acc = logreg.score(X_test, y_test)
print("테스트 : ", test_acc)
print(classification_report(y_test, pred_log))


훈련 :  0.8796351418300415
테스트 :  0.8795478844035508
              precision    recall  f1-score   support

           0       0.88      1.00      0.94    217173
           1       0.83      0.00      0.00     29755

    accuracy                           0.88    246928
   macro avg       0.86      0.50      0.47    246928
weighted avg       0.87      0.88      0.82    246928



In [5]:
# Random forest 1

from sklearn.ensemble import RandomForestClassifier

# 20 trees, fitted on the original (non-resampled) training split.
forest = RandomForestClassifier(random_state=0, n_estimators=20).fit(X_train, y_train)
pred_forest = forest.predict(X_test)

# Score on each split, then the per-class report for the test predictions.
train_acc = forest.score(X_train, y_train)
print("훈련 : ", train_acc)
test_acc = forest.score(X_test, y_test)
print("테스트 : ", test_acc)
print(classification_report(y_test, pred_forest))

훈련 :  0.991251125501329
테스트 :  0.8794466403162056
              precision    recall  f1-score   support

           0       0.89      0.98      0.93    217173
           1       0.50      0.13      0.21     29755

    accuracy                           0.88    246928
   macro avg       0.70      0.56      0.57    246928
weighted avg       0.84      0.88      0.85    246928



In [7]:
# Random forest 12

# Larger forest with restricted feature sampling and tree depth,
# still fitted on the original (non-resampled) training split.
rf_params = dict(n_estimators=100, max_depth=50, max_features=2, random_state=0)
forest = RandomForestClassifier(**rf_params).fit(X_train, y_train)
pred_forest = forest.predict(X_test)

train_acc = forest.score(X_train, y_train)
print("훈련 : ", train_acc)
test_acc = forest.score(X_test, y_test)
print("테스트 : ", test_acc)
print(classification_report(y_test, pred_forest))

# F1 score of the test predictions (f1_score is called with its default settings).
test_f1 = f1_score(y_test, pred_forest)
print("\nf1 스코어 : ", test_f1)

훈련 :  0.9901144872776164
테스트 :  0.880770912978682
              precision    recall  f1-score   support

           0       0.89      0.99      0.94    217173
           1       0.53      0.08      0.14     29755

    accuracy                           0.88    246928
   macro avg       0.71      0.54      0.54    246928
weighted avg       0.84      0.88      0.84    246928


f1 스코어 :  0.14273651107940483


In [11]:

## Rows with DLY == 0 are far more numerous, so the classifier does not
## separate the classes properly -> imbalanced data.
confusion = confusion_matrix(y_true=y_test, y_pred=pred_forest)
print("행렬 :\n", confusion)

행렬 :
 [[215036   2137]
 [ 27304   2451]]


In [12]:
# Import SMOTE explicitly instead of via a wildcard so its origin is visible.
from imblearn.over_sampling import SMOTE
# NOTE(review): nothing from imblearn.under_sampling is used in the visible cells;
# the wildcard import is kept only in case other cells rely on it — confirm and remove.
from imblearn.under_sampling import *
## Oversample the minority rows (DLY == 1) in an attempt to improve precision.

# Only the training split is resampled; X_test / y_test are left untouched.
# fit_resample replaces fit_sample, which newer imbalanced-learn releases no longer provide.
X_smo_t , y_smo_t = SMOTE(random_state = 0).fit_resample(X_train , y_train)

print(X_smo_t.shape)
print(y_smo_t.shape)

(1303198, 117)
(1303198,)


In [13]:
# Random forest with class_weight='balanced', fitted on the SMOTE-resampled training data.

rf_params = dict(
    n_estimators=100,
    max_depth=50,
    max_features=2,
    class_weight='balanced',
    random_state=0,
)
forest = RandomForestClassifier(**rf_params).fit(X_smo_t, y_smo_t)
pred1 = forest.predict(X_test)

# Training score is measured on the resampled data, test score on the untouched test split.
train_acc = forest.score(X_smo_t, y_smo_t)
print("훈련 : ", train_acc)
test_acc = forest.score(X_test, y_test)
print("테스트 : ", test_acc)
print(classification_report(y_test, pred1))
test_f1 = f1_score(y_test, pred1)
print("\nf1 스코어 : ", test_f1)


훈련 :  0.9972007323522596
테스트 :  0.8582258796086308
              precision    recall  f1-score   support

           0       0.92      0.92      0.92    217173
           1       0.41      0.40      0.41     29755

    accuracy                           0.86    246928
   macro avg       0.66      0.66      0.66    246928
weighted avg       0.86      0.86      0.86    246928


f1 스코어 :  0.407547808427822


In [None]:
# Gradient boosting classifier, fitted on the SMOTE-resampled training data.

from sklearn.ensemble import GradientBoostingClassifier

gbrt = GradientBoostingClassifier(random_state = 0  , learning_rate = 0.1)
# GradientBoostingClassifier.fit() has no class_weight parameter, so passing
# class_weight='balanced' raised a TypeError. X_smo_t / y_smo_t are already
# oversampled with SMOTE, so no additional re-weighting is applied here.
gbrt.fit(X_smo_t , y_smo_t)
pred2 = gbrt.predict(X_test)

# Training score is measured on the resampled data, test score on the untouched test split.
print("  훈련 : " , gbrt.score(X_smo_t , y_smo_t))
print("  테스트 : " , gbrt.score(X_test, y_test))
print(classification_report(y_test , pred2))
print("\nf1 스코어 : " , f1_score(y_test , pred2))

In [14]:
# Logistic regression, this time fitted on the SMOTE-resampled training data.

logreg = LogisticRegression(C=1)
logreg.fit(X_smo_t, y_smo_t)
pred3 = logreg.predict(X_test)

# Training score is measured on the resampled data, test score on the untouched test split.
train_acc = logreg.score(X_smo_t, y_smo_t)
print("훈련 : ", train_acc)
test_acc = logreg.score(X_test, y_test)
print("테스트 : ", test_acc)
print(classification_report(y_test, pred3))
test_f1 = f1_score(y_test, pred3)
print("\nf1 스코어 : ", test_f1)

훈련 :  0.927317261076214
테스트 :  0.8795316853495756
              precision    recall  f1-score   support

           0       0.88      1.00      0.94    217173
           1       0.56      0.00      0.00     29755

    accuracy                           0.88    246928
   macro avg       0.72      0.50      0.47    246928
weighted avg       0.84      0.88      0.82    246928


f1 스코어 :  0.002481472787632876


In [15]:
# Naive Bayes binary classification (Bernoulli variant), fitted on the SMOTE-resampled training data.
from sklearn.naive_bayes import BernoulliNB

nb = BernoulliNB(alpha=100, fit_prior=True, class_prior=None).fit(X_smo_t, y_smo_t)
pred4 = nb.predict(X_test)

# Training score is measured on the resampled data, test score on the untouched test split.
train_acc = nb.score(X_smo_t, y_smo_t)
print("훈련 : ", train_acc)
test_acc = nb.score(X_test, y_test)
print("테스트 : ", test_acc)
print(classification_report(y_test, pred4))
test_f1 = f1_score(y_test, pred4)
print("\nf1 스코어 : ", test_f1)

훈련 :  0.7063278181826553
테스트 :  0.6797366033823625
              precision    recall  f1-score   support

           0       0.93      0.69      0.79    217173
           1       0.21      0.62      0.32     29755

    accuracy                           0.68    246928
   macro avg       0.57      0.65      0.55    246928
weighted avg       0.84      0.68      0.73    246928


f1 스코어 :  0.31817633162623077


In [16]:
# Random forest 2

# Same settings as random forest 1 (20 trees), but fitted on the SMOTE-resampled training data.
forest = RandomForestClassifier(random_state=0, n_estimators=20).fit(X_smo_t, y_smo_t)
pred5 = forest.predict(X_test)

train_acc = forest.score(X_smo_t, y_smo_t)
print("훈련 : ", train_acc)
test_acc = forest.score(X_test, y_test)
print("테스트 : ", test_acc)
print(classification_report(y_test, pred5))
test_f1 = f1_score(y_test, pred5)
print("\nf1 스코어 : ", test_f1)

훈련 :  0.9972705605748321
테스트 :  0.8630734465107238
              precision    recall  f1-score   support

           0       0.91      0.94      0.92    217173
           1       0.41      0.31      0.36     29755

    accuracy                           0.86    246928
   macro avg       0.66      0.63      0.64    246928
weighted avg       0.85      0.86      0.85    246928


f1 스코어 :  0.3556741305383516


In [17]:
# Random forest with class_weight='balanced' — variant with max_depth=40.

rf_params = dict(
    n_estimators=100,
    max_depth=40,
    max_features=2,
    class_weight='balanced',
    random_state=0,
)
forest = RandomForestClassifier(**rf_params).fit(X_smo_t, y_smo_t)
pred1 = forest.predict(X_test)

# Training score is measured on the resampled data, test score on the untouched test split.
train_acc = forest.score(X_smo_t, y_smo_t)
print("훈련 : ", train_acc)
test_acc = forest.score(X_test, y_test)
print("테스트 : ", test_acc)
print(classification_report(y_test, pred1))
test_f1 = f1_score(y_test, pred1)
print("\nf1 스코어 : ", test_f1)


훈련 :  0.9809115729152439
테스트 :  0.8416906952633966
              precision    recall  f1-score   support

           0       0.93      0.89      0.91    217173
           1       0.38      0.48      0.42     29755

    accuracy                           0.84    246928
   macro avg       0.65      0.68      0.66    246928
weighted avg       0.86      0.84      0.85    246928


f1 스코어 :  0.4202814729130519


In [18]:
# Random forest with class_weight='balanced' — variant with max_depth=45.

rf_params = dict(
    n_estimators=100,
    max_depth=45,
    max_features=2,
    class_weight='balanced',
    random_state=0,
)
forest = RandomForestClassifier(**rf_params).fit(X_smo_t, y_smo_t)
pred1 = forest.predict(X_test)

# Training score is measured on the resampled data, test score on the untouched test split.
train_acc = forest.score(X_smo_t, y_smo_t)
print("훈련 : ", train_acc)
test_acc = forest.score(X_test, y_test)
print("테스트 : ", test_acc)
print(classification_report(y_test, pred1))
test_f1 = f1_score(y_test, pred1)
print("\nf1 스코어 : ", test_f1)


훈련 :  0.9936479337752206
테스트 :  0.8528801917967991
              precision    recall  f1-score   support

           0       0.92      0.91      0.92    217173
           1       0.40      0.44      0.42     29755

    accuracy                           0.85    246928
   macro avg       0.66      0.67      0.67    246928
weighted avg       0.86      0.85      0.86    246928


f1 스코어 :  0.4163239074550128


In [20]:
# Random forest with class_weight='balanced' — variant with max_features=1, max_depth=45.

rf_params = dict(
    n_estimators=100,
    max_depth=45,
    max_features=1,
    class_weight='balanced',
    random_state=0,
)
forest = RandomForestClassifier(**rf_params).fit(X_smo_t, y_smo_t)
pred1 = forest.predict(X_test)

# Training score is measured on the resampled data, test score on the untouched test split.
train_acc = forest.score(X_smo_t, y_smo_t)
print("훈련 : ", train_acc)
test_acc = forest.score(X_test, y_test)
print("테스트 : ", test_acc)
print(classification_report(y_test, pred1))
test_f1 = f1_score(y_test, pred1)
print("\nf1 스코어 : ", test_f1)


훈련 :  0.9933686208849307
테스트 :  0.8523739713600725
              precision    recall  f1-score   support

           0       0.92      0.91      0.92    217173
           1       0.40      0.43      0.41     29755

    accuracy                           0.85    246928
   macro avg       0.66      0.67      0.67    246928
weighted avg       0.86      0.85      0.86    246928


f1 스코어 :  0.4148702226359974


In [21]:
# Random forest with class_weight='balanced' — variant with max_features=3, max_depth=45.
#### Final selection ####
rf_params = dict(
    n_estimators=100,
    max_depth=45,
    max_features=3,
    class_weight='balanced',
    random_state=0,
)
forest = RandomForestClassifier(**rf_params).fit(X_smo_t, y_smo_t)
pred1 = forest.predict(X_test)

# Training score is measured on the resampled data, test score on the untouched test split.
train_acc = forest.score(X_smo_t, y_smo_t)
print("훈련 : ", train_acc)
test_acc = forest.score(X_test, y_test)
print("테스트 : ", test_acc)
print(classification_report(y_test, pred1))
test_f1 = f1_score(y_test, pred1)
print("\nf1 스코어 : ", test_f1)


훈련 :  0.9929933901064919
테스트 :  0.8520094926456295
              precision    recall  f1-score   support

           0       0.92      0.91      0.92    217173
           1       0.40      0.44      0.42     29755

    accuracy                           0.85    246928
   macro avg       0.66      0.67      0.67    246928
weighted avg       0.86      0.85      0.86    246928


f1 스코어 :  0.4152090767975164


In [79]:
# Read X_AFSNT_DLY, the one-hot encoded version of AFSNT_DLY.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files from a trusted source.
x_afsnt_dly_path = "C:/Users/dong/Desktop/R/data/X_AFSNT_DLY.pkl"
DATA2 = pd.read_pickle(x_afsnt_dly_path)

# Read AFSNT_DLY.
afsnt_dly_path = r"C:/Users/dong/Desktop/AFSNT_DLY_P.csv"
DATA3 = pd.read_csv(afsnt_dly_path, sep=",", encoding="CP949")

In [80]:
# Show the shape of the encoded frame (DATA2) and of the raw frame (DATA3), one per line.
print(DATA2.shape, DATA3.shape, sep="\n")

(16076, 117)
(16076, 13)


In [55]:
# Apply the model to AFSNT_DLY.

# Random forest with class_weight='balanced' — final selection.
# This cell does not refit: it expects `forest` to be already fitted in the kernel.
# The configuration previously kept here as commented-out code was
# n_estimators=100, random_state=0, max_features=3, max_depth=45,
# class_weight='balanced', fitted on (X_smo_t, y_smo_t).
pred = forest.predict(DATA2)          # predicted labels for DATA2
dly_rate = forest.predict_proba(DATA2)  # per-class probabilities for DATA2

# Score of the reused model on the resampled training data.
train_acc = forest.score(X_smo_t, y_smo_t)
print("훈련 : ", train_acc)




훈련 :  0.9929933901064919


In [53]:
# Display the (rows, columns) shape of DATA2.
DATA2.shape

(16076, 117)

In [84]:
# Inspect entry 350 of dly_rate (the predict_proba output for DATA2) as a spot check.
dly_rate[350]

array([0.73200968, 0.26799032])

In [71]:
# Check the type of DATA3's DLY_RATE column (accessed as an attribute).
# NOTE(review): requires DATA3 to already contain a DLY_RATE column — confirm it comes from the CSV.
type(DATA3.DLY_RATE)

pandas.core.series.Series

In [85]:
# Store the model outputs for AFSNT_DLY in DATA3.
DATA3['DLY'] = forest.predict(DATA2)
# predict_proba returns one column per class (ordered as forest.classes_), so the
# full array cannot be stored in a single column. Keep only the probability of
# class 1 as the delay rate.
# NOTE(review): assumes the labels are {0, 1}, so column 1 is P(DLY == 1) — confirm.
DATA3['DLY_RATE'] = forest.predict_proba(DATA2)[:, 1]
# Score of the reused model on the resampled training data.
print("훈련 : " , forest.score(X_smo_t, y_smo_t))


훈련 :  0.9929933901064919


In [None]:
# predict_proba is a method: call it first, then index the returned array.
# (The previous form subscripted the method itself and raised a TypeError.)
# Column 1 is taken as the delay probability.
# NOTE(review): assumes the labels are {0, 1}, so column 1 is P(DLY == 1) — confirm;
# use [:, 0] instead if the probability of class 0 was intended.
dly_rate2 = forest.predict_proba(DATA2)[:, 1]

In [90]:
# Usage pattern: <dataframe_name>.to_csv("")
# Export the DLY column of DATA3 to its own CSV file.
# NOTE(review): absolute, machine-specific output path.
dly_csv_path = r"C:/Users/dong/Desktop/DLY.csv"
DLY = DATA3['DLY']
DLY.to_csv(dly_csv_path)

In [91]:
# Export the DLY_RATE column of DATA3 to its own CSV file.
# NOTE(review): absolute, machine-specific output path.
dly_rate_csv_path = r"C:/Users/dong/Desktop/DLY_RATE.csv"
DLY_RATE = DATA3['DLY_RATE']
DLY_RATE.to_csv(dly_rate_csv_path)