In [None]:
import pandas as pd

In [None]:
# 파일 불러오기
df = pd.read_csv('dataset_final.csv')
df.drop('Unnamed: 0', axis=1, inplace=True)

NameError: ignored

In [None]:
print('dataset shape: ', df.shape)
print(df.info())

In [None]:
print(df['활성화_여부'].value_counts())

2    841
1    301
0     99
Name: 활성화_여부, dtype: int64


식별자 컬럼 제거

In [None]:
df.drop(['행정동_코드', '행정동명', '상권_구분_코드', '상권_코드', '상권_코드_명'], axis=1, inplace=True)

분석에 사용할 속성 선택

In [None]:
columns = ['분기당_매출_금액', '분기당_매출_건수', '환산임대료(원, 21년도)', '점포수', '단위면적당 점포 수(k㎡)', '프랜차이즈_점포_수', '개인사업자_수', '2021년_2030여성_비율평균', 
           '2021년_외국인_평균', '문화시설수', '지하철_역_수', '버스_정거장_수', '전체_카페_점포_수', '개인카페_점포수']

클래스 데이터셋과 피처 데이터셋 분리

In [None]:
X_features = df.iloc[:,:-1]
y_labels = df.iloc[:,-1]
print('피처 데이터 shape:{0}'.format(X_features.shape))

피처 데이터 shape:(1241, 21)


피처 변수 전처리(정규화)

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
scaler = StandardScaler()
scaler.fit(X_features)
X_scaled = scaler.transform(X_features)

In [None]:
X_features = pd.DataFrame(data=X_scaled, columns=X_features.columns)

학습 데이터셋과 테스트 데이터셋 분리

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_features, y_labels,
                                                   test_size=0.3, random_state=101)

train_cnt = y_train.count()
test_cnt = y_test.count()
print('학습 세트 Shape:{0}, 테스트 세트 Shape:{1}'.format(X_train.shape, X_test.shape))

print(' 학습 세트 레이블 값 분포 비율')
print(y_train.value_counts()/train_cnt)
print('\n 테스트 세트 레이블 값 분포 비율')
print(y_test.value_counts()/test_cnt)

학습 세트 Shape:(868, 21), 테스트 세트 Shape:(373, 21)
 학습 세트 레이블 값 분포 비율
2    0.694700
1    0.236175
0    0.069124
Name: 활성화_여부, dtype: float64

 테스트 세트 레이블 값 분포 비율
2    0.638070
1    0.257373
0    0.104558
Name: 활성화_여부, dtype: float64


## SVM

SVM 모형 학습

In [None]:
from sklearn import svm
from sklearn.metrics import f1_score

In [None]:
clf = svm.SVC(decision_function_shape='ovo')

In [None]:
clf.fit(X_train, y_train)

SVC(decision_function_shape='ovo')

In [None]:
pred = clf.predict(X_test)

In [None]:
f1 = f1_score(y_test, pred, average='micro')
print(f1)

0.7158176943699732


# LightGBM

LightGBM 모델 학습 및 f1-score 확인

In [None]:
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

In [None]:
lgbm_clf = LGBMClassifier(n_estimators=300, objective='multiclass')

In [None]:
evals = [(X_test,y_test)]

lgbm_clf.fit(X_train, y_train, early_stopping_rounds=100, 
             eval_set=evals, verbose=True)

In [None]:
pred = lgbm_clf.predict(X_test)

In [None]:
f1 = f1_score(y_test, pred, average='micro')
print(f1)

0.737265415549598


하이퍼 파라미터 튜닝

In [None]:
lgbm_clf = LGBMClassifier(n_estimators=300, objective='multiclass')

In [None]:
params = {'num_leaves': [32, 64, 128], 
          'max_depth': [32, 64, 128],
          'min_child_samples': [60, 80, 100],
          'subsample': [0.5, 0.7, 0.9]}


In [None]:
gridcv = GridSearchCV(lgbm_clf, param_grid=params)
gridcv.fit(X_train, y_train, early_stopping_rounds=30,
           eval_set=[(X_train, y_train), (X_test, y_test)])

print('GridSearchCV 최적 파라미터:', gridcv.best_params_)

GridSearchCV 최적 파라미터: {'max_depth': 32, 'min_child_samples': 80, 'num_leaves': 32, 'subsample': 0.5}

In [None]:
lgbm_clf = LGBMClassifier(n_estimators=100, objective='multiclass', max_depth=32, min_child_samples=80, num_leaves=32, subsample=0.5)

In [None]:
evals = [(X_test, y_test)]

lgbm_clf.fit(X_train, y_train, early_stopping_rounds=50, eval_set=evals, verbose=True)

In [None]:
pred = lgbm_clf.predict(X_test)

검증

In [None]:
f1 = f1_score(y_test, pred, average='micro')
print(f1)

0.737265415549598


In [None]:
confusion_matrix(y_test, pred)

array([[ 35,   1,   3],
       [  4,  23,  69],
       [  2,  19, 217]])