<a href="https://colab.research.google.com/github/ohilikeit/LG_Aimers_2023/blob/main/code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install optuna
! pip install catboost

In [72]:
import pandas as pd
%matplotlib inline 
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.model_selection import train_test_split
import optuna
from optuna import Trial, visualization
from optuna.samplers import TPESampler
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import f1_score
import warnings
import random
import os

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier


warnings.simplefilter(action='ignore', category=FutureWarning)
def seed_everything(seed):
    """Seed the random sources this notebook relies on.

    Exports ``PYTHONHASHSEED`` as the string form of ``seed`` and seeds both
    Python's ``random`` module and NumPy's global generator.

    Args:
        seed: Seed value handed to ``random.seed`` and ``np.random.seed``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
# Fix the seeds before anything stochastic runs.
seed_everything(42)
# Work from the project folder so the relative CSV paths below resolve.
# NOTE(review): assumes Google Drive is already mounted at /content/drive - no mount call is visible here.
os.chdir('/content/drive/MyDrive/ML_projects/LG_Aimers')
# Missing values are replaced with 0 at load time in both train and test.
train = pd.read_csv('./train.csv').fillna(0)
test = pd.read_csv('./test.csv').fillna(0)
# Submission template; its Y_Class column is overwritten further down.
sample_submission = pd.read_csv('./sample_submission.csv')

# Categorical columns that are converted to integer codes.
qual_col = ['LINE', 'PRODUCT_CODE']

for i in qual_col:
    # One encoder per column, fitted on the training values only.
    le = LabelEncoder()
    le = le.fit(train[i])
    train[i] = le.transform(train[i])
    
    # Labels that occur only in test are appended to the fitted classes so
    # they are present in classes_ when the test column is transformed.
    # NOTE(review): appending leaves classes_ unsorted - confirm transform()
    # still maps correctly for the dtype of these columns.
    for label in np.unique(test[i]): 
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    test[i] = le.transform(test[i])

# Features: everything except the identifier, both target columns and the timestamp.
X = train.drop(['PRODUCT_ID', 'Y_Class', 'Y_Quality', 'TIMESTAMP'], axis=1)
# Classification target used in the rest of the notebook.
y = train['Y_Class']
test = test.drop(['PRODUCT_ID', 'TIMESTAMP'], axis=1)

# scaling: fit on the training features only, then reuse the fitted scaler on test.
# The label-encoded LINE / PRODUCT_CODE columns are part of X, so they are scaled too.
scaler = RobustScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns = X.columns)
test = pd.DataFrame(scaler.transform(test), columns = test.columns)

def preprocess(df):
    """Drop, in place, every column of ``df`` that has no non-missing value.

    Args:
        df: DataFrame to clean; it is modified in place.

    Returns:
        The same DataFrame object, without its completely empty columns.
    """
    # count() tallies non-missing entries, so zero means the column is empty.
    hollow = [name for name in df.columns if df[name].count() == 0]
    df.drop(columns=hollow, inplace=True)
    return df

# Remove completely empty columns from each feature table.
# NOTE(review): X and test are checked independently, so they could in
# principle end up with different columns - confirm they still match.
X = preprocess(X)
test = preprocess(test)
# Rebuild `train` as the processed features with the target appended.
train = pd.concat([X, y], axis=1)

In [None]:
from sklearn.decomposition import PCA
# Keep the first two feature columns as they are; project the remaining ones.
# NOTE(review): the first two columns are presumably LINE and PRODUCT_CODE - confirm the column order of train.csv.
ex = X.iloc[:, :2]
only_X = X.iloc[:, 2:]
pca = PCA(n_components=2)
pca.fit(only_X)
X_pca = pca.transform(only_X)
print(X_pca.shape)
# Share of variance captured by each of the two components.
print(pca.explained_variance_ratio_)

# `train` becomes: two PCA components + the two kept columns + the target.
# `test` and `X` are not modified in this cell.
df = pd.DataFrame(X_pca, columns = ['comp1', 'comp2'])
train = pd.concat([df, ex, y], axis=1)

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# Keep the first two feature columns as they are; project the remaining ones.
ex = X.iloc[:, :2]
only_X = X.iloc[:, 2:]

# Supervised projection fitted on the training features and labels.
lda = LinearDiscriminantAnalysis()
lda.fit(only_X,y)
X_lda = lda.transform(only_X)


# Apply the same split and the already-fitted LDA to the test features.
ex_2 = test.iloc[:, :2]
only_X_test = test.iloc[:, 2:]
test_lda = lda.transform(only_X_test)
# NOTE(review): two output columns are assumed; this matches the three
# classes (0/1/2) seen in the printed outputs - confirm.
test = pd.DataFrame(test_lda, columns = ['comp1', 'comp2'])
test = pd.concat([test, ex_2], axis=1)


# Replace train / X / y with the reduced representation.
# NOTE(review): X and test are overwritten here, so re-running this cell
# without re-running the preprocessing cell would operate on the reduced data.
df = pd.DataFrame(X_lda, columns = ['comp1', 'comp2'])
train = pd.concat([df, ex, y], axis=1)
X = train.drop('Y_Class', axis=1)
y = train['Y_Class']

In [None]:
# Define the Optuna objective
def objective(trial: Trial, train):
    """Score one GradientBoostingClassifier configuration with 5-fold CV.

    Args:
        trial: Optuna trial used to sample ``n_estimators``,
            ``min_impurity_decrease``, ``max_depth``, ``max_features`` and
            ``subsample``.
        train: DataFrame holding the feature columns plus a ``Y_Class``
            target column.

    Returns:
        Mean macro-averaged F1 over the five stratified validation folds.
    """
    param = {
        'verbose' : 0,
        'random_state': 7,
        'n_iter_no_change' : 50,
        'validation_fraction' : 0.1,
        'learning_rate' : 0.1,
        'n_estimators' : trial.suggest_int('n_estimators', 100, 2000, step=20),
        'min_impurity_decrease' : trial.suggest_float('min_impurity_decrease', 0, 0.2),
        'max_depth' : trial.suggest_int('max_depth', 2, 20),
        # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3; for
        # this classifier it meant sqrt(n_features), so 'sqrt' is equivalent.
        'max_features' : trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        'subsample' : trial.suggest_float('subsample', 0.7, 1.0, step=0.1)
    }
    cv_accuracy = []
    cv = StratifiedKFold(n_splits = 5)

    for t, v in cv.split(train, train['Y_Class']):
        train_cv = train.iloc[t]
        val_cv = train.iloc[v]

        X = train_cv.drop('Y_Class', axis=1)
        y = train_cv['Y_Class']

        val_X = val_cv.drop('Y_Class', axis=1)
        val_y = val_cv['Y_Class']

        # A fresh model per fold so no state leaks between folds.
        model = GradientBoostingClassifier(**param)
        model.fit(X, y)
        cv_accuracy.append(f1_score(val_y, model.predict(val_X), average = 'macro'))

    # An earlier variant scored a single hold-out split (X_train / X_val)
    # instead of cross-validation; the CV mean is what the study optimises.
    return np.mean(cv_accuracy)

# create study
study = optuna.create_study(
    direction='maximize',  # the objective returns a macro-F1 score
    sampler=optuna.samplers.TPESampler(seed=42),
    # NOTE(review): objective() never calls trial.report() / trial.should_prune(),
    # so this pruner does not appear to have any effect.
    pruner=optuna.pruners.HyperbandPruner(),
    study_name='GB-Hyperparameter-Tuning'
)

# Run the search on the current global `train` frame.
study.optimize(lambda trial: objective(trial,train), 
               n_trials=50)
print('Best trial : score {}, \nparams {}'.format(study.best_trial.value, study.best_trial.params))

In [None]:
# SVC(kernel = 'poly')로 stacking이나 활용하자 / split은 0.2 기준 0 나왔음. 
# 할때 범주형 변수 labelencoding 해놨으니 얘네는 빼고 돌려보고 더 좋은 걸로 하자 

# y_class 나뉘는걸 보면 굉장히 민감하게 나뉘는거같은데 pca와 같은 차원축소 기법이 과연 옳을 것인가는 의문
# 이건 해보고 성능 높아지면 쓰고 아니면 안쓰기 

# 공정 데이터이고 각 컬럼이 비식별화되어 함부로 판단하기 힘듦. 
# 결측치로 비어있는 부분은 아마 line이나 product code가 다름에 따라 해당 공정이 없는 경우일 것으로 예상됨.
# 이상치 처리는 건들지 말고 결측치는 0으로 두어 모델에 반영되지 않게 하자. 

# line, product 4-1 이랑 5-1 조합 train에선 3 3 이고 test에선 3 1 인데 이거 빼고 아니고 성능 차이 확인 

In [68]:
# Refit on the full training data with the best sampled hyper-parameters.
# NOTE(review): the GB objective in this notebook tunes with learning_rate=0.1
# and random_state=7, while this refit uses 0.05 and 42 - confirm this is intended.
# NOTE(review): `study` is whichever study was run last in the session.
GB_model = GradientBoostingClassifier(**study.best_trial.params, random_state = 42, n_iter_no_change = 50, validation_fraction = 0.1, learning_rate = 0.05)
GB_model.fit(X, y)
GB_pred = GB_model.predict(test)
# Predicted class distribution, as a quick sanity check.
print(pd.DataFrame(GB_pred).value_counts())

# Write the predictions into the submission template and save it.
sample_submission['Y_Class'] = GB_pred
sample_submission.to_csv('./submit.csv', index=False)

1    261
0     30
2     19
dtype: int64


## 1. XGB

In [None]:
# Define the Optuna objective
def objective(trial: Trial, train):
    """Score one XGBClassifier configuration with 5-fold CV.

    Args:
        trial: Optuna trial used to sample the learning rate, tree count,
            tree shape, column subsampling and regularisation strengths.
        train: DataFrame holding the feature columns plus a ``Y_Class``
            target column.

    Returns:
        Mean macro-averaged F1 over the five stratified validation folds.
    """
    # The former 'criterion': 'absolute_error' entry was dropped: it is a
    # scikit-learn tree option, not an XGBoost parameter.
    param = {
        'tree_method' : 'exact',
        'seed': 42,
        'learning_rate' : trial.suggest_float('learning_rate', 0.01, 0.1),
        'n_estimators':trial.suggest_int('n_estimators', 100, 1000),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 9),
        'max_depth' : trial.suggest_int('max_depth', 3, 30),
        'colsample_bytree' : trial.suggest_float('colsample_bytree', 0.2, 1.0),
        # suggest_loguniform is deprecated; suggest_float(log=True) samples
        # from the same log-uniform range.
        'min_split_loss' : trial.suggest_float('min_split_loss', 1e-3, 10.0, log=True),
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
    }
    cv_accuracy = []
    cv = StratifiedKFold(n_splits = 5)

    for t, v in cv.split(train, train['Y_Class']):
        train_cv = train.iloc[t]
        val_cv = train.iloc[v]

        X = train_cv.drop('Y_Class', axis=1)
        y = train_cv['Y_Class']

        val_X = val_cv.drop('Y_Class', axis=1)
        val_y = val_cv['Y_Class']

        # A fresh model per fold so no state leaks between folds.
        model = XGBClassifier(**param)
        model.fit(X, y)
        cv_accuracy.append(f1_score(val_y, model.predict(val_X), average = 'macro'))

    return np.mean(cv_accuracy)


# create study
study = optuna.create_study(
    direction='maximize',  # the objective returns a macro-F1 score
    sampler=optuna.samplers.TPESampler(seed=42),
    # NOTE(review): objective() never calls trial.report() / trial.should_prune(),
    # so this pruner does not appear to have any effect.
    pruner=optuna.pruners.HyperbandPruner(),
    study_name='XGBClassifier-Hyperparameter-Tuning'
)

# Run the search on the current global `train` frame.
study.optimize(lambda trial: objective(trial,train), 
               n_trials=50)
print('Best trial : score {}, \nparams {}'.format(study.best_trial.value, study.best_trial.params))

## 2. LGBM

In [None]:
# Define the Optuna objective
def objective(trial: Trial, train):
    """Cross-validate a fixed LGBMClassifier configuration.

    No hyper-parameter is sampled from ``trial`` at the moment; the search
    space tried earlier is kept below as comments.

    Args:
        trial: Optuna trial (currently unused by the body).
        train: DataFrame holding the feature columns plus a ``Y_Class``
            target column.

    Returns:
        Mean macro-averaged F1 over the five stratified validation folds.
    """
    # Disabled search space, kept for reference:
    # 'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.7, 1.0),
    # 'max_depth' : trial.suggest_int('max_depth', 2, 30),
    # 'n_estimators' : trial.suggest_int('n_estimators', 500, 3000),
    # 'learning_rate': trial.suggest_float('learning_rate', 0.1, 0.3),
    # 'num_leaves' : trial.suggest_int('num_leaves', 5, 30),
    # 'min_child_samples' : trial.suggest_int('min_child_samples', 2, 10),
    # 'reg_alpha' : trial.suggest_float('reg_alpha', 1, 1.3),
    # 'reg_lambda' : trial.suggest_float('reg_lambda', 1, 1.3)
    param = dict(verbose=-1, random_state=7, n_jobs=-1, learning_rate=0.1)

    splitter = StratifiedKFold(n_splits=5)
    fold_scores = []
    for fit_idx, val_idx in splitter.split(train, train['Y_Class']):
        fit_part = train.iloc[fit_idx]
        val_part = train.iloc[val_idx]

        model = LGBMClassifier(**param)
        model.fit(fit_part.drop('Y_Class', axis=1), fit_part['Y_Class'])

        predicted = model.predict(val_part.drop('Y_Class', axis=1))
        score = f1_score(val_part['Y_Class'], predicted, average='macro')
        # Per-fold score is echoed for every trial.
        print(score)
        fold_scores.append(score)

    return np.mean(fold_scores)

# create study
study = optuna.create_study(
    direction='maximize',  # the objective returns a macro-F1 score
    sampler=optuna.samplers.TPESampler(seed=42),
    # NOTE(review): objective() never calls trial.report() / trial.should_prune(),
    # so this pruner does not appear to have any effect.
    pruner=optuna.pruners.HyperbandPruner(),
    study_name='LGBM-Hyperparameter-Tuning'
)

# Run the search.
# NOTE(review): the LGBM objective currently samples nothing from the trial
# (its search space is commented out), so all 100 trials evaluate the same configuration.
study.optimize(lambda trial: objective(trial,train),
               n_trials=100)
print('Best trial : score {}, \nparams {}'.format(study.best_trial.value, study.best_trial.params))

In [None]:
# Stratified 90/10 hold-out split; the baseline cells that follow reuse it.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.1, stratify = y, random_state = 7)

# Baseline: RandomForest with default hyper-parameters, scored by macro F1 on the hold-out.
RF = RandomForestClassifier(random_state=7).fit(X_train, y_train)
RF_pred = RF.predict(X_val)
score = f1_score(y_val, RF_pred, average='macro')
print(score)

In [None]:
# Baseline: LightGBM with default hyper-parameters, scored by macro F1 on the
# X_train / X_val hold-out split defined in an earlier cell.
model = LGBMClassifier(random_state=7).fit(X_train, y_train)
pred = model.predict(X_val)
score = f1_score(y_val, pred, average='macro')
print(score)

0.5929319467250501


In [None]:
# Baseline: XGBoost with default hyper-parameters, scored by macro F1 on the
# X_train / X_val hold-out split defined in an earlier cell.
model = XGBClassifier(random_state=7).fit(X_train, y_train)
pred = model.predict(X_val)
score = f1_score(y_val, pred, average='macro')
print(score)

0.5925925925925926


In [None]:
# Baseline: ExtraTrees with default hyper-parameters, scored by macro F1 on the
# X_train / X_val hold-out split defined in an earlier cell.
model = ExtraTreesClassifier(random_state=7).fit(X_train, y_train)
pred = model.predict(X_val)
score = f1_score(y_val, pred, average='macro')
print(score)

0.6415491578535056


In [None]:
# Baseline: GradientBoosting with default hyper-parameters, scored by macro F1 on the
# X_train / X_val hold-out split defined in an earlier cell.
model = GradientBoostingClassifier(random_state=7).fit(X_train, y_train)
pred = model.predict(X_val)
score = f1_score(y_val, pred, average='macro')
print(score)

0.6888888888888888


In [None]:
# Fit five default-configured classifiers on the full training data and
# predict the test set with each; the hard-voting cells below consume these.
RF = RandomForestClassifier(random_state=43).fit(X, y)
RF_pred = RF.predict(test)

XGB = XGBClassifier(random_state=43).fit(X, y)
XGB_pred = XGB.predict(test)

LGBM = LGBMClassifier(random_state=43).fit(X, y)
LGBM_pred = LGBM.predict(test)

ET = ExtraTreesClassifier(random_state=43).fit(X, y)
ET_pred = ET.predict(test)

# NOTE: this rebinds GB_pred, which other cells also assign.
GB = GradientBoostingClassifier(random_state=43).fit(X, y)
GB_pred = GB.predict(test)

# Models 2 and 5 look fine (presumably XGB and GB, counting in the order above - TODO confirm)

In [None]:
# Hard voting that uses all five models
# https://wikidocs.net/42408

def mode(list):
    """Return the most frequent element of ``list``.

    Ties go to the element that appears first; an empty input yields ``0``.

    Args:
        list: Iterable of hashable votes. The name shadows the builtin
            inside this function; it is kept so keyword callers still work.

    Returns:
        The winning element, or ``0`` when there are no votes.
    """
    from collections import Counter  # local import keeps the cell self-contained

    tally = Counter(list)
    if not tally:
        return 0
    # Counter preserves first-seen order and max() keeps the first maximal
    # key, so ties resolve to the earliest element.
    return max(tally, key=tally.get)
# Majority vote per test row across the five models' predictions
# (vote order: RF, XGB, LGBM, ET, GB - ties go to the earliest model).
final_pred = [
    mode(list(votes))
    for votes in zip(RF_pred, XGB_pred, LGBM_pred, ET_pred, GB_pred)
]
sample_submission['Y_Class'] = final_pred
# Predicted class distribution (displayed as the cell output).
sample_submission['Y_Class'].value_counts()

1    268
0     40
2      2
Name: Y_Class, dtype: int64

In [None]:
def mode(list):
    """Return the most frequent element of ``list``.

    Ties go to the element that appears first; an empty input yields ``0``.

    Args:
        list: Iterable of hashable votes. The name shadows the builtin
            inside this function; it is kept so keyword callers still work.

    Returns:
        The winning element, or ``0`` when there are no votes.
    """
    from collections import Counter  # local import keeps the cell self-contained

    tally = Counter(list)
    if not tally:
        return 0
    # Counter preserves first-seen order and max() keeps the first maximal
    # key, so ties resolve to the earliest element.
    return max(tally, key=tally.get)
# "Vote" with a single model: each row's list holds only the GB prediction,
# so mode() returns that prediction and final_pred mirrors GB_pred.
final_pred = []
for i in range(0, len(RF_pred)):
    final_pred.append(mode([GB_pred[i]]))
sample_submission['Y_Class'] = final_pred
# Predicted class distribution (displayed as the cell output).
sample_submission['Y_Class'].value_counts()

1    242
0     39
2     29
Name: Y_Class, dtype: int64

In [None]:
# Save whatever Y_Class values were last written into the template.
sample_submission.to_csv('./submit.csv', index=False)

## 3. ET

In [None]:
# Define the Optuna objective
def objective(trial: Trial, train):
    """Cross-validate one ExtraTreesClassifier configuration.

    Args:
        trial: Optuna trial used to sample ``max_depth``, ``ccp_alpha``,
            ``n_estimators`` and ``min_impurity_decrease``.
        train: DataFrame holding the feature columns plus a ``Y_Class``
            target column.

    Returns:
        Mean macro-averaged F1 over the five stratified validation folds.
    """
    # Keyword order matches the sampling order of the trial suggestions.
    param = dict(
        verbose=1,
        random_state=42,
        n_jobs=-1,
        max_depth=trial.suggest_int('max_depth', 5, 50),
        ccp_alpha=trial.suggest_float('ccp_alpha', 0, 0.3),
        n_estimators=trial.suggest_int('n_estimators', 500, 3000),
        min_impurity_decrease=trial.suggest_float('min_impurity_decrease', 0, 0.5),
    )

    splitter = StratifiedKFold(n_splits=5)
    fold_scores = []
    for fit_idx, val_idx in splitter.split(train, train['Y_Class']):
        fit_part = train.iloc[fit_idx]
        val_part = train.iloc[val_idx]

        # A fresh forest per fold, trained on the fold's fitting rows.
        model = ExtraTreesClassifier(**param)
        model.fit(fit_part.drop('Y_Class', axis=1), fit_part['Y_Class'])

        predicted = model.predict(val_part.drop('Y_Class', axis=1))
        fold_scores.append(f1_score(val_part['Y_Class'], predicted, average='macro'))

    return np.mean(fold_scores)


# create study
study = optuna.create_study(
    direction='maximize',  # the objective returns a macro-F1 score
    sampler=optuna.samplers.TPESampler(seed=42),
    # NOTE(review): objective() never calls trial.report() / trial.should_prune(),
    # so this pruner does not appear to have any effect.
    pruner=optuna.pruners.HyperbandPruner(),
    study_name='ExtraTrees-Hyperparameter-Tuning'
)

# Run the search on the current global `train` frame.
study.optimize(lambda trial: objective(trial,train), 
               n_trials=50)
print('Best trial : score {}, \nparams {}'.format(study.best_trial.value, study.best_trial.params))

## 4. GB

In [None]:
# An earlier variant scored a single stratified 80/20 hold-out split
# (train_test_split with test_size=0.2, random_state=7) instead of CV.

# Define the Optuna objective
def objective(trial: Trial, train):
    """Score one GradientBoostingClassifier configuration with 5-fold CV.

    Args:
        trial: Optuna trial used to sample ``n_estimators``,
            ``min_impurity_decrease``, ``max_depth``, ``max_features`` and
            ``subsample``.
        train: DataFrame holding the feature columns plus a ``Y_Class``
            target column.

    Returns:
        Mean macro-averaged F1 over the five validation folds.
    """
    param = {
        'verbose' : 0,
        'random_state': 7,
        'n_iter_no_change' : 50,
        'validation_fraction' : 0.1,
        'learning_rate' : 0.1,
        'n_estimators' : trial.suggest_int('n_estimators', 100, 2000, step=20),
        'min_impurity_decrease' : trial.suggest_float('min_impurity_decrease', 0, 0.2),
        'max_depth' : trial.suggest_int('max_depth', 2, 20),
        # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3; for
        # this classifier it meant sqrt(n_features), so 'sqrt' is equivalent.
        'max_features' : trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        'subsample' : trial.suggest_float('subsample', 0.7, 1.0, step=0.1)
    }
    cv_accuracy = []
    # NOTE(review): plain, unshuffled KFold here, whereas the other objectives
    # in this notebook use StratifiedKFold - confirm this is intended.
    cv = KFold(n_splits = 5)

    for t, v in cv.split(train, train['Y_Class']):
        train_cv = train.iloc[t]
        val_cv = train.iloc[v]

        X = train_cv.drop('Y_Class', axis=1)
        y = train_cv['Y_Class']

        val_X = val_cv.drop('Y_Class', axis=1)
        val_y = val_cv['Y_Class']

        # A fresh model per fold so no state leaks between folds.
        model = GradientBoostingClassifier(**param)
        model.fit(X, y)
        cv_accuracy.append(f1_score(val_y, model.predict(val_X), average = 'macro'))

    return np.mean(cv_accuracy)

# create study
study = optuna.create_study(
    direction='maximize',  # the objective returns a macro-F1 score
    sampler=optuna.samplers.TPESampler(seed=42),
    # NOTE(review): objective() never calls trial.report() / trial.should_prune(),
    # so this pruner does not appear to have any effect.
    pruner=optuna.pruners.HyperbandPruner(),
    study_name='GB-Hyperparameter-Tuning'
)

# Run the search on the current global `train` frame.
study.optimize(lambda trial: objective(trial,train), 
               n_trials=50)
print('Best trial : score {}, \nparams {}'.format(study.best_trial.value, study.best_trial.params))

In [None]:
# Refit with a hand-written configuration.
# NOTE(review): these values were presumably copied from an earlier tuning run - confirm their origin.
GB_model = GradientBoostingClassifier(random_state = 42, n_iter_no_change = 50, validation_fraction = 0.1, n_estimators = 1900, 
                                      learning_rate = 0.05, min_impurity_decrease = 0.07031308605404483, max_depth = 2)
GB_model.fit(X, y)
GB_pred = GB_model.predict(test)
# Predicted class distribution (displayed as the cell output).
pd.DataFrame(GB_pred).value_counts()

1    242
2     39
0     29
dtype: int64

In [9]:
# Refit on the full training data with the best sampled hyper-parameters.
# NOTE(review): the GB objective in this notebook tunes with learning_rate=0.1
# and random_state=7, while this refit uses 0.05 and 42 - confirm this is intended.
# NOTE(review): `study` is whichever study was run last in the session.
GB_model = GradientBoostingClassifier(**study.best_trial.params, random_state = 42, n_iter_no_change = 50, validation_fraction = 0.1, learning_rate = 0.05)
GB_model.fit(X, y)
GB_pred = GB_model.predict(test)
# Predicted class distribution, as a quick sanity check.
print(pd.DataFrame(GB_pred).value_counts())

# Write the predictions into the submission template and save it.
sample_submission['Y_Class'] = GB_pred
sample_submission.to_csv('./submit.csv', index=False)

1    241
2     39
0     30
dtype: int64
