## 1. 회귀

In [None]:
!pip install catboost
!pip install category_encoders

from catboost import CatBoostRegressor, Pool

In [None]:
# Train a CatBoost regressor with fixed hyperparameters and predict on the test features.
# NOTE(review): train_x, train_y and test_x are not defined in the visible cells — confirm they are prepared earlier.
cb_params = {
    "depth": 4,
    "bagging_temperature": 2.099,
    "learning_rate": 0.02091,
    "subsample": 0.2325,
}
CB = CatBoostRegressor(**cb_params)
CB.fit(train_x, train_y)
CB_pred = CB.predict(test_x)

In [None]:
import joblib
# Serialize the fitted CatBoost model with joblib.
# NOTE: the file carries an ".h5" suffix but is written by joblib, so it must be read back with joblib.load.
joblib.dump(CB, '폰번호_1.h5')

In [None]:
# Load the dumped model back from disk into `model`.
model= joblib.load('폰번호_1.h5')

In [None]:
# Read the test CSV into the frame that receives the predicted 'label' column.
test = pd.read_csv('test.csv')

In [None]:
# Store the CatBoost predictions in a 'label' column, then display the frame.
test['label'] = CB_pred
test

In [None]:
# Replace the numeric labels 0 and 1 with the strings 'ham' and 'spam'.
# NOTE(review): CB_pred comes from a regressor, so values that are not exactly 0 or 1
# would be left unchanged by this replace — confirm this step is intended here.
test['label']= test['label'].replace([0,1],['ham','spam'])

In [None]:
# Write the frame to CSV without the index column.
test.to_csv("폰번호_1.csv",index=False)

In [None]:
# Answer — files produced for task 1 (kept as comments so the cell runs without a NameError):
#   폰번호_1.h5
#   폰번호_1.csv
#   폰번호_1.ipynb

## 2. 텍스트 다중분류

In [None]:
!pip install rich

In [None]:
import pandas as pd
import numpy as np
import os
import re
import rich  # 출력을 예쁘게 꾸며주는 라이브러리
from rich.table import Table

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import StackingClassifier
from tqdm.auto import tqdm

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [None]:
# Load the training data and the test texts from CSV.
train_df = pd.read_csv('spam.csv')
test_df = pd.read_csv('spam_test_text.csv')

In [None]:
# Encode the string labels as integers (ham -> 0, spam -> 1).
# The explicit int cast keeps the column numeric instead of relying on pandas silently
# downcasting the object column after the replace, and raises on any unexpected label.
# Using replace (not map) keeps the cell safe to re-run on already-encoded labels.
train_df['label'] = train_df['label'].replace({'ham': 0, 'spam': 1}).astype('int64')

In [None]:
def get_pipe(model, model_name: str) -> Pipeline:
    """Return a pipeline that feeds character-level TF-IDF features into ``model``.

    Args:
        model: Estimator placed as the final pipeline step.
        model_name: Step name under which ``model`` is registered.

    Returns:
        A ``Pipeline`` whose first step, "tfidf", is a ``TfidfVectorizer`` over
        character 1- to 3-grams and whose second step is ``model``.
    """
    steps = [
        ("tfidf", TfidfVectorizer(analyzer="char", ngram_range=(1, 3))),
        (model_name, model),
    ]
    return Pipeline(steps)

In [None]:
def return_kfold_accuarcy(model, k: int = 5) -> float:
    """Return the mean accuracy of ``model`` over a stratified k-fold split of ``train_df``.

    The model is refit in place on each training fold (so it is left fitted on the
    last fold) and scored on the matching held-out fold.

    Args:
        model: Estimator with ``fit``/``predict`` that accepts the raw "text" column.
        k: Number of folds; the split is shuffled with a fixed seed of 42.

    Returns:
        The average of the per-fold accuracy scores.
    """
    splitter = StratifiedKFold(k, shuffle=True, random_state=42)
    texts, labels = train_df["text"], train_df["label"]

    fold_scores = []
    for fit_idx, holdout_idx in splitter.split(texts, labels):
        model.fit(texts.iloc[fit_idx], labels.iloc[fit_idx])
        holdout_pred = model.predict(texts.iloc[holdout_idx])
        fold_scores.append(accuracy_score(labels.iloc[holdout_idx], holdout_pred))

    return np.mean(fold_scores)

In [None]:
# Candidate classifiers to compare, as (name, estimator) pairs.
models = [
    ("naive_bayes", BernoulliNB()),
    ("SGD", SGDClassifier(random_state=42, n_jobs=-1)),
]

# Wrap every candidate in its own TF-IDF pipeline, keeping the name alongside.
model_pipes = [
    (label, get_pipe(estimator, label))
    for label, estimator in models
]

In [None]:
# Disabled alternative: a wider list of candidate classifiers for the same comparison.
# Uncomment to replace the shorter `models` / `model_pipes` definitions.
# models = [
#     ("naive_bayes", BernoulliNB()),
#     ("SGD", SGDClassifier(random_state=42, n_jobs=-1)),
#     ("rfc", RandomForestClassifier(random_state=42, n_jobs=-1)),
#     ("SVC", SVC(random_state=42)),
#     ("ada", AdaBoostClassifier(random_state=42)),
#     ("lgbm", LGBMClassifier(random_state=42)),
#     ("lgbm2", LGBMClassifier(n_estimators=80, random_state=42)),
#     ("xgb", XGBClassifier(random_state=42)),
#     ("knc1", KNeighborsClassifier()),
#     ("knc2", KNeighborsClassifier(n_neighbors=4))
# ]

# model_pipes = [(name, get_pipe(model, name)) for name, model in models]

In [None]:
# Score every candidate pipeline with k-fold CV and render the results as a rich table.
table = Table(title="Model Comparison Table")
for _header, _column_opts in (
    ("Model Name", {"justify": "left", "style": "green"}),
    ("Accuracy", {"justify": "right"}),
):
    table.add_column(_header, **_column_opts)

# One row per model: its name and mean accuracy to three decimals.
for model_name, model in tqdm(model_pipes, leave=False):
    acc = return_kfold_accuarcy(model)
    table.add_row(model_name, f"{acc:0.3f}")

rich.print(table)

In [None]:
from sklearn.ensemble import StackingClassifier

# Build a fresh TF-IDF pipeline per candidate and combine them in a stacking ensemble.
stack_models = [
    (label, get_pipe(estimator, label))
    for label, estimator in models
]

stacking = StackingClassifier(estimators=stack_models)

# Report the ensemble's mean k-fold accuracy.
acc = return_kfold_accuarcy(stacking)
rich.print(acc)

In [None]:
# Refit the stacking ensemble on the full training set, then predict labels for the test texts.
submission_pred = stacking.fit(train_df['text'], train_df['label']).predict(test_df['text'])

In [None]:
# Read the submission template that receives the predicted 'label' column.
test = pd.read_csv('spam_submission.csv')

In [None]:
# Store the stacking predictions in the 'label' column, then display the frame.
test['label'] = submission_pred
test

In [None]:
# Convert the integer predictions back to their string labels (0 -> 'ham', 1 -> 'spam').
test['label']= test['label'].replace([0,1],['ham','spam'])

In [None]:
# Write the labelled submission to CSV without the index column.
test.to_csv("폰번호_2.csv",index=False)

In [None]:
import joblib
# Serialize the fitted stacking ensemble with joblib.
# NOTE: the file carries an ".h5" suffix but is written by joblib, so it must be read back with joblib.load.
joblib.dump(stacking, '폰번호_2.h5')

In [None]:
# Load the dumped stacking model back from disk into `model`.
model= joblib.load('폰번호_2.h5')

In [None]:
# Answer — files produced for task 2 (kept as comments so the cell runs without a NameError):
#   폰번호_2.h5
#   폰번호_2.csv
#   폰번호_2.ipynb

## 3. 이미지 이진분류

In [None]:
import pandas as pd
import numpy as np
import keras
import glob
import matplotlib.pyplot as plt
import scipy
import seaborn as sns
#from mlxtend.preprocessing import minmax_scaling
from sklearn.metrics import roc_curve, auc

from keras.utils.np_utils import to_categorical
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D, GlobalAveragePooling2D, Input, BatchNormalization, Multiply, Activation
# from keras.optimizers import RMSprop, SGD
from keras.regularizers import l2
from keras.preprocessing.image import ImageDataGenerator
# from keras.utils import plot_model
from keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import confusion_matrix
from keras import backend as K
from tensorflow.keras.backend import clear_session

import os

In [None]:
from keras.preprocessing.image import ImageDataGenerator

batch_size = 5
img_height = 960
img_width = 720

# Training generator: rescale to [0, 1] and apply random augmentation
# (flips, rotation, brightness). 20% of the images are reserved for validation.
train_datagen = ImageDataGenerator(
    rescale=1. / 255,
    validation_split=0.2,
    horizontal_flip=True,
    vertical_flip=True,
    rotation_range=180,
    brightness_range=(0.2, 0.8),
)

# Evaluation generator: rescale only. Validation and test images must not be randomly
# augmented, otherwise the measured loss/accuracy and the predictions vary between runs.
# It uses the same validation_split so that subset='validation' selects the same files
# that train_datagen excludes from subset='training'.
eval_datagen = ImageDataGenerator(
    rescale=1. / 255,
    validation_split=0.2,
)

# Training batches (augmented).
train_generator = train_datagen.flow_from_directory(
    '03_clean_desk/train',
    batch_size=batch_size,
    target_size=(img_height, img_width),
    class_mode='categorical',
    subset='training',
)

# Validation batches (not augmented).
validation_generator = eval_datagen.flow_from_directory(
    '03_clean_desk/train',
    batch_size=batch_size,
    target_size=(img_height, img_width),
    class_mode='categorical',
    subset='validation',
)

# Test batches: no subset, so every image under the test directory is used.
# - RGB (the default color mode) so batches match the model's 3-channel input;
#   grayscale batches would have a single channel.
# - shuffle=False so predictions come back in the generator's file order.
# NOTE(review): class_mode='categorical' expects one sub-directory per class under
# '03_clean_desk/test' — confirm the test folder is laid out that way.
test_generator = eval_datagen.flow_from_directory(
    '03_clean_desk/test',
    batch_size=batch_size,
    target_size=(img_height, img_width),
    class_mode='categorical',
    shuffle=False,
)

In [None]:
# 1. Reset the Keras session state before building a new model.
clear_session()


def _conv_relu(filters, kernel_size, **extra):
    """Build a 'Same'-padded, ReLU-activated Conv2D layer with he_normal initialisation."""
    return Conv2D(
        filters=filters,
        kernel_size=kernel_size,
        padding='Same',
        activation='relu',
        kernel_initializer='he_normal',
        **extra,
    )


# 2-3. Assemble the network: four pairs of convolutions (32, 64, 128, 256 filters);
# the first three pairs are followed by max pooling and dropout, the last by global
# average pooling, then a 512-unit dense layer and a 2-way softmax output.
model = Sequential([
    # Stride-2 convolutions on the 960x720 RGB input.
    _conv_relu(32, (5, 5), strides=2, input_shape=(960, 720, 3)),
    _conv_relu(32, (5, 5), strides=2),
    MaxPool2D(pool_size=(2, 2)),
    Dropout(0.2),
    _conv_relu(64, (3, 3)),
    _conv_relu(64, (3, 3)),
    MaxPool2D(pool_size=(2, 2)),
    Dropout(0.2),
    _conv_relu(128, (2, 2)),
    _conv_relu(128, (2, 2)),
    MaxPool2D(pool_size=(2, 2)),
    Dropout(0.2),
    _conv_relu(256, (2, 2)),
    _conv_relu(256, (2, 2)),
    GlobalAveragePooling2D(),
    Dense(512, activation="relu", kernel_initializer='he_normal'),
    Dropout(0.2),
    # Output layer with L2 weight regularisation at the regularizer's default strength.
    Dense(2, activation="softmax", kernel_initializer='he_normal', kernel_regularizer=l2()),
])
model.summary()

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Checkpoint: save the weights to 'my_ck.h5' whenever val_loss improves.
cp = ModelCheckpoint('my_ck.h5', monitor='val_loss', verbose=1, save_weights_only=True, save_best_only=True)

# Early stopping: stop after 5 epochs without val_loss improvement and restore the best weights.
es = EarlyStopping(monitor='val_loss', min_delta=0, patience=5, verbose=1, restore_best_weights=True)

# Train the model.
# batch_size is intentionally not passed: the generators already yield batches, and
# Keras documents that batch_size must not be specified for generator input.
history = model.fit(
    train_generator,
    steps_per_epoch=20,
    epochs=200,
    validation_data=validation_generator,
    validation_steps=10,
    callbacks=[cp, es],
)

In [None]:
from keras.models import load_model

# Save the trained model to disk.
model.save('폰번호_3.h5')

In [None]:
# Load the saved model back from disk into `m2`.
m2 = load_model('폰번호_3.h5')

In [None]:
# classification_report is used below but is not imported by the notebook's import
# cells, so bring it into scope here.
from sklearn.metrics import classification_report

# Predict & validate.
# NOTE(review): x_val2, y_val and class_names are not defined in the visible cells —
# confirm they are created elsewhere before this cell runs.
pred = model.predict(x_val2)

# Turn the per-class scores into predicted class indices.
p1 = pred.argmax(axis=1)

# Array of class names so integer indices can be mapped to readable labels.
cn = np.array(class_names)

print(accuracy_score(y_val,p1))
print('-'*60)
print(confusion_matrix(y_val, p1))
print('-'*60)
print(classification_report(cn[y_val], cn[p1]))

In [None]:
# Read the submission template that receives the predicted 'label' column.
# NOTE(review): this is the same 'spam_submission.csv' used in the text task — confirm
# it is the right template for the image task.
test = pd.read_csv('spam_submission.csv')

In [None]:
# `pred` holds one score per class for every sample (2-D), which cannot be stored in a
# single column; keep the index of the highest-scoring class as the label instead.
# NOTE(review): `pred` was computed on x_val2 — confirm its length matches this frame.
test['label'] = pred.argmax(axis=1)
test

In [None]:
# Write the labelled submission to CSV without the index column.
test.to_csv("폰번호_3.csv",index=False)

In [None]:
# Read the written CSV back and display it.
pd.read_csv("폰번호_3.csv")

In [None]:
# Answer — files produced for task 3 (kept as comments so the cell runs without a NameError):
#   폰번호_3.h5
#   폰번호_3.csv
#   폰번호_3.ipynb