# IMPORT

In [None]:
!pip install ujson

Collecting ujson
  Downloading ujson-5.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.3 kB)
Downloading ujson-5.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (53 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/53.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.6/53.6 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ujson
Successfully installed ujson-5.10.0


In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:
import json

import scipy as sp
import pandas as pd
import numpy as np

from functools import partial
from math import sqrt

from sklearn.metrics import cohen_kappa_score, mean_squared_error
from sklearn.metrics import confusion_matrix as sk_cmatrix
from sklearn.model_selection import StratifiedKFold

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

from collections import Counter

import lightgbm as lgb
np.random.seed(369) # 랜덤성 고정

## Model

In [None]:
# The following 3 functions have been taken from Ben Hamner's github repository
# https://github.com/benhamner/Metrics

# Build the confusion matrix from the grades given by rater a and rater b.
# QWK measures the disagreement between the true rater and the predicted rater
# as a distance between grades, so we first need to know how often each pair of
# grades was confused with each other -- hence the confusion matrix.
def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None):
    """Count how often rater_a's grade i coincides with rater_b's grade j.

    Args:
        rater_a: sequence of integer grades (list, tuple or 1-D numpy array).
        rater_b: sequence of integer grades, same length as ``rater_a``.
        min_rating: smallest possible grade; defaults to the smallest grade
            found in either sequence.
        max_rating: largest possible grade; defaults to the largest grade
            found in either sequence.

    Returns:
        A ``num_ratings x num_ratings`` list of lists where entry ``[i][j]`` is
        the number of items graded ``min_rating + i`` by rater_a and
        ``min_rating + j`` by rater_b.
    """
    assert len(rater_a) == len(rater_b)
    # Take min/max over both raters separately: `rater_a + rater_b` only
    # concatenates for lists; for numpy arrays it adds element-wise and would
    # yield wrong bounds.
    if min_rating is None:
        min_rating = min(min(rater_a), min(rater_b))
    if max_rating is None:
        max_rating = max(max(rater_a), max(rater_b))
    num_ratings = int(max_rating - min_rating + 1)
    conf_mat = [[0] * num_ratings for _ in range(num_ratings)]
    for a, b in zip(rater_a, rater_b):
        conf_mat[a - min_rating][b - min_rating] += 1
    return conf_mat

# Count how many times each grade occurs.
def histogram(ratings, min_rating=None, max_rating=None):
    """Return per-grade counts for ``ratings``.

    Position ``k`` of the returned list holds the number of items whose grade
    is ``min_rating + k``. Bounds that are not supplied are taken from the
    smallest / largest grade present in ``ratings``.
    """
    lowest = min(ratings) if min_rating is None else min_rating
    highest = max(ratings) if max_rating is None else max_rating
    counts = [0] * int(highest - lowest + 1)
    for grade in ratings:
        counts[grade - lowest] += 1
    return counts

# Quadratic Weighted Kappa (QWK) score.
# Measures the agreement between the integer grades assigned by two raters
# (or between predictions and ground truth).
# Range: -1 (complete disagreement) / 0 (chance-level agreement) / 1 (perfect agreement)
def quadratic_weighted_kappa(y, y_pred):
    """Compute the quadratic weighted kappa between ``y`` and ``y_pred``.

    Both inputs are converted with ``np.array(..., dtype=int)``, so float
    values are truncated to integers before scoring.

    Args:
        y: true grades.
        y_pred: predicted grades, same length as ``y``.

    Returns:
        ``1 - observed_weighted_disagreement / expected_weighted_disagreement``
        as a float. When both inputs contain one and the same single grade the
        weights are undefined (0/0); that case is reported as perfect
        agreement, ``1.0``, instead of raising ``ZeroDivisionError``.
    """
    rater_a = np.array(y, dtype=int)
    rater_b = np.array(y_pred, dtype=int)
    assert len(rater_a) == len(rater_b)
    min_rating = min(min(rater_a), min(rater_b))
    max_rating = max(max(rater_a), max(rater_b))
    conf_mat = confusion_matrix(rater_a, rater_b,
                                min_rating, max_rating)
    num_ratings = len(conf_mat)
    if num_ratings == 1:
        # Every item received the same grade from both raters.
        return 1.0
    num_scored_items = float(len(rater_a))

    hist_rater_a = histogram(rater_a, min_rating, max_rating)
    hist_rater_b = histogram(rater_b, min_rating, max_rating)

    numerator = 0.0
    denominator = 0.0

    for i in range(num_ratings):
        for j in range(num_ratings):
            # Count expected for cell (i, j) if the two raters were independent.
            expected_count = (hist_rater_a[i] * hist_rater_b[j]
                              / num_scored_items)
            # Quadratic penalty: squared grade distance, normalised to [0, 1].
            d = pow(i - j, 2.0) / pow(num_ratings - 1, 2.0)
            numerator += d * conf_mat[i][j] / num_scored_items
            denominator += d * expected_count / num_scored_items

    return (1.0 - numerator / denominator)

**QWK 사용 이유**
- `AdoptionSpeed`는 0, 1, 2, 3, 4 같은 정수형 등급이지만, 단순한 범주형(classification)이 아니라 **순서가 있는 클래스(ordinal class)**
- 0 vs 4의 차이와 2 vs 3의 차이는 다르게 평가되어야 함
--> **등급 간 거리까지 반영해서 평가하는 QWK** 필요

| 비교 기준                              | 설명                                    |
| ---------------------------------- | ------------------------------------- |
| **Accuracy, RMSE**                 | Accuracy는 맞췄는지 틀렸는지만 보고, RMSE는 오차 크기는 반영하지만 우연히 일치할 확률은 보정하지 않음 |
| **Quadratic Weighted Kappa (QWK)** | 틀렸더라도 **얼마나 틀렸는지**를 중요하게 평가함 (거리 기반!) |


In [None]:
# OptimizedRounder
# Converts the continuous predictions of a regression model into integer grades (classes).
# Its core purpose is to find the cut-off values (coef) that maximise the QWK score.
class OptimizedRounder(object):
    """Learn four cut-offs that turn continuous predictions into grades 0-4.

    ``fit`` searches the cut-offs with Nelder-Mead, starting from
    ``[0.5, 1.5, 2.5, 3.5]`` and minimising the negative quadratic weighted
    kappa. ``predict`` applies any four cut-offs to a set of predictions.
    """

    def __init__(self):
        # Placeholder; replaced by the scipy optimisation result in fit().
        self.coef_ = 0

    @staticmethod
    def _apply_thresholds(X, coef):
        """Map each value of ``X`` to a grade using the first cut-off it is below.

        The conditions are evaluated in order (``x < coef[0]`` -> 0,
        ``x < coef[1]`` -> 1, ...), and anything matching none of them
        (including NaN) becomes 4. This ordered evaluation also covers
        cut-offs that are not sorted, which the optimiser can produce.
        The result is a new array with the dtype of ``X``; ``X`` is not modified.
        """
        values = np.asarray(X)
        below = [values < coef[0], values < coef[1],
                 values < coef[2], values < coef[3]]
        grades = np.select(below, [0, 1, 2, 3], default=4)
        return grades.astype(values.dtype)

    def _kappa_loss(self, coef, X, y):
        """Negative QWK of ``y`` against ``X`` graded with ``coef`` (to be minimised)."""
        X_p = self._apply_thresholds(X, coef)
        ll = quadratic_weighted_kappa(y, X_p)
        return -ll

    def fit(self, X, y):
        """Search the cut-offs that maximise QWK between ``y`` and graded ``X``.

        The full scipy result object is stored in ``self.coef_``.
        """
        loss_partial = partial(self._kappa_loss, X=X, y=y)
        initial_coef = [0.5, 1.5, 2.5, 3.5]
        self.coef_ = sp.optimize.minimize(loss_partial, initial_coef, method='nelder-mead')

    def predict(self, X, coef):
        """Return the grades of ``X`` under the four cut-offs in ``coef``."""
        return self._apply_thresholds(X, coef)

    def coefficients(self):
        """Return the fitted cut-offs (the ``'x'`` entry of the result stored by ``fit``)."""
        return self.coef_['x']

**OptimizedRounder 클래스의 흐름**
1. 모델은 회귀 방식으로 예측하는데, 실제 target은 정수
2. `OptimizedRounder.fit`을 통해 QWK가 최대가 되는 cut-off를 찾음
3. `predict`로 등급 분류

| 기능               | 설명                         |
| ---------------- | -------------------------- |
| `fit()`          | QWK가 최대가 되도록 cut-off 값 최적화 |
| `predict()`      | 예측값을 등급으로 바꿈               |
| `coefficients()` | 찾은 cut-off 값 반환            |


In [None]:
# RMSE (Root Mean Squared Error)
# Expresses the typical size of the error in the original unit of the target;
# the smaller the value, the closer the predictions are to the actual values.
# A commonly used error metric for evaluating predictive performance.
def rmse(actual, predicted):
    """Return the square root of the mean squared error of ``predicted`` vs ``actual``."""
    mse = mean_squared_error(actual, predicted)
    return sqrt(mse)

# DATA

In [None]:
# Colab only: mount Google Drive at /content/drive so the dataset files stored
# under /content/drive/MyDrive/... can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Label lookup tables shipped with the dataset (breed / color / state label CSVs).
breeds = pd.read_csv('/content/drive/MyDrive/dartb/2025_S/petfinder-adoption-prediction/breed_labels.csv')
colors = pd.read_csv('/content/drive/MyDrive/dartb/2025_S/petfinder-adoption-prediction/color_labels.csv')
states = pd.read_csv('/content/drive/MyDrive/dartb/2025_S/petfinder-adoption-prediction/state_labels.csv')

In [None]:
# Main tabular data: one row per pet. `train` includes the AdoptionSpeed target, `test` does not.
train = pd.read_csv('/content/drive/MyDrive/dartb/2025_S/petfinder-adoption-prediction/train/train.csv')
test = pd.read_csv('/content/drive/MyDrive/dartb/2025_S/petfinder-adoption-prediction/test/test.csv')

In [None]:
# Sanity check: (rows, columns) of the raw train and test frames.
print(train.shape, test.shape)

(14993, 24) (3972, 23)


## 타겟변수 분리 및 불필요한 컬럼 제거

In [None]:
# Model training needs the inputs (X) separated from the output (y), so the
# target 'AdoptionSpeed' and the identifier 'PetID' (not useful as a feature)
# are taken out of the frames and kept in their own variables.
target = train.pop('AdoptionSpeed')  # target variable
train_id = train.pop('PetID')
test_id = test.pop('PetID')

Google Cloud Natural Language API를 통해 미리 분석된 감성 분석(sentiment analysis) 결과를, 각 반려동물의 PetID를 기준으로 train과 test 데이터에 **새로운 피처(feature)**로 추가

## 외부 데이터 병합

In [None]:
# Root of the PetFinder dataset on the mounted Drive (the directory the CSVs are read from).
# The previous '../input/...' paths are the Kaggle layout and do not exist in this Colab
# session, so every lookup fell into the FileNotFoundError branch without any visible sign.
# NOTE(review): assumes train_sentiment/ and test_sentiment/ sit directly under this
# directory, as in the competition download -- confirm against the Drive contents.
SENTIMENT_ROOT = '/content/drive/MyDrive/dartb/2025_S/petfinder-adoption-prediction'


def load_sentiment_features(pet_ids, sentiment_dir):
    """Read the document-level sentiment of every pet from ``<sentiment_dir>/<PetID>.json``.

    Args:
        pet_ids: iterable of PetID strings, in the row order of the target frame.
        sentiment_dir: directory containing one sentiment JSON file per pet.

    Returns:
        ``(magnitudes, scores, missing)``: two lists aligned with ``pet_ids``
        holding ``documentSentiment.magnitude`` and ``documentSentiment.score``,
        and the number of pets whose file does not exist. Missing files are
        filled with -1 in both lists.
        NOTE(review): -1 may also be a legitimate sentiment score, so a missing
        file and a fully negative description are not distinguishable here.
    """
    magnitudes = []
    scores = []
    missing = 0
    for pet in pet_ids:
        try:
            with open(sentiment_dir + '/' + pet + '.json', 'r') as f:
                doc_sentiment = json.load(f)['documentSentiment']
        except FileNotFoundError:
            missing += 1
            magnitudes.append(-1)
            scores.append(-1)
        else:
            magnitudes.append(doc_sentiment['magnitude'])
            scores.append(doc_sentiment['score'])
    return magnitudes, scores, missing


doc_sent_mag, doc_sent_score, nf_count = load_sentiment_features(
    train_id, SENTIMENT_ROOT + '/train_sentiment')
# Make missing files visible instead of silently training on -1 everywhere.
print('train sentiment files not found:', nf_count, '/', len(train_id))
train.loc[:, 'doc_sent_mag'] = doc_sent_mag
train.loc[:, 'doc_sent_score'] = doc_sent_score

doc_sent_mag, doc_sent_score, nf_count = load_sentiment_features(
    test_id, SENTIMENT_ROOT + '/test_sentiment')
print('test sentiment files not found:', nf_count, '/', len(test_id))
test.loc[:, 'doc_sent_mag'] = doc_sent_mag
test.loc[:, 'doc_sent_score'] = doc_sent_score

코드 기능 요약
1. `train_sentiment`와 `test_sentiment/` 폴더에 있는 .json 파일들을 읽는다. 각 파일 이름은 PetID.json 형식, 내용은 감성 분석 결과 (score, magnitude)
2. 파일이 존재하면: `documentSentiment` 값 중 `magnitude`와 `score`를 리스트에 저장
3. 파일이 없으면: 감성 분석 결과가 없는 경우를 위해 -1을 넣는다 (결측값 처리의 일환)
4. 마지막으로, `train`과 `test` 데이터프레임에 각각 새로운 컬럼 추가
→`doc_sent_mag`, `doc_sent_score`

- `doc_sent_mag`: 감정의 강도
- `doc_sent_score`: 감정의 극성(-1: 부정, 0: 중립, 1: 긍정)

## 텍스트 데이터 처리

train과 test 데이터의 Description(반려동물 설명 텍스트)을 TF-IDF + SVD(차원축소) 기법을 이용해 **수치형 피처로 변환하는 전처리 과정** 실시

-> **텍스트 데이터를 모델이 학습할 수 있는 형태의 벡터로 변환**하는 중요한 단계

In [None]:
# Fill missing descriptions and prepare the text arrays.
# NOTE: this cell only prints shapes -- the code that would attach the SVD
# features to train/test is commented out below, and the following cell
# reassigns tfv, svd, X and X_test with different settings.
train_desc = train.Description.fillna("none").values
test_desc = test.Description.fillna("none").values

tfv = TfidfVectorizer(min_df=2,  max_features=None,
        strip_accents='unicode', analyzer='word', token_pattern=r'(?u)\b\w+\b',
        ngram_range=(1, 3), use_idf=True, smooth_idf=True, sublinear_tf=True,
        )

# TF-IDF vectorisation
# TfidfVectorizer turns the text into a high-dimensional sparse vector.
# ngram_range=(1,3) -> uni-, bi- and tri-gram word patterns are considered.
# The vocabulary is fitted on the train descriptions only.
tfv.fit(list(train_desc))
X =  tfv.transform(train_desc)
X_test = tfv.transform(test_desc)
print("X (tfidf):", X.shape)

# Dimensionality reduction with SVD (TruncatedSVD)
# Reducing to 200 dimensions is intended to speed up training and limit overfitting.
svd = TruncatedSVD(n_components=200)
svd.fit(X)
# print(svd.explained_variance_ratio_.sum())
# print(svd.explained_variance_ratio_)
X = svd.transform(X)
print("X (svd):", X.shape)

# X = pd.DataFrame(X, columns=['svd_{}'.format(i) for i in range(120)])
# train = pd.concat((train, X), axis=1)
# X_test = svd.transform(X_test)
# X_test = pd.DataFrame(X_test, columns=['svd_{}'.format(i) for i in range(120)])
# test = pd.concat((test, X_test), axis=1)

print("train:", train.shape)

X (tfidf): (14993, 181602)
X (svd): (14993, 200)
train: (14993, 24)


TfidfVectorizer를 선언할 때 `use_idf`, `smooth_idf`, `sublinear_tf` 같은 옵션은 bool 타입(`True`/`False`)으로 넘겨야 함

텍스트 형태인 Description 컬럼을 모델이 사용할 수 있는 수치 피처로 바꾸기 위해 TF-IDF → SVD 방식으로 인코딩하는 것!

In [None]:
# Description -> TF-IDF -> TruncatedSVD; the components are appended to train/test
# as columns svd_0 .. svd_{N_SVD_COMPONENTS - 1}.
N_SVD_COMPONENTS = 120  # single source of truth for the number of SVD features
svd_cols = ['svd_{}'.format(i) for i in range(N_SVD_COMPONENTS)]

train_desc = train.Description.fillna("none").values
test_desc = test.Description.fillna("none").values

tfv = TfidfVectorizer(min_df=3,  max_features=10000,
        strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
        ngram_range=(1, 3), use_idf=True, smooth_idf=True, sublinear_tf=True,
        stop_words = 'english')

# Fit TF-IDF on the train descriptions only, then transform both sets.
tfv.fit(list(train_desc))
X = tfv.transform(train_desc)
X_test = tfv.transform(test_desc)
print("X (tfidf):", X.shape)

# Fit the SVD on train only and project both sets with it.
svd = TruncatedSVD(n_components=N_SVD_COMPONENTS)
svd.fit(X)
# print(svd.explained_variance_ratio_.sum())
# print(svd.explained_variance_ratio_)
# Reuse the frames' own indexes so the column-wise concat below cannot misalign rows.
X = pd.DataFrame(svd.transform(X), columns=svd_cols, index=train.index)
print("X (svd):", X.shape)
X_test = pd.DataFrame(svd.transform(X_test), columns=svd_cols, index=test.index)

# Drop svd columns left over from an earlier run of this cell so that re-running
# it replaces the features instead of appending duplicate columns.
train = pd.concat((train.drop(columns=svd_cols, errors='ignore'), X), axis=1)
test = pd.concat((test.drop(columns=svd_cols, errors='ignore'), X_test), axis=1)

print("train:", train.shape)

X (tfidf): (14993, 10000)
X (svd): (14993, 120)
train: (14993, 144)


**수정 전 코드와 후 코드 차이**

| 항목              | 이전         | 현재          | 의미                                   |
| --------------- | ---------- | ----------- | ------------------------------------ |
| `min_df`        | 2          | 3           | 최소 3개 이상 문서에 등장한 단어만 사용 (더 많은 잡음 제거) |
| `max_features`  | 없음 (전부 사용) | 10,000      | 상위 10,000개의 중요 단어만 사용 (차원 축소)        |
| `stop_words`    | 없음         | `'english'` | "the", "and", "is" 같은 불용어 제거         |
| `token_pattern` | `(?u)\b\w+\b`  | `\w{1,}`    | 표기만 다를 뿐 사실상 같은 토큰(연속된 단어 문자)을 추출 |


PetFinder 대회에서 제공하는 이미지의 Metadata JSON 파일을 불러와서, 거기서 유용한 **이미지 기반 피처(image features)**를 추출해 train과 test 데이터프레임에 추가하는 작업

In [None]:
# Root of the PetFinder dataset on the mounted Drive (the directory the CSVs are read from).
# The previous '../input/...' paths are the Kaggle layout; in this Colab session none of the
# files were found, so every image feature was filled with the -1 / 'nothing' placeholder.
# NOTE(review): assumes train_metadata/ and test_metadata/ sit directly under this
# directory, as in the competition download -- confirm against the Drive contents.
METADATA_ROOT = '/content/drive/MyDrive/dartb/2025_S/petfinder-adoption-prediction'

# Output columns, in the order they are added to the frames.
IMAGE_FEATURE_COLUMNS = [
    'vertex_x', 'vertex_y', 'bounding_confidence', 'bounding_importance',
    'dominant_blue', 'dominant_green', 'dominant_red',
    'dominant_pixel_frac', 'dominant_score',
    'label_description', 'label_score',
]


def extract_image_features(pet_ids, metadata_dir):
    """Extract features from ``<metadata_dir>/<PetID>-1.json`` for every pet.

    The JSON files hold image annotations precomputed with the Google Cloud
    Vision API. Only the first crop hint, the first dominant color and the
    first label annotation of each file are used.

    Args:
        pet_ids: iterable of PetID strings, in the row order of the target frame.
        metadata_dir: directory containing the metadata JSON files.

    Returns:
        ``(features, nf_count, nl_count)`` where ``features`` maps each name in
        ``IMAGE_FEATURE_COLUMNS`` to a list aligned with ``pet_ids``,
        ``nf_count`` is the number of pets without a metadata file and
        ``nl_count`` the number of files without label annotations.
        Missing values are -1, or 'nothing' for ``label_description``.
    """
    features = {name: [] for name in IMAGE_FEATURE_COLUMNS}
    nf_count = 0
    nl_count = 0
    for pet in pet_ids:
        try:
            with open(metadata_dir + '/' + pet + '-1.json', 'r') as f:
                data = json.load(f)
        except FileNotFoundError:
            # No metadata file for this pet: fill every feature with its placeholder.
            nf_count += 1
            for name in IMAGE_FEATURE_COLUMNS:
                features[name].append('nothing' if name == 'label_description' else -1)
            continue

        crop_hint = data['cropHintsAnnotation']['cropHints'][0]
        corner = crop_hint['boundingPoly']['vertices'][2]
        top_color = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]

        features['vertex_x'].append(corner['x'])
        features['vertex_y'].append(corner['y'])
        features['bounding_confidence'].append(crop_hint['confidence'])
        features['bounding_importance'].append(crop_hint.get('importanceFraction', -1))
        features['dominant_blue'].append(top_color['color']['blue'])
        features['dominant_green'].append(top_color['color']['green'])
        features['dominant_red'].append(top_color['color']['red'])
        features['dominant_pixel_frac'].append(top_color['pixelFraction'])
        features['dominant_score'].append(top_color['score'])

        labels = data.get('labelAnnotations')
        if labels:
            features['label_description'].append(labels[0]['description'])
            features['label_score'].append(labels[0]['score'])
        else:
            nl_count += 1
            features['label_description'].append('nothing')
            features['label_score'].append(-1)
    return features, nf_count, nl_count


# NOTE: label_description is free text, so values may occur in test that never occur
# in train; any encoder applied to this column later has to cope with that.
image_features, nf_count, nl_count = extract_image_features(
    train_id, METADATA_ROOT + '/train_metadata')
print('train metadata files not found:', nf_count, '/', len(train_id))
print('train files without label annotations:', nl_count)
for name, values in image_features.items():
    train.loc[:, name] = values

image_features, nf_count, nl_count = extract_image_features(
    test_id, METADATA_ROOT + '/test_metadata')
print('test metadata files not found:', nf_count, '/', len(test_id))
print('test files without label annotations:', nl_count)
for name, values in image_features.items():
    test.loc[:, name] = values

14993
0
3972


## Features 정리

In [None]:
# Remove the raw text/identifier columns that are not fed to the model.
train = train.drop(columns=['Name', 'RescuerID', 'Description'])
test = test.drop(columns=['Name', 'RescuerID', 'Description'])

- `Name`, `RescuerID`는 고유값이라 제외, `Description`은 텍스트 기반 피처로 변환해서 새 컬럼으로 넣었기 때문에 중복되어서 제외

## 수치형/범주형 변수 분리 및 변환

모델 학습을 위한 데이터 타입 정리 단계

In [None]:
# Names of the numeric columns.
# 'AdoptionSpeed' is listed although it was already removed from `train`; that is
# harmless because the list is only used in the set difference below.
numeric_cols = ['Age', 'Quantity', 'Fee', 'VideoAmt', 'PhotoAmt', 'AdoptionSpeed', 'doc_sent_mag', 'doc_sent_score', 'dominant_score', 'dominant_pixel_frac', 'dominant_red', 'dominant_green', 'dominant_blue', 'bounding_importance', 'bounding_confidence', 'vertex_x', 'vertex_y', 'label_score'] + ['svd_{}'.format(i) for i in range(120)]

# Categorical columns are derived automatically: every column that is not numeric.
cat_cols = list(set(train.columns) - set(numeric_cols))

# Set the dtype of the categorical columns to 'category'.
# NOTE(review): assigning through `.loc[:, cols] = ...` may write the values into the
# existing columns and keep their original dtype (depends on the pandas version), in
# which case nothing becomes 'category' -- verify with `train.dtypes` before relying on it.
train.loc[:, cat_cols] = train[cat_cols].astype('category')
test.loc[:, cat_cols] = test[cat_cols].astype('category')

print(train.shape)
print(test.shape)
train.head()

(14993, 152)
(3972, 152)


Unnamed: 0,Type,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,FurLength,...,vertex_y,bounding_confidence,bounding_importance,dominant_blue,dominant_green,dominant_red,dominant_pixel_frac,dominant_score,label_description,label_score
0,2,3,299,0,1,1,7,0,1,1,...,-1,-1,-1,-1,-1,-1,-1,-1,nothing,-1
1,2,1,265,0,1,1,2,0,2,2,...,-1,-1,-1,-1,-1,-1,-1,-1,nothing,-1
2,1,1,307,0,1,2,7,0,2,2,...,-1,-1,-1,-1,-1,-1,-1,-1,nothing,-1
3,1,4,307,0,2,1,2,0,2,1,...,-1,-1,-1,-1,-1,-1,-1,-1,nothing,-1
4,1,1,307,0,1,1,0,0,2,1,...,-1,-1,-1,-1,-1,-1,-1,-1,nothing,-1


LightGBM에서 범주형 변수를 인식시키기 위해, 범주형 변수의 인덱스 목록을 추출

In [None]:
# get the categorical features
# `foo` holds the dtype of every column; `cat_feature_names` keeps only the entries
# whose dtype is "category" (indexed by column name).
foo = train.dtypes
cat_feature_names = foo[foo == "category"]
# Positional indices of those columns. `c in cat_feature_names` tests membership in
# the Series index, i.e. among the column names.
# NOTE: in the code shown in this notebook runLGB passes categorical_feature=[], so
# this list is not handed to LightGBM.
cat_features = [train.columns.get_loc(c) for c in train.columns if c in cat_feature_names]

In [None]:
# Encode string (object-dtype) columns as integer codes.
from sklearn.preprocessing import LabelEncoder

for col in train.columns:
    if train[col].dtype == 'object':
        le = LabelEncoder()
        train_values = train[col].astype(str)
        test_values = test[col].astype(str)
        # Fit on the labels of both frames: LabelEncoder.transform raises ValueError
        # for a label it has not seen, which happens as soon as test contains a
        # value that never occurs in train.
        le.fit(pd.concat([train_values, test_values], ignore_index=True))
        train[col] = le.transform(train_values)
        test[col] = le.transform(test_values)


## 모델 정의(LightGBM + OptimizedRounder)

LightGBM 모델을 학습하고, 교차 검증을 통해 성능 평가하며 최종 예측 결과를 얻기 위한 전체 파이프 라인

In [None]:
# LightGBM settings for the regression model.
# The last three entries ('early_stop', 'num_rounds', 'verbose_eval') are not passed
# to LightGBM as parameters: runLGB pops them from its copy of this dict and uses them
# for the early-stopping callback, the number of boosting rounds and the logging period.
params = {
    'application': 'regression',
    'boosting': 'gbdt',
    'metric': 'rmse',
    'num_leaves': 70,
    'max_depth': 8,
    'learning_rate': 0.01,
    'bagging_fraction': 0.85,
    'feature_fraction': 0.8,
    'min_split_gain': 0.02,
    'min_child_samples': 150,
    'min_child_weight': 0.02,
    'lambda_l2': 0.0475,
    'verbosity': -1,
    'data_random_seed': 17,
    'early_stop': 100,
    'num_rounds': 10000,
    'verbose_eval': 100
}

# Train and predict for a single fold
def runLGB(train_X, train_y, test_X, test_y, test_X2, params):
    """Train LightGBM on one fold, tune rounding cut-offs and predict.

    Args:
        train_X, train_y: features and target used for training.
        test_X, test_y: validation features and target; used for early stopping
            and for fitting the OptimizedRounder cut-offs.
        test_X2: features of the held-out test set to predict.
        params: LightGBM parameters plus the control keys 'num_rounds'
            (required), 'early_stop' and 'verbose_eval' (optional). The dict is
            copied, so the caller's dict is left untouched.

    Returns:
        Tuple of (validation predictions with shape (n, 1), test predictions
        with shape (n, 1), feature importances, fitted cut-offs, validation QWK).
    """
    print('Prep LGB')
    d_train = lgb.Dataset(train_X, label=train_y)
    d_valid = lgb.Dataset(test_X, label=test_y)

    print('Train LGB')
    params = params.copy()
    # Separate the training-loop controls from the actual LightGBM parameters.
    num_rounds = params.pop('num_rounds')
    early_stop = params.pop('early_stop', None)
    verbose_eval = params.pop('verbose_eval', 100)

    # Callbacks: optional early stopping plus periodic metric logging.
    callbacks = []
    if early_stop is not None:
        callbacks.append(lgb.early_stopping(stopping_rounds=early_stop))
    callbacks.append(lgb.log_evaluation(period=verbose_eval))

    # Train the model with the given parameters.
    model = lgb.train(
        params,
        train_set=d_train,
        num_boost_round=num_rounds,
        valid_sets=[d_train, d_valid],
        categorical_feature=[],
        callbacks=callbacks
    )

    print('Predict 1/2')
    # LightGBM outputs continuous values; OptimizedRounder tunes the cut-offs
    # that turn them into the grades 0-4.
    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
    optR = OptimizedRounder()
    optR.fit(pred_test_y, test_y)
    coefficients = optR.coefficients()
    pred_test_y_k = optR.predict(pred_test_y, coefficients)
    print("Valid Counts = ", Counter(test_y))
    print("Predicted Counts = ", Counter(pred_test_y_k))
    print("Coefficients = ", coefficients)
    qwk = quadratic_weighted_kappa(test_y, pred_test_y_k)  # validation score
    print("QWK = ", qwk)

    print('Predict 2/2')
    # Continuous predictions for the test set.
    pred_test_y2 = model.predict(test_X2, num_iteration=model.best_iteration)

    return pred_test_y.reshape(-1, 1), pred_test_y2.reshape(-1, 1), model.feature_importance(), coefficients, qwk

# k-fold cross-validation
def run_cv_model(train, test, target, model_fn, params=None, eval_fn=None, label='model', n_splits=5):
    """Run stratified k-fold cross-validation with ``model_fn``.

    Args:
        train: training features (DataFrame or array).
        test: test features, predicted by every fold's model and averaged.
        target: training target (pandas object or array), aligned with ``train``.
        model_fn: callable ``(dev_X, dev_y, val_X, val_y, test, params)`` returning
            ``(val_pred, test_pred, importances, coefficients, qwk)``.
        params: dict handed (as a copy) to ``model_fn``; defaults to an empty dict.
        eval_fn: optional ``(y_true, y_pred) -> score`` evaluated on each
            validation fold (reported as RMSE in the logs).
        label: name used in the log messages.
        n_splits: number of folds (default 5).

    Returns:
        Dict with keys 'label', 'train' (out-of-fold predictions), 'test'
        (fold-averaged test predictions), 'cv' (eval_fn scores), 'qwk',
        'importance' (per-fold feature importances) and 'coefficients'
        (one row of four cut-offs per fold).
    """
    params = {} if params is None else params
    kf = StratifiedKFold(n_splits=n_splits, random_state=42, shuffle=True)
    train_is_frame = isinstance(train, pd.DataFrame)
    target_is_pandas = isinstance(target, (pd.Series, pd.DataFrame))
    # Arrays have no column names; fall back to positional feature ids.
    feature_names = train.columns.values if train_is_frame else np.arange(train.shape[1])

    cv_scores = []
    qwk_scores = []
    pred_full_test = 0
    # Each fold's (n_val, 1) predictions are broadcast over the 5 columns; the width
    # is kept as is so existing callers that read column 0 keep working.
    pred_train = np.zeros((train.shape[0], 5))
    all_coefficients = np.zeros((n_splits, 4))
    fold_importances = []

    for i, (dev_index, val_index) in enumerate(kf.split(train, target), start=1):
        print('Started ' + label + ' fold ' + str(i) + '/' + str(n_splits))
        if train_is_frame:
            dev_X, val_X = train.iloc[dev_index], train.iloc[val_index]
        else:
            dev_X, val_X = train[dev_index], train[val_index]
        # The split yields positions, so index pandas targets positionally too.
        if target_is_pandas:
            dev_y, val_y = target.iloc[dev_index], target.iloc[val_index]
        else:
            dev_y, val_y = target[dev_index], target[val_index]

        pred_val_y, pred_test_y, importances, coefficients, qwk = model_fn(
            dev_X, dev_y, val_X, val_y, test, params.copy()
        )

        pred_full_test += pred_test_y
        pred_train[val_index] = pred_val_y
        all_coefficients[i - 1, :] = coefficients
        # QWK comes from model_fn, so record it whether or not eval_fn is given.
        qwk_scores.append(qwk)

        if eval_fn is not None:
            cv_score = eval_fn(val_y, pred_val_y)
            cv_scores.append(cv_score)
            print(f"{label} cv score {i}: RMSE {cv_score} QWK {qwk}")

        # Keep this fold's feature importances.
        fold_importances.append(pd.DataFrame({
            'feature': feature_names,
            'importance': importances,
            'fold': i
        }))

    feature_importance_df = pd.concat(fold_importances, axis=0)

    if cv_scores:
        print(f"{label} cv RMSE scores : {cv_scores}")
        print(f"{label} cv mean RMSE score : {np.mean(cv_scores)}")
        print(f"{label} cv std RMSE score : {np.std(cv_scores)}")
    print(f"{label} cv QWK scores : {qwk_scores}")
    print(f"{label} cv mean QWK score : {np.mean(qwk_scores)}")
    print(f"{label} cv std QWK score : {np.std(qwk_scores)}")

    # Average the test predictions over the folds.
    pred_full_test /= float(n_splits)
    results = {
        'label': label,
        'train': pred_train,
        'test': pred_full_test,
        'cv': cv_scores,
        'qwk': qwk_scores,
        'importance': feature_importance_df,
        'coefficients': all_coefficients
    }
    return results



In [None]:
# Run the 5-fold LightGBM cross-validation; rmse is the per-fold evaluation function.
results = run_cv_model(train, test, target, runLGB, params, rmse, 'lgb')

print(results['cv'])            # list of per-fold RMSE scores
print(results['qwk'])           # list of per-fold QWK scores
print(results['test'][:5])      # first few fold-averaged test predictions
print(results['importance'].head())  # first rows of the per-fold importance table (not sorted by importance)


Started lgb fold 1/5
Prep LGB
Train LGB




Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 1.06404	valid_1's rmse: 1.10356
[200]	training's rmse: 1.00727	valid_1's rmse: 1.08255
[300]	training's rmse: 0.968135	valid_1's rmse: 1.07193
[400]	training's rmse: 0.935529	valid_1's rmse: 1.06608
[500]	training's rmse: 0.90798	valid_1's rmse: 1.06226
[600]	training's rmse: 0.883661	valid_1's rmse: 1.05963
[700]	training's rmse: 0.860985	valid_1's rmse: 1.05746
[800]	training's rmse: 0.840443	valid_1's rmse: 1.05662
[900]	training's rmse: 0.822229	valid_1's rmse: 1.0559
[1000]	training's rmse: 0.804272	valid_1's rmse: 1.05546
[1100]	training's rmse: 0.787124	valid_1's rmse: 1.05546
[1200]	training's rmse: 0.772389	valid_1's rmse: 1.05502
[1300]	training's rmse: 0.757888	valid_1's rmse: 1.05476
[1400]	training's rmse: 0.74384	valid_1's rmse: 1.05481
Early stopping, best iteration is:
[1359]	training's rmse: 0.749302	valid_1's rmse: 1.05463
Predict 1/2
Valid Counts =  Counter({4: 839, 2: 808, 3: 652, 1



Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 1.06618	valid_1's rmse: 1.10096
[200]	training's rmse: 1.01061	valid_1's rmse: 1.07587
[300]	training's rmse: 0.970853	valid_1's rmse: 1.06319
[400]	training's rmse: 0.941614	valid_1's rmse: 1.05592
[500]	training's rmse: 0.914875	valid_1's rmse: 1.05218
[600]	training's rmse: 0.890929	valid_1's rmse: 1.04952
[700]	training's rmse: 0.868036	valid_1's rmse: 1.04763
[800]	training's rmse: 0.848123	valid_1's rmse: 1.04637
[900]	training's rmse: 0.830253	valid_1's rmse: 1.04531
[1000]	training's rmse: 0.813292	valid_1's rmse: 1.04428
[1100]	training's rmse: 0.796547	valid_1's rmse: 1.04371
[1200]	training's rmse: 0.780144	valid_1's rmse: 1.04311
[1300]	training's rmse: 0.764669	valid_1's rmse: 1.04295
[1400]	training's rmse: 0.749321	valid_1's rmse: 1.04281
[1500]	training's rmse: 0.732932	valid_1's rmse: 1.04267
Early stopping, best iteration is:
[1426]	training's rmse: 0.745577	valid_1's rmse: 1.0426
Pre



Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 1.06627	valid_1's rmse: 1.10258
[200]	training's rmse: 1.01217	valid_1's rmse: 1.07781
[300]	training's rmse: 0.974816	valid_1's rmse: 1.06751
[400]	training's rmse: 0.943591	valid_1's rmse: 1.06173
[500]	training's rmse: 0.917407	valid_1's rmse: 1.05747
[600]	training's rmse: 0.892327	valid_1's rmse: 1.05398
[700]	training's rmse: 0.86926	valid_1's rmse: 1.05195
[800]	training's rmse: 0.847862	valid_1's rmse: 1.05044
[900]	training's rmse: 0.828488	valid_1's rmse: 1.04979
[1000]	training's rmse: 0.810992	valid_1's rmse: 1.049
[1100]	training's rmse: 0.794653	valid_1's rmse: 1.04811
[1200]	training's rmse: 0.77894	valid_1's rmse: 1.04812
Early stopping, best iteration is:
[1152]	training's rmse: 0.786159	valid_1's rmse: 1.04786
Predict 1/2
Valid Counts =  Counter({4: 840, 2: 807, 3: 652, 1: 618, 0: 82})
Predicted Counts =  Counter({np.float64(2.0): 1672, np.float64(4.0): 824, np.float64(3.0): 386, np.f



Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 1.062	valid_1's rmse: 1.10988
[200]	training's rmse: 1.007	valid_1's rmse: 1.0902
[300]	training's rmse: 0.968709	valid_1's rmse: 1.0798
[400]	training's rmse: 0.936879	valid_1's rmse: 1.07311
[500]	training's rmse: 0.909477	valid_1's rmse: 1.06867
[600]	training's rmse: 0.886036	valid_1's rmse: 1.06574
[700]	training's rmse: 0.865658	valid_1's rmse: 1.0641
[800]	training's rmse: 0.844436	valid_1's rmse: 1.06239
[900]	training's rmse: 0.825897	valid_1's rmse: 1.06161
[1000]	training's rmse: 0.807873	valid_1's rmse: 1.06121
[1100]	training's rmse: 0.792549	valid_1's rmse: 1.0611
Early stopping, best iteration is:
[1057]	training's rmse: 0.7987	valid_1's rmse: 1.061
Predict 1/2
Valid Counts =  Counter({4: 839, 2: 807, 3: 652, 1: 618, 0: 82})
Predicted Counts =  Counter({np.float64(2.0): 894, np.float64(3.0): 812, np.float64(4.0): 674, np.float64(1.0): 618})
Coefficients =  [0.47244286 2.08060868 2.485136



Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 1.06733	valid_1's rmse: 1.09817
[200]	training's rmse: 1.01301	valid_1's rmse: 1.07514
[300]	training's rmse: 0.97394	valid_1's rmse: 1.06406
[400]	training's rmse: 0.942166	valid_1's rmse: 1.05713
[500]	training's rmse: 0.916873	valid_1's rmse: 1.0525
[600]	training's rmse: 0.895746	valid_1's rmse: 1.04988
[700]	training's rmse: 0.875013	valid_1's rmse: 1.04814
[800]	training's rmse: 0.856348	valid_1's rmse: 1.04681
[900]	training's rmse: 0.838134	valid_1's rmse: 1.04612
[1000]	training's rmse: 0.819829	valid_1's rmse: 1.04594
[1100]	training's rmse: 0.80236	valid_1's rmse: 1.04556
[1200]	training's rmse: 0.785065	valid_1's rmse: 1.04491
[1300]	training's rmse: 0.770073	valid_1's rmse: 1.04475
Early stopping, best iteration is:
[1280]	training's rmse: 0.773011	valid_1's rmse: 1.04457
Predict 1/2
Valid Counts =  Counter({4: 839, 2: 807, 3: 652, 1: 618, 0: 82})
Predicted Counts =  Counter({np.float64(2.

In [None]:
# Average each feature's importance over the folds and list the largest ones.
importance_df = results['importance'].assign(
    importance=lambda frame: pd.to_numeric(frame['importance'], errors='coerce')
)

imports = (
    importance_df
    .groupby('feature')[['importance']]
    .mean()
    .reset_index()
    .sort_values('importance', ascending=False)
)
print(imports.head())


      feature  importance
0         Age       975.6
1      Breed1       867.2
12   PhotoAmt       551.6
118    svd_70       435.6
13   Quantity       432.6


 OptimizedRounder를 사용해 LightGBM 모델의 연속형 예측 결과를 범주형 등급(0~4)으로 변환하고, 그 등급 분포를 확인

In [None]:
# Grade the out-of-fold train predictions: start from the fold-averaged cut-offs,
# then overwrite three of them with hand-picked values.
optR = OptimizedRounder()
coefficients_ = results['coefficients'].mean(axis=0)
print(coefficients_)
# manually adjust coefs
coefficients_[[0, 1, 3]] = [1.64, 2.11, 2.85]
# Column 0 of results['train'] holds the continuous out-of-fold prediction of each row.
train_predictions = optR.predict(results['train'][:, 0], coefficients_).astype(int)
Counter(train_predictions)

[0.51058897 1.88948825 2.54912948 2.53194011]


Counter({np.int64(2): 4855,
         np.int64(3): 2940,
         np.int64(4): 3821,
         np.int64(1): 2972,
         np.int64(0): 405})

In [None]:
# Convert the continuous test predictions into integer grades with OptimizedRounder:
# start from the fold-averaged cut-offs, then overwrite three of them by hand.
optR = OptimizedRounder()
coefficients_ = results['coefficients'].mean(axis=0)
print(coefficients_)
# manually adjust coefs
coefficients_[[0, 1, 3]] = [1.645, 2.115, 2.84]
# Column 0 of results['test'] holds the fold-averaged continuous prediction of each row.
test_predictions = optR.predict(results['test'][:, 0], coefficients_).astype(int)
Counter(test_predictions)

[0.51058897 1.88948825 2.54912948 2.53194011]


Counter({np.int64(3): 796,
         np.int64(2): 1267,
         np.int64(1): 737,
         np.int64(4): 1124,
         np.int64(0): 48})

In [None]:
# Compare the share of each grade in the true target with the predicted grades.
# Series.value_counts replaces the deprecated top-level pd.value_counts.
print("True Distribution:")
print(pd.Series(target).value_counts(normalize=True).sort_index())
print("Test Predicted Distribution:")
print(pd.Series(test_predictions).value_counts(normalize=True).sort_index())
print("Train Predicted Distribution:")
print(pd.Series(train_predictions).value_counts(normalize=True).sort_index())

True Distribution:
AdoptionSpeed
0    0.027346
1    0.206096
2    0.269259
3    0.217368
4    0.279931
Name: proportion, dtype: float64
Test Predicted Distribution:
0    0.012085
1    0.185549
2    0.318983
3    0.200403
4    0.282981
Name: proportion, dtype: float64
Train Predicted Distribution:
0    0.027013
1    0.198226
2    0.323818
3    0.196092
4    0.254852
Name: proportion, dtype: float64


  print(pd.value_counts(target, normalize=True).sort_index())
  print(pd.value_counts(test_predictions, normalize=True).sort_index())
  print(pd.value_counts(train_predictions, normalize=True).sort_index())


In [None]:
# Confusion matrix of the true target (first argument) against the rounded
# out-of-fold train predictions (second argument), labelled with the grades 0-4.
pd.DataFrame(sk_cmatrix(target, train_predictions), index=list(range(5)), columns=list(range(5)))

Unnamed: 0,0,1,2,3,4
0,44,122,132,53,59
1,212,1072,1133,403,270
2,95,973,1596,766,607
3,44,557,1174,754,730
4,10,248,820,964,2155


In [None]:
# A notebook cell only displays its last expression, so the two metrics are printed
# explicitly; previously they were computed and then thrown away.
print("Train (out-of-fold) QWK:", quadratic_weighted_kappa(target, train_predictions))
print("Train (out-of-fold) RMSE:", rmse(target, [r[0] for r in results['train']]))

# Submission table: one predicted grade per test PetID.
submission = pd.DataFrame({'PetID': test_id, 'AdoptionSpeed': test_predictions})
submission.head()

Unnamed: 0,PetID,AdoptionSpeed
0,e2dfc2935,3
1,f153b465f,2
2,3c90f3f54,1
3,e02abc8a3,4
4,09f0df7d1,4
