In [9]:
!pip install catboost
!pip install lightgbm

[0m

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from scipy.stats import randint

from catboost import CatBoostRegressor
import lightgbm as lgb
from lightgbm import LGBMRegressor
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold
import torch.nn as nn
from torch.optim import Adam
import matplotlib.pyplot as plt

import random
import os
import re
import warnings
import tqdm

import wandb

warnings.filterwarnings('ignore')

## Set SEED

In [3]:
SEED = 42
def seed_everything(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)

seed_everything(SEED)

## Preprocessing

In [4]:
def age_map(x: int) -> int:
    x = int(x)
    if x < 20:
        return 1
    elif x >= 20 and x < 30:
        return 2
    elif x >= 30 and x < 40:
        return 3
    elif x >= 40 and x < 50:
        return 4
    elif x >= 50 and x < 60:
        return 5
    else:
        return 6
    
def fillna_location(df):
    df['location_state'].replace(' ', np.nan, inplace=True)
    # location_city 컬럼의 결측치를 처리 -> 전체 최빈값
    mode_city = df['location_city'].mode()[0]
    df['location_city'].fillna(mode_city, inplace=True)

    # location_state 컬럼의 결측치를 처리
    # 먼저, 각 도시별 가장 빈번한 주를 계산
    state_by_city = df.groupby('location_city')['location_state'].apply(lambda x: x.mode()[0] if not x.mode().empty else 'Unknown')
    # 그리고 이 정보를 사용하여 결측치를 채움
    df['location_state'] = df.apply(lambda row: state_by_city[row['location_city']] if pd.isnull(row['location_state']) else row['location_state'], axis=1)

    # location_country 컬럼의 결측치를 처리
    # 먼저, 각 주별 가장 빈번한 국가를 계산
    country_by_state = df.groupby('location_state')['location_country'].apply(lambda x: x.mode()[0] if not x.mode().empty else 'Unknown')
    # 그리고 이 정보를 사용하여 결측치를 채움
    df['location_country'] = df.apply(lambda row: country_by_state[row['location_state']] if pd.isnull(row['location_country']) else row['location_country'], axis=1)
    return df

def convert_ISBN10_to_ISBN13(isbn10_series):
    def calculate_check_digit(isbn):
        # ISBN-13의 체크 디지트를 계산하는 함수
        isbn = '978' + isbn[:-1]  # ISBN-13으로 확장하기 위해 앞에 '978' 추가
        check = sum((3 if i % 2 == 0 else 1) * int(digit) for i, digit in enumerate(isbn))  # 체크 디지트 계산
        check_digit = (10 - (check % 10)) % 10  # 10의 배수로 만들기 위해 나머지 계산
        return str(check_digit)

    # ISBN-10을 ISBN-13으로 변환하는 함수
  
    if len(isbn10_series) != 10 or not isbn10_series[:-1].isdigit():
        return '0000000000'
    else:
        return '978' + isbn10_series[:-1] + calculate_check_digit(isbn10_series)

def get_language_from_isbn13(isbn13):
    """Get the representative language from the ISBN-13 country/region code."""
    # Extracting the country/region code from the ISBN-13
    # ISBN-13 starts with 978 or 979, followed by the country/region code
    
    country_code = int(isbn13[3:5])  # Using first two digits of the country/region code

    # Mapping of country/region code to representative language
    language_map = {
        0: 'en', 1: 'en',  # English
        2: 'fr',  # French
        3: 'de',  # German
        4: 'ja',  # Japanese
        5: 'ru',  # Russian
        7: 'zh',  # Chinese
        80: 'cs', 81: 'hi', 82: 'no', 83: 'pl', 84: 'es', 85: 'pt', 86: 'sr', 87: 'da', 88: 'it', 89: 'ko',
        90: 'nl', 91: 'sv'  # Dutch, Swedish
        # ... (other mappings can be added as needed)
    }
    
   
    return language_map.get(country_code, 'en')


def category_integration(books):
    '''
        category를 category_high로 병합
    '''
    books.loc[books[books['category'].notnull()].index, 'category'] = books[books['category'].notnull()]['category'].apply(lambda x: re.sub('[\W_]+',' ',x).strip())
    books['category'] = books['category'].str.lower()

    categories = ['garden','crafts','physics','adventure','music','fiction','nonfiction','science','science fiction','social','homicide',
                'sociology','disease','religion','christian','philosophy','psycholog','mathemat','agricult','environmental',
                'business','poetry','drama','literary','travel','motion picture','children','cook','literature','electronic',
                'humor','animal','bird','photograph','computer','house','ecology','family','architect','camp','criminal','language','india']
    
    books['category'] = np.where(books['category'].isna(), np.random.choice(categories), books['category'])

    books['category_high'] = books['category'].copy()

    for category in categories:
        books.loc[books[books['category'].str.contains(category,na=False)].index,'category_high'] = category

    category_high_df = pd.DataFrame(books['category_high'].value_counts()).reset_index()
    category_high_df.columns = ['category','count']

    others_list = category_high_df[category_high_df['count']<5]['category'].values
    books.loc[books[books['category_high'].isin(others_list)].index, 'category_high']='others'

    return books

## Data Load

In [5]:
path = './data/'
users = pd.read_csv(path+'users.csv')
books = pd.read_csv(path+'books.csv')

train = pd.read_csv(path + 'train_ratings.csv')
test = pd.read_csv(path + 'test_ratings.csv')
sub = pd.read_csv(path + 'sample_submission.csv')

############################ apply preprocessing ############################
users['age'] = users['age'].fillna(int(users['age'].mean()))
users['age'] = users['age'].apply(age_map)
users['location_city'] = users['location'].apply(lambda x: x.split(',')[0])
users['location_state'] = users['location'].apply(lambda x: x.split(',')[1])
users['location_country'] = users['location'].apply(lambda x: x.split(',')[2])
users = users.drop(['location'], axis=1)
users = fillna_location(users)
books['isbn13'] = books['isbn'].apply(convert_ISBN10_to_ISBN13)
books.loc[books['language'].isna(), 'language'] = books[books['language'].isna()]['isbn13'].apply(get_language_from_isbn13)
books = category_integration(books)
#books['book_author'] = np.where(books['book_author'].isna(), books['book_author'].value_counts()[0], books['book_author'])
mode_author = books['book_author'].mode()[0]
books['book_author'].fillna(mode_author, inplace=True)
books = books.drop(columns='isbn13')
books = books.drop(columns='summary')
books = books.drop(columns='img_url')
books = books.drop(columns='img_path')
#############################################################################

users_ = users.copy()
books_ = books.copy()

train = pd.merge(train, users_, on='user_id', how='left')
test = pd.merge(test, users_, on='user_id', how='left')
sub = pd.merge(sub, users_, on='user_id', how='left')
train = pd.merge(train, books_, on='isbn', how='left')
test = pd.merge(test, books_, on='isbn', how='left')
sub = pd.merge(sub, books_, on='isbn', how='left')

train['year_of_publication'] = train['year_of_publication'].astype(int)
test['year_of_publication'] = test['year_of_publication'].astype(int)
sub['year_of_publication'] = sub['year_of_publication'].astype(int)

sub = sub.drop(columns='rating')
sub["rating"] = 0

data = {
    'train' : train,
    'test' : test.drop(['rating'], axis=1),
    'users' : users,
    'books' : books,
    'sub' : sub
    }

## LightGBM Model

In [6]:
params = {
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.8,
    'max_depth': -1,
    'min_child_samples': 20,
    'subsample': 1.0,
    'colsample_bytree': 1.0,
    'metric': 'rmse',
    'early_stopping_rounds': 100,
    'verbose':500
}

In [13]:
class LightGbm(nn.Module):
    def __init__(self, data, n_splits=5, random_seed=42, param=params):
        self.data = data
        self.n_splits = n_splits
        self.random_seed = random_seed
        self.X = data["train"].drop(["rating"], axis=1)
        self.y = data["train"]["rating"]
        self.param = params
        self.cat_features = self.X.select_dtypes(include=["object"]).columns.tolist()
        self.cat_features_idx = [
            self.X.columns.get_loc(col) for col in self.cat_features
        ]  # values이 후에 사용 -> lgb dataset, train 인자로 사용

        # label -> category
        for feature in self.cat_features:
            le = LabelEncoder()
            self.X[feature] = le.fit_transform(self.X[feature])
            self.X[feature] = self.X[feature].astype("category")

        self.sub = self.data["sub"].drop("rating", axis=1)
        # label -> category
        for feature in self.cat_features:
            le = LabelEncoder()
            self.sub[feature] = le.fit_transform(self.sub[feature])
            self.sub[feature] = self.sub[feature].astype("category")

        self.X = self.X.values # 데이터프레임에서 ndarray롤 바꾸기
        self.y = self.y.values

        # k-fold
        self.kfold = StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=self.random_seed)

    def train(self):
        rmse_scores = []

        for train_idx, valid_idx in tqdm.tqdm(self.kfold.split(self.X, self.y)):  # tqdm.tqdm은 진행바 -> k-fold가 간격
            X_train_fold, X_valid_fold = (self.X[train_idx],self.X[valid_idx])
            y_train_fold, y_valid_fold = (self.y[train_idx],self.y[valid_idx])

            train_data = lgb.Dataset(X_train_fold,label=y_train_fold,categorical_feature=self.cat_features_idx)
            valid_data = lgb.Dataset(X_valid_fold,label=y_valid_fold,reference=train_data,categorical_feature=self.cat_features_idx)  # reference는 'valid_data'가 'train_data'를 기반한다는 의미
            self.model = lgb.train(self.param,train_data,valid_sets=[train_data, valid_data],categorical_feature=self.cat_features_idx)

            y_pred_fold = self.model.predict(X_valid_fold, num_iteration=self.model.best_iteration)
            rmse_fold = mean_squared_error(y_valid_fold, y_pred_fold, squared=False)
            wandb.log({"fold_loss": rmse_fold})
            rmse_scores.append(rmse_fold)

        mean_kfold = np.mean(rmse_scores)
        wandb.log({"train_loss": mean_kfold})
        return mean_kfold

    def predict(self):
        y_pred = self.model.predict(self.sub)
        self.sub["rating"] = y_pred

    def sample_submission(self):
        self.sub[["user_id", "isbn", "rating"]].to_csv("./submit/" + "lightgbm.csv")

    def feature_importance(self):
        result = self.model.get_feature_importance()
        result = pd.Series(result, index=self.data["X_train"].columns).sort_values()
        plt.figure(figsize=(14, 7))
        plt.barh(result.index, result.values)

        plt.title("LGB Feature Importance")
        plt.xlabel("Importance")
        plt.ylabel("Feature")

        plt.show()

## 학습

In [14]:
# sweep_id = wandb.sweep(sweep_config, project="level_1_catboost", entity='kjswon12')
# wandb.agent(sweep_id, run_sweep, count=10)

############## wandb initialization
wandb.init(project='level_1_LightGbm')

####################### WandB start run
wandb.run.name = 'LightGbm_v1'
wandb.run.save()

params = {
    'boosting_type': 'gbdt',
    'num_leaves': 40,
    'learning_rate': 0.13,
    'max_depth': -1,
    'min_child_samples': 20,
    'subsample': 1.0,
    'colsample_bytree': 1.0,
    'metric': 'rmse',
    'early_stopping_rounds': 100,
    'verbose':1000,
    'num_boost_round' : 1000
}

wandb.config = params

lightgbm = LightGbm(data, 5, 42, params)

lightgbm.train()

lightgbm.predict()



lightgbm.sample_submission()

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
fold_loss,▁▆█▆▅
train_loss,▁

0,1
fold_loss,2.24041
train_loss,2.2403


0it [00:00, ?it/s]

[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.149978
[LightGBM] [Debug] init for col-wise cost 0.000009 seconds, init for row-wise cost 0.004611 seconds
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007956 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 48761
[LightGBM] [Info] Number of data points in the train set: 245436, number of used features: 13
[LightGBM] [Info] Start training from score 7.069696
[LightGBM] [Debug] Trained a tree with leaves = 40 and depth = 36
Training until validation scores don't improve for 100 rounds
[LightGBM] [Debug] Trained a tree with leaves = 40 and depth = 34
[LightGBM] [Debug] Trained a tree with leaves = 40 and depth = 33
[LightGBM] [Debug] Trained a tree with leaves = 40 and depth = 36
[LightGBM] [Debug] Trained a tree with leaves = 40 and depth = 32
[LightGBM] [Debug] Trained a tree with leaves = 40 and depth = 24
[LightGBM] [Debug] Tr

1it [00:20, 20.90s/it]

[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.150242
[LightGBM] [Debug] init for col-wise cost 0.000007 seconds, init for row-wise cost 0.004491 seconds
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008557 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 48759
[LightGBM] [Info] Number of data points in the train set: 245436, number of used features: 13
[LightGBM] [Info] Start training from score 7.069717
[LightGBM] [Debug] Trained a tree with leaves = 40 and depth = 36
Training until validation scores don't improve for 100 rounds
[LightGBM] [Debug] Trained a tree with leaves = 40 and depth = 29
[LightGBM] [Debug] Trained a tree with leaves = 40 and depth = 29
[LightGBM] [Debug] Trained a tree with leaves = 40 and depth = 29
[LightGBM] [Debug] Trained a tree with leaves = 40 and depth = 34
[LightGBM] [Debug] Trained a tree with leaves = 40 and depth = 24
[LightGBM] [Debug] Tr

2it [00:34, 16.60s/it]

[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.150024
[LightGBM] [Debug] init for col-wise cost 0.000000 seconds, init for row-wise cost 0.005750 seconds
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010371 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 48552
[LightGBM] [Info] Number of data points in the train set: 245436, number of used features: 13
[LightGBM] [Info] Start training from score 7.069733
[LightGBM] [Debug] Trained a tree with leaves = 40 and depth = 34
Training until validation scores don't improve for 100 rounds
[LightGBM] [Debug] Trained a tree with leaves = 40 and depth = 34
[LightGBM] [Debug] Trained a tree with leaves = 40 and depth = 36
[LightGBM] [Debug] Trained a tree with leaves = 40 and depth = 31
[LightGBM] [Debug] Trained a tree with leaves = 40 and depth = 33
[LightGBM] [Debug] Trained a tree with leaves = 40 and depth = 23
[LightGBM] [Debug] Tr

3it [00:48, 15.56s/it]

[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.150124
[LightGBM] [Debug] init for col-wise cost 0.000007 seconds, init for row-wise cost 0.004419 seconds
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008070 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 48529
[LightGBM] [Info] Number of data points in the train set: 245436, number of used features: 13
[LightGBM] [Info] Start training from score 7.069713
[LightGBM] [Debug] Trained a tree with leaves = 40 and depth = 37
Training until validation scores don't improve for 100 rounds
[LightGBM] [Debug] Trained a tree with leaves = 40 and depth = 36
[LightGBM] [Debug] Trained a tree with leaves = 40 and depth = 27
[LightGBM] [Debug] Trained a tree with leaves = 40 and depth = 30
[LightGBM] [Debug] Trained a tree with leaves = 40 and depth = 35
[LightGBM] [Debug] Trained a tree with leaves = 40 and depth = 23
[LightGBM] [Debug] Tr

4it [00:57, 12.87s/it]

[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.150256
[LightGBM] [Debug] init for col-wise cost 0.000953 seconds, init for row-wise cost 0.008906 seconds
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.020517 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 48786
[LightGBM] [Info] Number of data points in the train set: 245436, number of used features: 13
[LightGBM] [Info] Start training from score 7.069713
[LightGBM] [Debug] Trained a tree with leaves = 40 and depth = 33
Training until validation scores don't improve for 100 rounds
[LightGBM] [Debug] Trained a tree with leaves = 40 and depth = 36
[LightGBM] [Debug] Trained a tree with leaves = 40 and depth = 34
[LightGBM] [Debug] Trained a tree with leaves = 40 and depth = 35
[LightGBM] [Debug] Trained a tree with leaves = 40 and depth = 31
[LightGBM] [Debug] Trained a tree with leaves = 40 and depth = 23
[LightGBM] [Debug] Tr

5it [01:20, 16.16s/it]
