## Import library

In [2]:
import numpy as np
import pandas as pd
import os, zipfile
from tqdm.auto import notebook_tqdm
import pyarrow.parquet as pq
import pyarrow as pa

import category_encoders as ce
import lightgbm as lgb

## Load prepared data

In [3]:
# Parquet 파일 로드
X_train_path = '/kaggle/input/dacon-web-click-data/prepare_data/X_train.parquet'
y_train_path = '/kaggle/input/dacon-web-click-data/prepare_data/y_train.parquet'
X_test_path = '/kaggle/input/dacon-web-click-data/prepare_data/X_test.parquet'


X_train = pd.read_parquet(X_train_path, engine = 'pyarrow')
y_train = pd.read_parquet(y_train_path, engine = 'pyarrow')
X_test = pd.read_parquet(X_test_path, engine = 'pyarrow')

X_train.shape, y_train.shape, X_test.shape

((14350000, 39), (14350000, 1), (4538541, 39))

In [3]:
df = df.drop('ID',axis=1)

### Reduce Memory

In [4]:
def reduce_memory_usage(df):
    
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in notebook_tqdm(df.columns):
        col_type = df[col].dtype.name
        if ((col_type != 'datetime64[ns]') & (col_type != 'category')):
            if (col_type != 'object'):
                c_min = df[col].min()
                c_max = df[col].max()

                if str(col_type)[:3] == 'int':
                    if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                        df[col] = df[col].astype(np.int16)
                    elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                        df[col] = df[col].astype(np.int32)
                    elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                        df[col] = df[col].astype(np.int64)

                else:
                    if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                        df[col] = df[col].astype(np.float16)
                    elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                        df[col] = df[col].astype(np.float32)
                    else:
                        pass
            else:
                df[col] = df[col].astype('category')
    mem_usg = df.memory_usage().sum() / 1024**2 
    print("Memory usage became: ",mem_usg," MB")
    
    return df

In [5]:
df = reduce_memory_usage(df)

Memory usage of dataframe is 8729.67 MB


  0%|          | 0/40 [00:00<?, ?it/s]

Memory usage became:  3244.0679864883423  MB


### Train

In [7]:
target = 'Click'
X_train = df.drop(target, axis=1)
y_train = df[target]

In [None]:
model = lgb.LGBMClassifier(objective = 'binary', device = 'gpu', verbose = -1, random_state=42)
model.fit(X_train, y_train, eval_metric='AUC')

### Load test data

In [None]:
def load_data():
    df = pd.read_csv('test.csv')
    df = df.drop('ID',axis=1)
    df['F04'] = df['F04'].fillna(0)
    df['F11'] = df['F11'].fillna(0)
    df['F18'] = df['F18'].fillna(0)
    df['F19'] = df['F19'].fillna(0)
    df['F24'] = df['F24'].fillna(0)
    df['F27'] = df['F27'].fillna(0)
    df['F29'] = df['F29'].fillna(0)
    df['F32'] = df['F32'].fillna(0)
    df['F33'] = df['F33'].fillna(0)
    df['F36'] = df['F36'].fillna(0)
    df['F38'] = df['F38'].fillna(0)
    df = df.fillna('NAN')
    float_columns = df.select_dtypes(include=['float64']).columns
    df[float_columns] = df[float_columns].astype('int64')
    object_columns = df.select_dtypes(include=['object']).columns
    df[object_columns] = df[object_columns].astype('category')
    return df


test_df = load_data()

### Infer

In [None]:
pred = model.predict_proba(test_df)

### Submit

In [None]:
sample_submission = pd.read_csv('sample_submission.csv')
sample_submission

sample_submission['Click'] = pred[:,1]
sample_submission

sample_submission.to_csv('lgbm.csv', index=False)