## Import library

In [1]:
import numpy as np
import pandas as pd
import os, zipfile
from tqdm.auto import notebook_tqdm
import pyarrow.parquet as pq
import pyarrow as pa
from pyarrow import csv

import category_encoders as ce
import lightgbm as lgb

In [4]:
def dataset_extract(file_name) :
    with zipfile.ZipFile(file_name, 'r') as zip_ref :
        file_list = zip_ref.namelist()
        if os.path.exists(f'{file_name[-8:-4]}/') :
            print(f'데이터셋 폴더가 이미 존재합니다.')
            return

        else :
            for f in notebook_tqdm(file_list, desc='Extracting', unit='files') :
                zip_ref.extract(member=f, path='data/')

dataset_extract('open.zip')

Extracting:   0%|          | 0/3 [00:00<?, ?files/s]

## CSV to Parquet

In [10]:
def csv_to_parquet(csv_file_path, parquet_file_path, chunksize=50000, compression='snappy'):
    parquet_writer = None

    for chunk in notebook_tqdm(pd.read_csv(csv_file_path, chunksize=chunksize)):
        # 청크를 Parquet 테이블로 변환
        table = pa.Table.from_pandas(chunk)

        # 첫 번째 청크에서 ParquetWriter를 초기화
        if parquet_writer is None:
            parquet_writer = pq.ParquetWriter(parquet_file_path, table.schema, compression=compression)

        # 현재 청크를 Parquet 파일에 씁니다.
        parquet_writer.write_table(table)

    # ParquetWriter를 닫습니다.
    if parquet_writer is not None:
        parquet_writer.close()


def read_parquet_in_chunks(parquet_file_path, chunksize=500000):
    # Parquet 파일의 메타데이터를 읽어옵니다.
    parquet_file = pq.ParquetFile(parquet_file_path)

    # 총 로우 수를 가져옵니다.
    total_rows = parquet_file.metadata.num_rows

    # 청크 단위로 데이터를 읽어올 때마다 저장할 리스트
    df_list = []

    # 청크 단위로 데이터를 읽어옵니다.
    for start_row in notebook_tqdm(range(0, total_rows, chunksize)):
        # 현재 청크의 끝 로우를 계산합니다.
        end_row = min(start_row + chunksize, total_rows)

        # 현재 청크를 읽어옵니다.
        table = parquet_file.read_row_group(start_row // chunksize)
        df_chunk = table.to_pandas()

        # 읽어온 청크를 리스트에 추가합니다.
        df_list.append(df_chunk)

    # 모든 청크를 결합하여 하나의 데이터프레임으로 만듭니다.
    df = pd.concat(df_list, ignore_index=True)
    return df

### Load data

In [15]:
csv_file_path = '/kaggle/input/train.csv'
parquet_file_path = '/kaggle/working/train.parquet'
pq.write_table(csv.read_csv(csv_file_path), parquet_file_path)

In [2]:
csv_file_path = '/kaggle/input/test.csv'
parquet_file_path = '/kaggle/working/test.parquet'
pq.write_table(csv.read_csv(csv_file_path), parquet_file_path)

In [2]:
parquet_file_path = '/kaggle/working/train.parquet'
train = pd.read_parquet(parquet_file_path, engine = 'pyarrow')
train.shape

(28605391, 41)

In [3]:
parquet_file_path = '/kaggle/working/test.parquet'
test = pd.read_parquet(parquet_file_path, engine = 'pyarrow')
test.shape

(4538541, 40)

### Reduce Memory & Save

In [3]:
def reduce_memory_usage(df):
    
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in notebook_tqdm(df.columns):
        col_type = df[col].dtype.name
        if ((col_type != 'datetime64[ns]') & (col_type != 'category')):
            if (col_type != 'object'):
                c_min = df[col].min()
                c_max = df[col].max()

                if str(col_type)[:3] == 'int':
                    if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                        df[col] = df[col].astype(np.int16)
                    elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                        df[col] = df[col].astype(np.int32)
                    elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                        df[col] = df[col].astype(np.int64)

                else:
                    if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                        df[col] = df[col].astype(np.float16)
                    elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                        df[col] = df[col].astype(np.float32)
                    else:
                        pass
            else:
                df[col] = df[col].astype('category')
    mem_usg = df.memory_usage().sum() / 1024**2 
    print("Memory usage became: ",mem_usg," MB")
    
    return df

In [4]:
train.drop('ID', axis=1,  inplace=True)

In [5]:
train = reduce_memory_usage(train)

Memory usage of dataframe is 8729.67 MB


  0%|          | 0/40 [00:00<?, ?it/s]

Memory usage became:  3244.068123817444  MB


In [None]:
test.drop('ID', axis=1, inplace=True)

In [7]:
test = reduce_memory_usage(test)

Memory usage of dataframe is 1350.43 MB


  0%|          | 0/39 [00:00<?, ?it/s]

Memory usage became:  562.7004432678223  MB


In [9]:
train.shape, test.shape

((28605391, 40), (4538541, 39))

## Data preprocessing

In [7]:
X_train = train.drop('Click', axis=1)
y_train = train['Click']

In [8]:
for col in notebook_tqdm(X_train.columns):
    if X_train[col].isnull().sum() != 0:
        X_train[col].fillna(0, inplace=True)

  0%|          | 0/39 [00:00<?, ?it/s]

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(0, inplace=True)


In [None]:
encoding_target = list(X_train.dtypes[X_train.dtypes == "category"].index)

enc = ce.TargetEncoder(cols=encoding_target)
X_train_encoded = enc.fit_transform(X_train, y_train)

In [None]:
test_encoded = enc.transform(test)

In [9]:
X_train_encoded.shape, y_train.shape

Unnamed: 0,F01,F02,F03,F04,F05,F06,F07,F08,F09,F10,...,F30,F31,F32,F33,F34,F35,F36,F37,F38,F39
0,1,31,1103694,114.0,1,1,2662246,3422221,49115,3,...,2961319,7791058,380.0,2.0,1,12647707,0.0,479960,0.0,1238
1,706,3565,276062,26.0,706,43,3519,941357,105154,728,...,2961319,7791058,466.0,1.0,706,12647707,19.0,253713,0.0,14265
2,3036378,3044559,5788795,119.0,3036378,0,3069,672410,571012,3036378,...,82144,3036340,197.0,0.0,3036378,12647707,8.0,571012,0.0,23671
3,7249,85535,5218517,15.0,7249,26,102,244137,1719,40046,...,72869,7791058,8640.0,0.0,12018,12647707,14.0,27692,0.0,2304
4,5,18246,5218517,13.0,5,20,2662246,941357,3119,898,...,2961319,435789,41774.0,0.0,860,12647707,13.0,15545,0.0,3905


In [16]:
X_train_encoded.to_parquet('/kaggle/working/X_train.parquet', engine = 'pyarrow', index = False)
y_train.to_frame().to_parquet('/kaggle/working/y_train.parquet', engine = 'pyarrow', index = False)

## Load test data & Preprocess

In [20]:
# test = pd.read_csv('/kaggle/input/dacon-web-click-data/open/test.csv')
# test_x = test.drop(columns=['ID'])
for col in notebook_tqdm(train_x.columns):
    if train_x[col].isnull().sum() != 0:
        test_x[col].fillna(0, inplace=True)


  0%|          | 0/39 [00:00<?, ?it/s]

In [21]:
X_test_encoded = reduce_memory_usage(X_test_encoded)

Memory usage of dataframe is 1350.43 MB


  0%|          | 0/39 [00:00<?, ?it/s]

Memory usage became:  588.6475601196289  MB


In [22]:
X_test_encoded.to_parquet('/kaggle/working/X_test.parquet', engine = 'pyarrow', index = False)