In [1]:
import gc
import os
import re

import lightgbm as lgb
import numpy as np
import pandas as pd
from haversine import haversine
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder

#!pip install pyarrow

def csv_to_parquet(csv_path, save_name):
    """Convert a CSV file into ``./<save_name>.parquet``.

    Args:
        csv_path: Path of the CSV file to read.
        save_name: Base name (without extension) of the parquet file that is
            written to the current working directory.

    The intermediate frame is never bound to a name, so it becomes collectable
    as soon as the write finishes; a garbage-collection pass is then forced
    and a short completion message is printed.
    """
    pd.read_csv(csv_path).to_parquet(f'./{save_name}.parquet')
    gc.collect()
    print(save_name, 'Done.')
    
# One-off conversion of the raw CSV inputs to parquet files in the working
# directory; the loading cell below reads these parquet files.
csv_to_parquet('input/train.csv', 'train')
csv_to_parquet('input/test.csv', 'test')    

## data load

In [2]:
# Load both splits from the parquet files written by csv_to_parquet.
train, test = pd.read_parquet('./train.parquet'), pd.read_parquet('./test.parquet')

## 데이터 전처리

In [3]:
# base_date: number of rows (non-null ids) recorded per base_date,
# computed separately for the train and the test frame.
train_base_date_df = (
    train.groupby('base_date')['id']
    .count()
    .reset_index(name='base_date_cnt')
)
test_base_date_df = (
    test.groupby('base_date')['id']
    .count()
    .reset_index(name='base_date_cnt')
)

In [4]:
# Attach the per-date row count to every row.
# A left join keeps every row of the left frame in its original order, and
# validate='many_to_one' raises if a base_date appears twice in the count
# table, so the merge can neither drop nor duplicate rows. This matters for
# the test frame in particular: its predictions are later written into the
# submission frame by position, which requires an unchanged row count/order.
train = train.merge(train_base_date_df, how='left', on='base_date', validate='many_to_one')
test = test.merge(test_base_date_df, how='left', on='base_date', validate='many_to_one')

In [5]:
# One-hot encode the categorical columns.
# The category list of every column is taken from the training frame and
# shared with the test frame. Both frames therefore always produce the same
# dummy columns, and drop_first removes the same reference level in both.
# Encoding the frames independently only lines up when they happen to contain
# identical value sets. A test value never seen in training becomes missing
# and is encoded as all zeros for that column group.
cols = ['day_of_week','road_type', 'start_turn_restricted', 'end_turn_restricted'] # , 'road_name'
for col in cols:
    # pd.Categorical yields the sorted unique non-null values, i.e. the same
    # levels get_dummies would derive from the raw training column.
    shared_dtype = pd.CategoricalDtype(pd.Categorical(train[col]).categories)
    train[col] = train[col].astype(shared_dtype)
    test[col] = test[col].astype(shared_dtype)
train = pd.get_dummies(train, prefix_sep='_', sparse=False, drop_first=True, columns = cols)
test = pd.get_dummies(test, prefix_sep='_', sparse=False, drop_first=True, columns = cols)

In [6]:
# Derive the start-to-end distance from the latitude/longitude pairs
# (haversine's default unit is used).
# raw=True hands each row to the lambda as a plain NumPy array, so x[0]..x[3]
# are genuine positional lookups. Without it each row is a Series labelled by
# column name, and integer keys on such a Series rely on a deprecated
# positional fallback.
coord_cols = ["start_latitude", "start_longitude", "end_latitude", "end_longitude"]
train["distance"] = train[coord_cols].apply(lambda x: haversine((x[0], x[1]), (x[2], x[3])), axis="columns", raw=True)
test["distance"] = test[coord_cols].apply(lambda x: haversine((x[0], x[1]), (x[2], x[3])), axis="columns", raw=True)

In [7]:
# Drop the columns that are not passed to the model.
del_cols = [
    'id',
    'height_restricted',
    'vehicle_restricted',
    'start_node_name',
    'end_node_name',
    'base_date',
    'road_name',
]
train = train.drop(columns=del_cols)
test = test.drop(columns=del_cols)

## 모델링

In [8]:
# Force a garbage-collection pass before modelling; the value shown as the
# cell output is the number of unreachable objects the collector found.
gc.collect()

40

In [9]:
def _sanitize_columns(columns):
    """Return ASCII-safe, unique column names in the original order.

    Every character outside ``A-Za-z0-9_`` is removed from each name. Removing
    characters can collapse distinct columns onto the same name (for example
    dummy columns whose category suffix consists only of non-ASCII
    characters), so repeated results get a numeric suffix (``_1``, ``_2``,
    ...) in order of appearance. Names that stay unique are returned exactly
    as the plain substitution would produce them.

    Args:
        columns: Iterable of string column names.

    Returns:
        List of sanitised names, one per input name.
    """
    taken = set()
    cleaned = []
    for original in columns:
        base = re.sub('[^A-Za-z0-9_]+', '', original)
        candidate = base
        suffix = 0
        # Keep bumping the suffix until the name is not in use yet.
        while candidate in taken:
            suffix += 1
            candidate = f'{base}_{suffix}'
        taken.add(candidate)
        cleaned.append(candidate)
    return cleaned


# Suffixes are assigned by order of appearance within each colliding name, so
# train and test receive matching names as long as their shared columns appear
# in the same relative order.
train.columns = _sanitize_columns(train.columns)
test.columns = _sanitize_columns(test.columns)

In [10]:
from sklearn.model_selection import train_test_split, KFold, cross_val_score

# Hold out a shuffled 20% of the training frame for validation. Note that
# X_test / y_test are this hold-out part of `train`, not the `test` frame.
X_train, X_test, y_train, y_test = train_test_split(
    train.drop(columns='target'),
    train['target'],
    test_size=0.2,
    shuffle=True,
    random_state=42,
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((3760973, 22), (940244, 22), (3760973,), (940244,))

In [11]:
%%time 
# LightGBM regressor with only the random seed set explicitly, fitted on the
# training part of the split. (Despite the name, LR is an LGBMRegressor.)
LR = lgb.LGBMRegressor(random_state=42).fit(X_train, y_train)

Wall time: 28.2 s


## 예측

In [12]:
# Predict the hold-out split produced by train_test_split (not the `test` frame).
pred = LR.predict(X_test)

In [13]:
# Mean absolute error of the hold-out predictions against the hold-out targets.
mean_absolute_error(y_test, pred)

4.69978454405773

## 제출

In [14]:
# Load the submission template; its 'target' column is overwritten with the
# model predictions further down.
sample_submission = pd.read_csv('input/sample_submission.csv')

In [15]:
# Refit a regressor with the same settings on the complete training frame
# (training and hold-out rows together) for the submission predictions.
LR2 = lgb.LGBMRegressor(random_state=42).fit(train.drop('target', axis=1), train['target'])

In [16]:
# Predict the test frame and store the result as the submission target.
# NOTE(review): the predictions are assigned by position, so this assumes the
# rows of `test` are still in the same order as sample_submission — confirm.
sample_submission['target'] = LR2.predict(test)

In [17]:
# Make sure the target folder exists before writing: to_csv does not create
# missing parent directories and would otherwise fail at the very last step.
os.makedirs("output", exist_ok=True)
sample_submission.to_csv("output/20221023-1.csv", index = False)