In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so that files
# stored on the drive can be read through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings(action='ignore')

In [None]:
# Memory saving function credit to https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df, allow_float16=True, verbose=False):
    """Downcast numeric columns of ``df`` to the narrowest dtype that fits them.

    Every column backed by a plain NumPy signed-integer, unsigned-integer or
    floating-point dtype is inspected; if its observed ``[min, max]`` range fits
    (bounds inclusive) into a narrower dtype of the same family, the column is
    replaced by its downcast version. Columns of any other dtype (bool, object,
    string, category, datetime, pandas extension dtypes, ...) are left untouched,
    and a column is never widened.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    allow_float16 : bool, default True
        When True, float columns whose range fits are stored as ``float16``.
        ``float16`` keeps only about 3 significant decimal digits, so large
        magnitudes or fine-grained fractions are rounded; pass False to use
        ``float32`` as the narrowest float type.
    verbose : bool, default False
        When True, print the memory usage before and after the conversion.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for call chaining.
    """
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32, np.float64)
    if not allow_float16:
        float_types = float_types[1:]

    if verbose:
        start_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        # Only plain NumPy numeric dtypes are safe to downcast by value range;
        # extension dtypes (nullable ints, category, string, ...) are skipped.
        if not isinstance(col_type, np.dtype):
            continue
        if col_type.kind == 'i':
            candidates, limits_of = signed_types, np.iinfo
        elif col_type.kind == 'u':
            candidates, limits_of = unsigned_types, np.iinfo
        elif col_type.kind == 'f':
            candidates, limits_of = float_types, np.finfo
        else:
            # bool, object, datetime, timedelta, complex, ... stay as they are.
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        # Candidates are ordered narrowest first; stop as soon as a candidate is
        # no narrower than the current dtype so that nothing is ever upcast.
        for target in candidates:
            if np.dtype(target).itemsize >= col_type.itemsize:
                break
            limits = limits_of(target)
            # Inclusive bounds: a value equal to the dtype limit still fits.
            # NaN min/max (empty or all-NaN column) fails both comparisons, so
            # such a column keeps its dtype.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(target)
                break

    if verbose:
        end_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        if start_mem > 0:
            print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
# Directory prefix (under the mounted Google Drive) that is prepended to the
# CSV file names when the training data is loaded.
base_path = '/content/drive/MyDrive/modelingPUBG/data/featured_data/'

In [None]:
# Read each preprocessed training CSV and immediately downcast its dtypes.
train_1, train_2, train_3 = (
    reduce_mem_usage(pd.read_csv(base_path + csv_name))
    for csv_name in (
        'featured_train_1.csv',  # 1st-round preprocessed data
        'featured_train_2.csv',  # 2nd-round preprocessed data
        'featured_train_3.csv',  # 3rd-round preprocessed data
    )
)

In [None]:
# Print the column dtypes and memory footprint of the 1st-round dataset.
train_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4446965 entries, 0 to 4446964
Data columns (total 21 columns):
 #   Column           Dtype  
---  ------           -----  
 0   assists          int8   
 1   boosts           int8   
 2   damageDealt      float16
 3   DBNOs            int8   
 4   headshotKills    int8   
 5   heals            int8   
 6   killPlace        int8   
 7   killPoints       int16  
 8   kills            int8   
 9   killStreaks      int8   
 10  longestKill      float16
 11  maxPlace         int8   
 12  numGroups        int8   
 13  revives          int8   
 14  rideDistance     float16
 15  swimDistance     float16
 16  vehicleDestroys  int8   
 17  walkDistance     float16
 18  weaponsAcquired  int16  
 19  winPoints        int16  
 20  winPlacePerc     float16
dtypes: float16(6), int16(3), int8(12)
memory usage: 127.2 MB


In [None]:
# Print the column dtypes and memory footprint of the 2nd-round dataset.
train_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4446965 entries, 0 to 4446964
Data columns (total 18 columns):
 #   Column           Dtype  
---  ------           -----  
 0   assists          int8   
 1   boosts           int8   
 2   damageDealt      float16
 3   DBNOs            int8   
 4   headshotKills    int8   
 5   heals            int8   
 6   killPlace        int8   
 7   kills            int8   
 8   longestKill      float16
 9   revives          int8   
 10  rideDistance     float16
 11  swimDistance     float16
 12  vehicleDestroys  int8   
 13  walkDistance     float16
 14  weaponsAcquired  int16  
 15  winPlacePerc     float16
 16  matchType2       int8   
 17  cat_numGroups    int8   
dtypes: float16(6), int16(1), int8(11)
memory usage: 106.0 MB


In [7]:
# Print the column dtypes and memory footprint of the 3rd-round dataset.
train_3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2023437 entries, 0 to 2023436
Data columns (total 19 columns):
 #   Column               Dtype  
---  ------               -----  
 0   assists              float16
 1   DBNOs                float16
 2   headshotKills        float16
 3   killStreaks          float16
 4   longestKill          float16
 5   numGroups            float16
 6   rankPoints           float16
 7   revives              float16
 8   damageDealtNorm      float16
 9   killsNorm            float16
 10  totalDistance        float16
 11  matchType            float16
 12  cat_vehicleDestroys  float16
 13  cat_weaponsAcquired  float16
 14  killsWithoutMoving   float16
 15  cat_walkDistance     float16
 16  binary_rideAndswim   float16
 17  heals_boosts         float16
 18  winPlacePerc         float16
dtypes: float16(19)
memory usage: 73.3 MB


# 3차 Modeling
- 팀원들이 EDA를 통해 파악한 인사이트로 데이터 전처리를 진행한 파일(3차)과 간단한 데이터 전처리만 진행한 파일(1차, 2차)을 각각 모델링하여 성능을 비교한다.

## train, test split

In [None]:
from sklearn.model_selection import train_test_split

def check_error(model_, data_, test_size=0.2, random_state=100):
    """Fit ``model_`` on a train split of ``data_`` and return the hold-out MAE.

    The ``'winPlacePerc'`` column is used as the regression target and every
    other column as a feature. The frame is split with ``train_test_split``,
    the model is fitted on the training part, and the mean absolute error of
    its predictions on the held-out part is returned.

    Parameters
    ----------
    model_ : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place, so pass a fresh instance for every evaluation.
    data_ : pandas.DataFrame
        Frame containing the feature columns and a ``'winPlacePerc'`` column.
    test_size : float, default 0.2
        Forwarded to ``train_test_split`` as the hold-out share.
    random_state : int, default 100
        Forwarded to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    float
        Mean absolute error on the hold-out split.

    Raises
    ------
    KeyError
        If ``data_`` has no ``'winPlacePerc'`` column.
    """
    # Imported locally so the helper works no matter which cell performs the
    # notebook-level metric import, or whether it has run yet.
    from sklearn.metrics import mean_absolute_error

    features = data_.drop(columns='winPlacePerc')
    target = data_['winPlacePerc']

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )

    model_.fit(X_train, y_train)
    predictions = model_.predict(X_test)
    return mean_absolute_error(y_test, predictions)

## ML model

In [None]:
# ML model
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from xgboost.sklearn import XGBRegressor
from lightgbm.sklearn import LGBMRegressor


from sklearn.metrics import mean_absolute_error

In [None]:
# LinearRegression: hold-out MAE of a fresh model on each preprocessed dataset.

result_1, result_2, result_3 = (
    check_error(LinearRegression(), dataset)
    for dataset in (train_1, train_2, train_3)
)

print(
    f"MAE of LinearRegression_1: {result_1:.4f}",
    f"MAE of LinearRegression_2: {result_2:.4f}",
    f"MAE of LinearRegression_3: {result_3:.4f}",
    sep="\n",
)

MAE of LinearRegression_1: 0.0968
MAE of LinearRegression_2: 0.1049
MAE of LinearRegression_3: 0.1060


In [None]:
# Lasso: hold-out MAE of a fresh default model on each preprocessed dataset.

result_1, result_2, result_3 = (
    check_error(Lasso(), dataset)
    for dataset in (train_1, train_2, train_3)
)

print(
    f"MAE of Lasso_1: {result_1:.4f}",
    f"MAE of Lasso_2: {result_2:.4f}",
    f"MAE of Lasso_3: {result_3:.4f}",
    sep="\n",
)

MAE of Lasso_1: 0.1261
MAE of Lasso_2: 0.1261
MAE of Lasso_3: 0.1656


In [None]:
# Ridge: hold-out MAE of a fresh default model on each preprocessed dataset.

result_1, result_2, result_3 = (
    check_error(Ridge(), dataset)
    for dataset in (train_1, train_2, train_3)
)

print(
    f"MAE of Ridge_1: {result_1:.4f}",
    f"MAE of Ridge_2: {result_2:.4f}",
    f"MAE of Ridge_3: {result_3:.4f}",
    sep="\n",
)

MAE of Ridge_1: 0.0968
MAE of Ridge_2: 0.1049
MAE of Ridge_3: 0.1060


In [None]:
# LGBMRegressor: hold-out MAE of a fresh default model on each preprocessed dataset.

result_1, result_2, result_3 = (
    check_error(LGBMRegressor(), dataset)
    for dataset in (train_1, train_2, train_3)
)

print(
    f"MAE of LGBMRegressor_1: {result_1:.4f}",
    f"MAE of LGBMRegressor_2: {result_2:.4f}",
    f"MAE of LGBMRegressor_3: {result_3:.4f}",
    sep="\n",
)


MAE of LGBMRegressor_1: 0.0638
MAE of LGBMRegressor_2: 0.0681
MAE of LGBMRegressor_3: 0.0808


In [None]:
# XGBRegressor: hold-out MAE of a fresh default model on each preprocessed dataset.

result_1, result_2, result_3 = (
    check_error(XGBRegressor(), dataset)
    for dataset in (train_1, train_2, train_3)
)

print(
    f"MAE of XGBRegressor_1: {result_1:.4f}",
    f"MAE of XGBRegressor_2: {result_2:.4f}",
    f"MAE of XGBRegressor_3: {result_3:.4f}",
    sep="\n",
)

MAE of XGBRegressor_1: 0.0721
MAE of XGBRegressor_2: 0.0741
MAE of XGBRegressor_3: 0.0839
