## **Summary from outlier**

1. headshotrate > 0.5
2. damagedealt >= 4000
3. kills > 60
4. killstreak > 10
5. walkdistance > 7.5k
6. weaponacquired > 20
7. heals > 40
8. boosts > 20
9. totaldistance == 0 & kills > 0
10. walkdistance == 0 & kills > 0
11. ridedistance == 0 & roadkills > 0
12. weaponacquired == 0 & winplaceperc > 0.5
13. heals == 0 & winplaceperc > 0.8
14. heals and boosts == 0 & winplaceperc > 0.8
15. one NaN in target value

In [1]:
import os, time, gc
import pandas as pd, numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
os.listdir('input')

['sample_submission_V2.csv', 'test_V2.csv', 'train_V2.csv']

In [3]:
%%time
tr = pd.read_csv("input/train_V2.csv")
te = pd.read_csv("input/test_V2.csv")

Wall time: 12.4 s


In [4]:
def missing_values_table(df):# Function to calculate missing values by column# Funct 
    mis_val = df.isnull().sum() # Total missing values
    mis_val_pct = 100 * df.isnull().sum() / len(df)# Percentage of missing values
    mis_val_df = pd.concat([mis_val, mis_val_pct], axis=1)# Make a table with the results
    mis_val_df_cols = mis_val_df.rename(columns = {0 : 'Missing Values', 1 : '% of Total Values'})# Rename the columns
    mis_val_df_cols = mis_val_df_cols[mis_val_df_cols.iloc[:,1] != 0].sort_values('% of Total Values', ascending=False).round(1)# Sort the table by percentage of missing descending
    print ("Dataframe has " + str(df.shape[1]) + " columns.\n" 
           "There are " + str(mis_val_df_cols.shape[0]) + " cols having missing values.")# Print some summary information
    return mis_val_df_cols # Return the dataframe with missing information

In [5]:
missing_values_table(tr)

Dataframe has 29 columns.
There are 1 cols having missing values.


Unnamed: 0,Missing Values,% of Total Values
winPlacePerc,1,0.0


In [6]:
missing_values_table(te)

Dataframe has 28 columns.
There are 0 cols having missing values.


Unnamed: 0,Missing Values,% of Total Values


In [7]:
tr[tr['winPlacePerc'].isnull()]

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,...,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
2744604,f70c74418bb064,12dfbede33f92b,224a123c53e008,0,0,0.0,0,0,0,1,...,0,0.0,0,0.0,0,0,0.0,0,0,


In [8]:
tr.columns

Index(['Id', 'groupId', 'matchId', 'assists', 'boosts', 'damageDealt', 'DBNOs',
       'headshotKills', 'heals', 'killPlace', 'killPoints', 'kills',
       'killStreaks', 'longestKill', 'matchDuration', 'matchType', 'maxPlace',
       'numGroups', 'rankPoints', 'revives', 'rideDistance', 'roadKills',
       'swimDistance', 'teamKills', 'vehicleDestroys', 'walkDistance',
       'weaponsAcquired', 'winPoints', 'winPlacePerc'],
      dtype='object')

In [9]:
tr.drop(2744604, inplace=True)

## **Make Basic Features**

In [10]:
def base(df):
    df['headshot_rate'] = df['headshotKills']/df['kills']
    df['roadkill_rate'] = df['roadKills']/df['kills']
    
    df['total_distance'] = df[df.filter(regex='Dist').columns.tolist()].sum(axis=1)
    df['heals_n_boosts'] = df[['heals','boosts']].sum(axis=1)
    df['walkdist_rate'] = df['walkDistance']/df['total_distance']
    df['swimdist_rate'] = df['swimDistance']/df['total_distance']
    df['ridedist_rate'] = df['rideDistance']/df['total_distance']
    
    df['kills_n_assists'] = df['kills'] + df['assists']
    df['kills_n_assists_n_revives'] = df['kills'] + df['assists'] + df['revives']
    df['kills_n_assists_knockouts'] = df['kills_n_assists'] - df['DBNOs']
    df['kills_n_assists_n_revives_knockouts'] = df['kills_n_assists_n_revives'] - df['DBNOs']
    
    return df

In [11]:
tr = base(tr)
te = base(te)

In [12]:
tr['is_outlier'] = np.where(tr['headshot_rate']> 0.5, 1, 0)
tr['is_outlier'] = np.where(tr['damageDealt']>=4000, 1, 0)
tr['is_outlier'] = np.where(tr['kills']> 60, 1, 0)
tr['is_outlier'] = np.where(tr['killStreaks']> 10, 1, 0)
tr['is_outlier'] = np.where(tr['walkDistance']> 7500, 1, 0)
tr['is_outlier'] = np.where(tr['weaponsAcquired']> 20, 1, 0)
tr['is_outlier'] = np.where(tr['heals']> 40, 1, 0)
tr['is_outlier'] = np.where(tr['boosts']> 20, 1, 0)
tr['is_outlier'] = np.where((tr['total_distance']==0)&(tr['kills']!=0), 1, 0)
tr['is_outlier'] = np.where((tr['walkDistance']==0)&(tr['kills']!=0), 1, 0)
tr['is_outlier'] = np.where((tr['rideDistance']==0)&(tr['roadKills']!=0), 1, 0)
tr['is_outlier'] = np.where((tr['weaponsAcquired']==0)&(tr['winPlacePerc']>0.5), 1, 0)
tr['is_outlier'] = np.where((tr['heals']==0)&(tr['winPlacePerc']>0.8), 1, 0)
tr['is_outlier'] = np.where((tr['heals_n_boosts']==0)&(tr['winPlacePerc']>0.8), 1, 0)

In [13]:
print("The total number of outliers")
print("n = {}".format(tr['is_outlier'].sum()))
print("% = {}".format(tr['is_outlier'].sum()/tr.shape[0]))

The total number of outliers
n = 81343
% = 0.01829180126220917


In [14]:
print(tr.shape)
tr = tr[tr['is_outlier']==0]
print(tr.shape)

(4446965, 41)
(4365622, 41)


# **Feature Engineering**

In [15]:
def reduce_mem_usage(df):
    # iterate through all the columns of a dataframe and modify the data type
    #   to reduce memory usage.        
    
    start_mem = df.memory_usage().sum() / 1024**2
#     print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
#     print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
#     print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [16]:
agg_cols = tr.drop(['Id','groupId','matchId','matchType','winPlacePerc','is_outlier'],axis=1).columns.tolist()
by_cols = ['matchType','groupId','matchId']
agg_ls = ['mean','max','min','std']

In [17]:
len(by_cols)*len(agg_ls)*len(agg_cols)*2

840

In [18]:
def FE(df):
    for BY in by_cols:
        for col in tqdm(agg_cols):
            for agg in agg_ls:
                new_col = "{}_{}_{}".format(BY, col, agg)
                diff_col = "{}_{}_{}_diff".format(BY, col, agg)

                df[new_col] = df.groupby([BY])[col].transform(agg).reset_index(drop=True).rename(columns={"{}".format(col):"{}_{}".format(BY, col)})
                df[diff_col] = df[new_col] - df[col]
            df = reduce_mem_usage(df)
    return df

In [19]:
tr.head()

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,...,total_distance,heals_n_boosts,walkdist_rate,swimdist_rate,ridedist_rate,kills_n_assists,kills_n_assists_n_revives,kills_n_assists_knockouts,kills_n_assists_n_revives_knockouts,is_outlier
0,7f96b2f878858a,4d4b580de459be,a10357fd1a4a91,0,0,0.0,0,0,0,60,...,244.8,0,1.0,0.0,0.0,0,0,0,0,0
1,eef90569b9d03c,684d5656442f9e,aeb375fc57110c,0,0,91.47,0,0,0,57,...,1445.0445,0,0.992357,0.00764,3e-06,0,0,0,0,0
2,1eaf90ac73de72,6a4a42c3245a74,110163d8bb94ae,1,0,68.0,0,0,0,47,...,161.8,0,1.0,0.0,0.0,1,1,1,1,0
3,4616d365dd2853,a930a9c79cd721,f1f1f4ef412d7e,0,0,32.9,0,0,0,75,...,202.7,0,1.0,0.0,0.0,0,0,0,0,0
4,315c96c26c9aac,de04010b3458dd,6dc8ff871e21e6,0,0,100.0,0,0,0,45,...,49.75,0,1.0,0.0,0.0,1,1,1,1,0


In [20]:
%%time
tr = FE(tr)
te = FE(te)

100%|██████████████████████████████████████████████████████████████████████████████████| 35/35 [08:49<00:00, 15.12s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 35/35 [36:36<00:00, 62.75s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 35/35 [37:36<00:00, 64.46s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 35/35 [02:48<00:00,  4.81s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 35/35 [12:04<00:00, 20.69s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 35/35 [11:31<00:00, 19.75s/it]

Wall time: 1h 49min 25s





In [21]:
tr.shape, te.shape

((4365622, 881), (1934174, 879))

# Build Model

**Train model w/o outliers**

In [25]:
# exclude target and outlier columns
base_feat = tr.drop(['Id','groupId','matchId','winPlacePerc','is_outlier'],axis=1).columns.tolist()
train_Y = tr['winPlacePerc']

In [26]:
len(base_feat)

876

Make object type to category for lightgbm model to take.

In [27]:
tr[base_feat].dtypes.value_counts()

float16    819
float64     32
int8        19
int16        5
object       1
dtype: int64

In [28]:
tr[base_feat].select_dtypes(include=['object']).columns

Index(['matchType'], dtype='object')

In [29]:
tr['matchType'] = tr['matchType'].astype('category')
te['matchType'] = te['matchType'].astype('category')

Now object type changed to category

In [30]:
tr[base_feat].dtypes.value_counts()

float16     819
float64      32
int8         19
int16         5
category      1
dtype: int64

In [31]:
import time
from sklearn import metrics
from operator import itemgetter
import lightgbm as lgb

In [32]:
from sklearn.model_selection import train_test_split
def LGB_HOLDOUT_REG(size_ratio, train_X, test_X, metric, is_shuffle):
    X_tr,X_val,y_tr,y_val = train_test_split(train_X, train_Y, train_size = size_ratio, shuffle=is_shuffle)

    val_lgb = np.zeros(len(X_val))
    predictions = np.zeros(len(test_X))
    
    # Model parameters
    lgb_params = {'num_leaves': 31,
                 'min_data_in_leaf': 20, 
                 'objective':'regression',
                 'max_depth': -1,
                 'learning_rate': 0.01,
                 "boosting": "gbdt",
                 "feature_fraction": 1,
                 "bagging_freq": 1,
                 "bagging_fraction": 1,
                 "bagging_seed": 42,
                 "metric": metric,
                 "lambda_l1": 0.0,
                 "verbosity": 100,
                 "nthread": -1,
                 "random_state": 42}

    model_start = time.time()

    model = lgb.LGBMRegressor(**lgb_params, n_estimators = 20000, n_jobs = -1)
    model.fit(X_tr, 
              y_tr, 
              eval_set=[(X_tr, y_tr), (X_val, y_val)], 
              eval_metric=metric,
              verbose=100, 
              early_stopping_rounds=200)
    val_lgb = model.predict(X_val, num_iteration=model.best_iteration_)

    cv_score = model.best_score_

    #feature importance
    feature_importance_df = pd.DataFrame()
    feature_importance_df["Feature"] = train_X.columns
    feature_importance_df["importance"] = model.feature_importances_[:len(train_X.columns)]

    #predictions
    predictions = model.predict(test_X, num_iteration=model.best_iteration_)

    print("-" * 50)
    print("HOLD_OUT "+ metric + " = {}".format(cv_score))
    lgb.plot_metric(model, metric=metric, title='auc plot', xlabel='Iterations', ylabel='auto', figsize=(10,8), grid=False)

    model_end = time.time()
    model_elapsed = model_end - model_start
    print('Model elapsed {0:0.2f}'.format(model_elapsed/60), "minutes.")

    # FEATURE IMPORTANCE
    pd.set_option('display.max_rows', 500)
    feature_importance_df['Feature Rank'] = feature_importance_df['importance'].rank(ascending=0)
    feature_importance_df = feature_importance_df.sort_values('Feature Rank', ascending = True)
    print(feature_importance_df.loc[feature_importance_df['importance']!=0].head(100))

    return predictions, val_lgb, cv_score, feature_importance_df

In [None]:
pred, val, cv, feat = LGB_HOLDOUT_REG(0.8, tr[base_feat], te[base_feat], 'l1', is_shuffle=True)