## **Summary from outlier**

1. headshotrate > 0.5
2. damagedealt >= 4000
3. kills > 60
4. killstreak > 10
5. walkdistance > 7.5k
6. weaponacquired > 20
7. heals > 40
8. boosts > 20
9. totaldistance == 0 & kills > 0
10. walkdistance == 0 & kills > 0
11. ridedistance == 0 & roadkills > 0
12. weaponacquired == 0 & winplaceperc > 0.5
13. heals == 0 & winplaceperc > 0.8
14. heals and boosts == 0 & winplaceperc > 0.8
15. one NaN in target value

In [52]:
import os, time, gc, random
import pandas as pd, numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
os.listdir('input')

['sample_submission_V2.csv', 'test_V2.csv', 'train_V2.csv']

In [3]:
%%time
tr = pd.read_csv("input/train_V2.csv")
te = pd.read_csv("input/test_V2.csv")

Wall time: 25.8 s


In [4]:
def missing_values_table(df):# Function to calculate missing values by column# Funct 
    mis_val = df.isnull().sum() # Total missing values
    mis_val_pct = 100 * df.isnull().sum() / len(df)# Percentage of missing values
    mis_val_df = pd.concat([mis_val, mis_val_pct], axis=1)# Make a table with the results
    mis_val_df_cols = mis_val_df.rename(columns = {0 : 'Missing Values', 1 : '% of Total Values'})# Rename the columns
    mis_val_df_cols = mis_val_df_cols[mis_val_df_cols.iloc[:,1] != 0].sort_values('% of Total Values', ascending=False).round(1)# Sort the table by percentage of missing descending
    print ("Dataframe has " + str(df.shape[1]) + " columns.\n" 
           "There are " + str(mis_val_df_cols.shape[0]) + " cols having missing values.")# Print some summary information
    return mis_val_df_cols # Return the dataframe with missing information

In [5]:
missing_values_table(tr)

Dataframe has 29 columns.
There are 1 cols having missing values.


Unnamed: 0,Missing Values,% of Total Values
winPlacePerc,1,0.0


In [6]:
missing_values_table(te)

Dataframe has 28 columns.
There are 0 cols having missing values.


Unnamed: 0,Missing Values,% of Total Values


In [7]:
tr[tr['winPlacePerc'].isnull()]

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,...,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
2744604,f70c74418bb064,12dfbede33f92b,224a123c53e008,0,0,0.0,0,0,0,1,...,0,0.0,0,0.0,0,0,0.0,0,0,


In [8]:
tr.drop(2744604, inplace=True)

In [9]:
def base(df):
    df['headshot_rate'] = df['headshotKills']/df['kills']
    df['total_distance'] = df[df.filter(regex='Dist').columns.tolist()].sum(axis=1)
    df['heals_n_boosts'] = df[['heals','boosts']].sum(axis=1)
    return df

In [10]:
tr = base(tr)
te = base(te)

In [11]:
tr['is_outlier'] = np.where(tr['headshot_rate']> 0.5, 1, 0)
tr['is_outlier'] = np.where(tr['damageDealt']>=4000, 1, 0)
tr['is_outlier'] = np.where(tr['kills']> 60, 1, 0)
tr['is_outlier'] = np.where(tr['killStreaks']> 10, 1, 0)
tr['is_outlier'] = np.where(tr['walkDistance']> 7500, 1, 0)
tr['is_outlier'] = np.where(tr['weaponsAcquired']> 20, 1, 0)
tr['is_outlier'] = np.where(tr['heals']> 40, 1, 0)
tr['is_outlier'] = np.where(tr['boosts']> 20, 1, 0)
tr['is_outlier'] = np.where((tr['total_distance']==0)&(tr['kills']!=0), 1, 0)
tr['is_outlier'] = np.where((tr['walkDistance']==0)&(tr['kills']!=0), 1, 0)
tr['is_outlier'] = np.where((tr['rideDistance']==0)&(tr['roadKills']!=0), 1, 0)
tr['is_outlier'] = np.where((tr['weaponsAcquired']==0)&(tr['winPlacePerc']>0.5), 1, 0)
tr['is_outlier'] = np.where((tr['heals']==0)&(tr['winPlacePerc']>0.8), 1, 0)
tr['is_outlier'] = np.where((tr['heals_n_boosts']==0)&(tr['winPlacePerc']>0.8), 1, 0)

In [12]:
print("The total number of outliers")
print("n = {}".format(tr['is_outlier'].sum()))
print("% = {}".format(tr['is_outlier'].sum()/tr.shape[0]))

The total number of outliers
n = 81343
% = 0.01829180126220917


# Build Model

**Train model w/o outliers**

In [13]:
print(tr.shape)
tr = tr[tr['is_outlier']==0]
print(tr.shape)

(4446965, 33)
(4365622, 33)


In [16]:
tr[base_feat].select_dtypes(include=['object']).columns

Index(['matchType'], dtype='object')

**make object dtype to ohe**

In [26]:
tr = pd.concat([tr,pd.get_dummies(tr['matchType'])], axis=1)
te = pd.concat([te,pd.get_dummies(te['matchType'])], axis=1)
tr.head()

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,...,normal-duo,normal-duo-fpp,normal-solo,normal-solo-fpp,normal-squad,normal-squad-fpp,solo,solo-fpp,squad,squad-fpp
0,7f96b2f878858a,4d4b580de459be,a10357fd1a4a91,0,0,0.0,0,0,0,60,...,0,0,0,0,0,0,0,0,0,1
1,eef90569b9d03c,684d5656442f9e,aeb375fc57110c,0,0,91.47,0,0,0,57,...,0,0,0,0,0,0,0,0,0,1
2,1eaf90ac73de72,6a4a42c3245a74,110163d8bb94ae,1,0,68.0,0,0,0,47,...,0,0,0,0,0,0,0,0,0,0
3,4616d365dd2853,a930a9c79cd721,f1f1f4ef412d7e,0,0,32.9,0,0,0,75,...,0,0,0,0,0,0,0,0,0,1
4,315c96c26c9aac,de04010b3458dd,6dc8ff871e21e6,0,0,100.0,0,0,0,45,...,0,0,0,0,0,0,0,1,0,0


#### **Scaling**

In [43]:
# exclude target and outlier columns
base_feat = tr.columns[3:-21].tolist()+tr.columns[29:32].tolist()+tr.columns[33:].tolist()
train_Y = tr['winPlacePerc']

Make object type to category for lightgbm model to take.

In [44]:
tr[base_feat].dtypes.value_counts()

int64      20
uint8      16
float64     7
object      1
dtype: int64

In [45]:
tr[base_feat].columns

Index(['assists', 'boosts', 'damageDealt', 'DBNOs', 'headshotKills', 'heals',
       'killPlace', 'killPoints', 'kills', 'killStreaks', 'longestKill',
       'matchDuration', 'matchType', 'maxPlace', 'numGroups', 'rankPoints',
       'revives', 'rideDistance', 'roadKills', 'swimDistance', 'teamKills',
       'vehicleDestroys', 'walkDistance', 'weaponsAcquired', 'winPoints',
       'headshot_rate', 'total_distance', 'heals_n_boosts', 'crashfpp',
       'crashtpp', 'duo', 'duo-fpp', 'flarefpp', 'flaretpp', 'normal-duo',
       'normal-duo-fpp', 'normal-solo', 'normal-solo-fpp', 'normal-squad',
       'normal-squad-fpp', 'solo', 'solo-fpp', 'squad', 'squad-fpp'],
      dtype='object')

In [46]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
tr_X = scaler.fit_transform(tr[base_feat].drop(['matchType'],axis=1))
te_X = scaler.transform(te[base_feat].drop(['matchType'],axis=1))

In [49]:
tr_X = pd.DataFrame(tr_X)
tr_X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,33,34,35,36,37,38,39,40,41,42
0,-0.397433,-0.653651,-0.767703,-0.576252,-0.378447,-0.517255,0.440453,1.172702,-0.595752,-0.76631,...,-0.006615,-0.035094,-0.008413,-0.018761,-0.010541,-0.061388,-0.206225,-0.370437,-0.404875,1.23799
1,-0.397433,-0.653651,-0.232835,-0.576252,-0.378447,-0.517255,0.331714,-0.804599,-0.595752,-0.76631,...,-0.006615,-0.035094,-0.008413,-0.018761,-0.010541,-0.061388,-0.206225,-0.370437,-0.404875,1.23799
2,1.302641,-0.653651,-0.370075,-0.576252,-0.378447,-0.517255,-0.030751,-0.804599,-0.595752,-0.76631,...,-0.006615,-0.035094,-0.008413,-0.018761,-0.010541,-0.061388,-0.206225,-0.370437,-0.404875,-0.807761
3,-0.397433,-0.653651,-0.575321,-0.576252,-0.378447,-0.517255,0.984151,-0.804599,-0.595752,-0.76631,...,-0.006615,-0.035094,-0.008413,-0.018761,-0.010541,-0.061388,-0.206225,-0.370437,-0.404875,1.23799
4,-0.397433,-0.653651,-0.182956,-0.576252,-0.378447,-0.517255,-0.103244,-0.804599,0.045543,0.638162,...,-0.006615,-0.035094,-0.008413,-0.018761,-0.010541,-0.061388,-0.206225,2.699516,-0.404875,-0.807761


### **Build NN Model**

In [70]:
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
from torch.nn.utils.weight_norm import weight_norm
from torch.optim import Optimizer
from sklearn import preprocessing
from sklearn.model_selection import KFold, train_test_split
import joblib

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

SEED = 42
seed_everything(SEED)

In [58]:
def PubgDataset(x, y):
    return TensorDataset(
        torch.from_numpy(x).float(),
        torch.from_numpy(y).float()
    )

In [60]:
weight_norm

<function torch.nn.utils.weight_norm.weight_norm(module, name='weight', dim=0)>

In [61]:
def build_loader(x, y, batch_size, shuffle, num_workers):
    dataset = PubgDataset(x, y)
    return DataLoader(
        dataset, batch_size = batch_size, shuffle=shuffle, num_workers=num_workers
    )

In [62]:
class MLP(nn.Module):
    def __init__(self, num_features):
        super(MLP, self).__init__()
        self.model = nn.Sequential(
            weight_norm(nn.Linear(num_features, 128)),
            nn.ReLU(),
            weight_norm(nn.Linear(128, 128)),
            nn.ReLU(),
            weight_norm(nn.Linear(128, 128)),
            nn.ReLU(),
            weight_norm(nn.Linear(128, 128)),
            nn.ReLU(),
            weight_norm(nn.Linear(128, 1))
            
        )
        
    def forward(self, x):
        out = self.model(x)
        return out
        

In [68]:
train_Y = tr['winPlacePerc']

In [71]:
X_tr, X_val, y_tr, y_val = train_test_split(tr_X, train_Y, train_size = 0.8, shuffle=True)

In [None]:
%%time
train_loader = build_loader(X_tr, y_tr, batch_size=256, shuffle=True)
valid_loader = build_loader(X_val, y_val, batch_size=256, shuffle=False)

In [75]:
num_features=X_tr.shape[1]
print(num_features)

43


In [56]:
device = 'cuda:0'
use_gpu = cuda.is_available()
if use_gpu:
    print("enable gpu use")
else:
    print("enable cpu for debugging")

Using cuda:0


In [67]:
model = MLP(num_features=num_features).to(device)
optimizer = optim.Adam(model.parameters(), lr, weight_decay=0.00025)
scheduler = ReduceLROnPlateau(optimizer, 'min', patience=5, factor=0.5)
criterion = nn.L1Loss()

def to_numpy(t):
    return t.cpu().detach().numpy()

best_mae_score = 9999
EPOCH = 100
model_fname = '../PUBG/weight/mlp_v1.pt'

log_df = pd.DataFrame(columns = ['epoch_idx', 'train_loss', 'valid_loss'])

print("Started..")

for epoch_idx in range(1, EPOCH+1):
    start_time = time.time()
    
    train_loss = 0
    model.train()
    optimizer.zero_grad()
    
    for batch_idx, (data, target) in enumerate(train_loader):
        if use_gpu:
            data = data.to(device)
            target = target.to(device)
        output = model(data)
        loss = criterion(output, target)
        
        loss.backward()
        
        optimizer.step()
        optimizer.zero_grad()
        
        train_loss += loss.item()/len(train_loader)
    
    model.eval()
    valid_loss = 0
    
    with torch.no_grad():
        for batch_idx, (data, target) in enumerata(valid_loader):
            if use_gpu:
            data = data.to(device)
            target = target.to(device)
        output = model(data)
        loss = criterion(output, target)
        
        predict_vector = to_numpy(output)
        
        valid_loss += loss.item() / len(valid_loader)
    
    elapsed = time.time() - start_time
    
    # CHECK POINT
    if valid_loss < best_mae_score:
        best_mae_score = valid_loss
        torch.save(model.state_dict(), model_fname)
        print("================ ༼ つ ◕_◕ ༽つ BEST epoch : {}, MOF : {} ".format(epoch_idx, best_mae_score))
        
    lr = [_['lr'] for _ in optimizer.param_groups]
    
    #if args.scheduler == 'plateau':
    scheduler.step(valid_loss)
    #else:
    #    scheduler.step()
    print("E {}/{} tr_loss: {:.5f} val_loss: {:.5f} lr: {:.6f} elapsed: {:.0f}".format(
        epoch_idx, EPOCH, train_loss, valid_loss lr[0], elapsed))
    
    log_data = [epoch_idx, train_loss, valid_loss]
#     log.append(log_data)
    log_df.loc[epoch_idx] = log_data