[View in Colaboratory](https://colab.research.google.com/github/mondchopers/kaggle_PUBG/blob/master/Exploration.ipynb)

# PUBG Competition Data Exploration

## Loading the Kaggle API and Downloading the Data to Colab Storage

In [0]:
# Install the Kaggle API, then fetch my API key from Google Drive so that
# the dataset can be accessed
# https://medium.com/@move37timm/d18645f93648

!pip install kaggle
from googleapiclient.discovery import build
import io, os
from googleapiclient.http import MediaIoBaseDownload
from google.colab import auth
# Authenticate this Colab session so the Drive API can be queried.
auth.authenticate_user()
drive_service = build('drive', 'v3')

# Look the Kaggle API key up by file name on Google Drive.
results = drive_service.files().list(
        q="name = 'kaggle.json'", fields="files(id)").execute()
kaggle_api_key = results.get('files', [])
if not kaggle_api_key:
    # Fail with a clear message instead of an IndexError further down.
    raise FileNotFoundError("kaggle.json was not found on Google Drive")

filename = "/content/.kaggle/kaggle.json"
os.makedirs(os.path.dirname(filename), exist_ok=True)
request = drive_service.files().get_media(fileId=kaggle_api_key[0]['id'])

# Stream the key to disk chunk by chunk; the context manager guarantees the
# file handle is closed (and flushed) even if a chunk download fails.
with io.FileIO(filename, 'wb') as fh:
    downloader = MediaIoBaseDownload(fh, request)
    done = False
    while not done:
        status, done = downloader.next_chunk()
        print("Download %d%%." % int(status.progress() * 100))

# Restrict the key to owner read/write. The mode has to be an octal literal:
# the decimal value 600 equals 0o1130, which is not rw-------.
os.chmod(filename, 0o600)

# Optional piece of code: sometime just need to put the key in the right place
!mkdir ~/.kaggle
!cp /content/.kaggle/kaggle.json ~/.kaggle/kaggle.json

In [0]:
# # List current active competitions
# !kaggle competitions list

# # Download PUBG data
# !kaggle competitions download -c pubg-finish-placement-prediction -p /content/kaggle

# Switch into the directory used as the download target (-p) by the
# commented-out download command above and list its contents.
# os.chdir raises if /content/kaggle does not exist yet.
import os
os.chdir('/content/kaggle')
!ls

# # Unzip all files
# for file in os.listdir():
#   !unzip {file}
# !ls

In [0]:
import pandas as pd
import numpy as np
import xgboost as xgb
import tensorflow as tf
from tensorflow import keras
# Build a TF session config with GPU memory growth enabled.
# NOTE(review): `config` is not passed to any session in the visible cells,
# so this setting may currently have no effect — confirm.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
import gc

# Print the detected GPU device name and the TensorFlow version.
print(tf.test.gpu_device_name())
print(tf.__version__)

In [0]:
# Mount Google Drive at /content/gdrive so that models can be stored there
# later on.
from google.colab import drive
drive.mount('/content/gdrive')

## Helper Functions

## Exploration

In [0]:
# Explicit column types keep the memory footprint of the large CSV small.
# killPoints and winPoints are Elo-like point totals that, per the
# competition's data description, reach the thousands, so they need uint16:
# uint8 tops out at 255 and would wrap around or be rejected by the parser.
dtypes = {
        'Id'                : 'str',
        'groupId'           : 'str',
        'matchId'           : 'str',
        'assists'           : 'uint8',
        'boosts'            : 'uint8',
        'damageDealt'       : 'float32',
        'DBNOs'             : 'uint8',
        'headshotKills'     : 'uint8',
        'heals'             : 'uint8',
        'killPlace'         : 'uint8',
        'killPoints'        : 'uint16',
        'kills'             : 'uint8',
        'killStreaks'       : 'uint8',
        'longestKill'       : 'float32',
        'maxPlace'          : 'uint8',
        'matchDuration'     : 'uint32',
        'matchType'         : 'str',
        'numGroups'         : 'uint8',
        'rankPoints'        : 'int64',
        'revives'           : 'uint8',
        'rideDistance'      : 'float32',
        'roadKills'         : 'uint8',
        'swimDistance'      : 'float32',
        'teamKills'         : 'uint8',
        'vehicleDestroys'   : 'uint8',
        'walkDistance'      : 'float32',
        'weaponsAcquired'   : 'uint8',
        'winPoints'         : 'uint16',
        'winPlacePerc'      : 'float32',
}

# Load the training set with the compact dtypes and report its size.
df_train = pd.read_csv('train_V2.csv',dtype=dtypes)
print('Training rows: ' + str(len(df_train)))
# df_tests = pd.read_csv('test_V2.csv',dtype=dtypes)
# print('Testing rows: ' + str(len(df_tests)))

In [0]:
# Read the training data again, this time letting pandas infer the column
# types, then print the first three rows, the row count and the inferred
# dtypes.
trainDf = pd.read_csv('train_V2.csv')
print(trainDf.head(3))
print(len(trainDf))
print(trainDf.dtypes)

In [0]:
# Going through groups to see how many players (rows) are inside each group.
# All group sizes are computed in a single pass; the previous version
# re-filtered the whole frame once per group and compared the group id with
# the integer 10, which fails for string ids.
group_sizes = trainDf.groupby('groupId').size()

# Only preview the first few groups (sorted by id) to keep the output short.
for group_id, size in group_sizes.head(10).items():
  print(group_id, size)

# Show every row of one example group; skipped when the frame is empty.
if len(group_sizes) > 0:
  example_group = group_sizes.index[0]
  print(trainDf[trainDf['groupId'] == example_group])

## Using XGB for Regression

In [0]:
# Explicitly choose columns for X and Y.
# Identifier columns carry no signal, the target itself must never be a
# feature (it would leak the answer into the model), and matchType is a text
# column that a numeric feature matrix cannot hold.
excluded_columns = {'Id', 'groupId', 'matchId', 'matchType', 'winPlacePerc'}
XCol = [col for col in trainDf.columns if col not in excluded_columns]

# Split into train and validation: the first 75% of a random permutation is
# used for training, the remaining 25% for validation.
trainLen = int(0.75 * len(trainDf))
seed = np.random.permutation(len(trainDf))

trainX = trainDf[XCol].iloc[seed[:trainLen]].reset_index(drop=True)
trainY = trainDf['winPlacePerc'].iloc[seed[:trainLen]].reset_index(drop=True)
validX = trainDf[XCol].iloc[seed[trainLen:]].reset_index(drop=True)
validY = trainDf['winPlacePerc'].iloc[seed[trainLen:]].reset_index(drop=True)

In [0]:
# Fit a small gradient-boosted regressor (50 trees of depth 4, each built on
# a 75% row subsample) on the training split and print its score on the
# validation split.
# NOTE(review): presumably score() is R^2, as for scikit-learn regressors —
# confirm.
model = xgb.XGBRegressor(n_estimators=50, learning_rate=0.01, subsample=0.75,
                           colsample_bytree=1, max_depth=4)
model.fit(trainX, trainY)
print(model.score(validX, validY))

In [0]:
import matplotlib.pyplot as plt
%matplotlib inline

# Predict on the validation split, print the first prediction next to its
# true value, then scatter the residual (prediction - truth) against the
# true value.
validZ = model.predict(validX)
print(validZ[0], validY.values[0])
plt.scatter(validY.values, validZ-validY.values)

## Stephen's Code

In [0]:
# Compare how many rows carry a negative rankPoints value with how many carry
# a non-negative one (the notebook uses this split to tell the two API
# generations apart), followed by the total row count.
rank_points = df_train['rankPoints']
negative_rows = len(df_train[rank_points < 0])
non_negative_rows = len(df_train[rank_points >= 0])
for row_count in (negative_rows, non_negative_rows, len(df_train)):
  print(row_count)

# print(len(df_tests[df_tests['rankPoints'] < 0]))
# print(len(df_tests[df_tests['rankPoints'] >= 0]))
# print(len(df_tests))

In [0]:
# Explicit column types keep the memory footprint of the large CSV small.
# killPoints and winPoints are Elo-like point totals that, per the
# competition's data description, reach the thousands, so they need uint16:
# uint8 tops out at 255 and would wrap around or be rejected by the parser.
dtypes = {
        'Id'                : 'str',
        'groupId'           : 'str',
        'matchId'           : 'str',
        'assists'           : 'uint8',
        'boosts'            : 'uint8',
        'damageDealt'       : 'float32',
        'DBNOs'             : 'uint8',
        'headshotKills'     : 'uint8',
        'heals'             : 'uint8',
        'killPlace'         : 'uint8',
        'killPoints'        : 'uint16',
        'kills'             : 'uint8',
        'killStreaks'       : 'uint8',
        'longestKill'       : 'float32',
        'maxPlace'          : 'uint8',
        'matchDuration'     : 'uint32',
        'matchType'         : 'str',
        'numGroups'         : 'uint8',
        'rankPoints'        : 'int64',
        'revives'           : 'uint8',
        'rideDistance'      : 'float32',
        'roadKills'         : 'uint8',
        'swimDistance'      : 'float32',
        'teamKills'         : 'uint8',
        'vehicleDestroys'   : 'uint8',
        'walkDistance'      : 'float32',
        'weaponsAcquired'   : 'uint8',
        'winPoints'         : 'uint16',
        'winPlacePerc'      : 'float32',
}

# Load the training set with the compact dtypes; report its size and columns.
df_train = pd.read_csv('train_V2.csv',dtype=dtypes)
print('Training rows: ' + str(len(df_train)))
print(len(df_train.columns), df_train.columns)

In [0]:
# Constructing the full data frame with some additional feature shaping.
# First, separate out the per-player stat columns (mean_columns), e.g. damage
# stats. Then compute group and match means, group std and group size as
# features and append them to the full data frame (df_full).

# Using new API, filter for rankPoints < 0
# rankPoints itself is then dropped from the stat columns.
df_train = df_train[df_train['rankPoints'] < 0]
mean_columns = list(df_train.columns)
mean_columns.remove('rankPoints')
print(len(mean_columns), mean_columns)

# # Using old API, filter for rankPoints >= 0
# df_train = df_train[df_train['rankPoints'] >= 0]
# mean_columns = list(df_train.columns)
# mean_columns.remove('killPoints')
# mean_columns.remove('winPoints')
# print(len(mean_columns), mean_columns)

# Identifier / match-level columns and the target are kept as-is in df_full
# and therefore removed from the columns that get aggregated.
match_columns = ['Id', 'matchId','groupId', 'maxPlace', 'numGroups', 'winPlacePerc']
_ = [mean_columns.remove(x) for x in match_columns]
print(len(mean_columns), mean_columns)
# longestKill is excluded from aggregation as well.
# NOTE(review): mean_columns still contains 'matchType', which is read as str;
# whether the group aggregations below drop or reject it depends on the
# pandas version — confirm.
_ = [mean_columns.remove(x) for x in ['longestKill']]
target = 'winPlacePerc'
print(mean_columns)
print(match_columns)
print(target)

# Base frame holding only the match_columns.
# NOTE(review): since 'Id' is included, drop_duplicates presumably leaves one
# row per player — confirm.
df_full = df_train[match_columns].drop_duplicates()
print(len(df_full.columns), df_full.columns)
df_groups = df_train.groupby(['matchId','groupId'])
# df_groups is kept; the name df_train is released before the aggregations.
del df_train
gc.collect()

# Group & match mean data
# Per-group mean of every stat column (NaN -> 0), suffixed with '_mean'.
df_groups_mean = df_groups[mean_columns].mean().fillna(0).add_suffix('_mean').reset_index()
print(len(df_groups_mean.columns), df_groups_mean.columns)
# Percentile rank of each group within its match, suffixed '_match_pct' and
# attached to df_groups_mean by index alignment.
df_match_pct = df_groups_mean.groupby(['matchId']).rank(pct=True).add_suffix('_match_pct')
df_groups_mean[list(df_match_pct.columns)] = df_match_pct
print(len(df_groups_mean.columns), df_groups_mean.columns)
# Match-level mean and std over the group-level columns built above (this
# includes the '_match_pct' columns that were just added).
df_match_mean = df_groups_mean.groupby(['matchId']).mean().fillna(0).add_suffix('_match_mean').reset_index()
df_match_std = df_groups_mean.groupby(['matchId']).std().fillna(0).add_suffix('_match_std').reset_index()
# Attach group-level features by (matchId, groupId), match-level by matchId.
df_full = df_full.merge(df_groups_mean,how='left',on=['matchId','groupId'])
df_full = df_full.merge(df_match_mean,how='left',on=['matchId'])
df_full = df_full.merge(df_match_std,how='left',on=['matchId'])
# Release the intermediate frames.
del df_match_pct
del df_groups_mean
del df_match_mean
del df_match_std
gc.collect()
print(len(df_full.columns), df_full.columns)

# Group std and count data
# Per-group std of every stat column (NaN -> 0), suffixed '_std', plus the
# number of rows in each group as 'group_size'.
df_groups_std = df_groups[mean_columns].std().fillna(0).add_suffix('_std').reset_index()
df_groups_size = df_groups.size().reset_index(name='group_size')
df_full = df_full.merge(df_groups_std,how='left',on=['matchId','groupId'])
df_full = df_full.merge(df_groups_size,how='left',on=['matchId','groupId'])
del df_groups
del df_groups_std
del df_groups_size
gc.collect()
print(df_full.head())
print(list(df_full.columns))

In [0]:
# Constructing X and Y matrices

# Every column of df_full that is not one of the match_columns (identifiers,
# match-level columns and the target) becomes a model feature.
feature_columns = list(df_full.columns)
print(len(feature_columns), feature_columns)
for item in match_columns:
  feature_columns.remove(item)
print(len(feature_columns), feature_columns)
train_x = df_full[feature_columns].values
train_y = df_full[target].values
# The frame is no longer needed once the arrays have been extracted.
del df_full
gc.collect()

# Shuffling: validation_split in model.fit takes the tail of the arrays, so
# the rows must be in random order first.
order = np.argsort(np.random.random(train_y.shape))
train_x = train_x[order]
train_y = train_y[order]

# Normalize Training Data
train_mean = train_x.mean(axis=0)
train_std = train_x.std(axis=0)
# A constant column has std 0. Use 1 instead so that the statistics saved for
# prediction time never cause a division by zero: training values of such a
# column still normalize to 0, and unseen values stay finite instead of
# becoming inf (which nan_to_num would turn into a huge finite number).
train_std[train_std == 0] = 1
train_x = (train_x - train_mean) / train_std
train_x = np.nan_to_num(train_x)

print(train_x.shape, train_y.shape)
print(train_x[0,:]) # first X example
print(train_y[0]) # first Y example

In [0]:
# Neural Network Model

def build_model(inputShape):
  """Build and compile the fully connected regression network.

  Args:
    inputShape: shape tuple of one input example; passed to the first Dense
      layer as `input_shape`.

  Returns:
    A compiled keras.Sequential model with ReLU hidden layers of
    256, 128, 64, 32 and 8 units and a single sigmoid output unit. It is
    compiled with MSE loss, RMSProp (learning rate 0.001) and the MAE metric.
  """
  network = keras.Sequential()

  # The first hidden layer also declares the input shape.
  network.add(keras.layers.Dense(256, activation=tf.nn.relu,
                                 input_shape=inputShape))
  for units in (128, 64, 32, 8):
    network.add(keras.layers.Dense(units, activation=tf.nn.relu))

  # One sigmoid unit keeps the prediction inside (0, 1).
  network.add(keras.layers.Dense(1, activation=tf.nn.sigmoid))

  # Alternative optimizer: tf.train.AdamOptimizer()
  network.compile(loss='mse',
                  optimizer=tf.train.RMSPropOptimizer(0.001),
                  metrics=['mae'])
  return network

# Build the network for the width of the training matrix and show its layers.
n_features = train_x.shape[1]
model = build_model(inputShape=(n_features,))
model.summary()

In [0]:
import matplotlib.pyplot as plt
import pickle

def plot_history(history, saveLoc=''):
  """Plot training and validation mean absolute error per epoch.

  Args:
    history: object with an `epoch` sequence and a `history` dict of
      per-epoch metric values, such as the return value of model.fit.
    saveLoc: optional path; when non-empty the figure is also saved there.

  Raises:
    KeyError: if no mean-absolute-error series is present in the history.
  """
  def _mae_series(prefix):
    # The metric is stored as 'mean_absolute_error' by the Keras version this
    # notebook was written for; some releases use the name given to compile()
    # ('mae') instead, so both spellings are accepted.
    for key in (prefix + 'mean_absolute_error', prefix + 'mae'):
      if key in history.history:
        return np.array(history.history[key])
    raise KeyError(prefix + 'mean_absolute_error')

  plt.figure()
  plt.xlabel('Epoch')
  # The plotted quantity is the MAE of the winPlacePerc prediction; the
  # earlier '[1000$]' unit did not belong to this task.
  plt.ylabel('Mean Abs Error [winPlacePerc]')
  plt.plot(history.epoch, _mae_series(''), label='Train MAE')
  plt.plot(history.epoch, _mae_series('val_'), label='Val MAE')
  plt.legend()
#   plt.ylim([0, .02])

  # Save Plot
  if saveLoc:
    plt.savefig(saveLoc)
    
def save_history(history, saveLoc):
  """Write the per-epoch training metrics to a CSV file.

  Args:
    history: object with an `epoch` sequence and a `history` dict that maps
      each metric name to its per-epoch values.
    saveLoc: path of the CSV file to write. The file has one row per epoch,
      indexed by an 'Epoch' column, and one column per metric.
  """
  # Collect the epoch numbers first, then every metric in its stored order.
  table = {'Epoch': np.array(history.epoch)}
  for metric_name, per_epoch_values in history.history.items():
    table[metric_name] = per_epoch_values
  pd.DataFrame(table).set_index('Epoch').to_csv(saveLoc)

# Compact progress indicator: one dot per finished epoch, with a line break
# in front of every tenth epoch (0, 10, 20, ...).
class PrintDot(keras.callbacks.Callback):
  """Keras callback that prints a dot at the end of each epoch."""

  def on_epoch_end(self, epoch, logs):
    starts_new_row = (epoch % 10 == 0)
    print('\n.' if starts_new_row else '.', end='')

EPOCHS = 60
gdrive = '/content/gdrive/My Drive/Colab Notebooks/Kaggle_PUBG/'
modelName = 'Kaggle_PUBG_model_new_001'

# Make sure the output folder exists before anything is written into it.
import os
os.makedirs(gdrive, exist_ok=True)

# Saving model normalization: the same mean/std must be applied to the test
# data at prediction time.
with open(gdrive + modelName + '_params.pkl', 'wb') as f:
    pickle.dump([train_mean, train_std], f)

# Train the model; the last 20% of the (already shuffled) rows are held out
# for validation.
history = model.fit(train_x, train_y, epochs=EPOCHS,
                    validation_split=0.2, verbose=1, batch_size=512)

# Plot the learning curves and keep the figure next to the model.
plot_history(history, saveLoc=gdrive + modelName + '.png')

# Saving model
model.save(gdrive + modelName + '.h5')

# Saving history. save_history writes CSV, so the file gets a .csv name (it
# was previously written to the '.png' path).
save_history(history, gdrive + modelName + '_history.csv')

In [0]:
# Sanity check: predict the first training example (kept two-dimensional as a
# batch of one) and print the prediction next to its true label.
first_example = train_x[:1]
first_label = train_y[0]
print(model.predict(first_example), first_label)

In [0]:
# Reload Model
# Load from the same location the training cell saves the model to.
gdrive = '/content/gdrive/My Drive/Colab Notebooks/Kaggle_PUBG/'
modelName = 'Kaggle_PUBG_model_new_001'
loaded_model = keras.models.load_model(gdrive + modelName + '.h5')

# Re-compile with the same loss, optimizer and metric used for training.
optimizer = tf.train.RMSPropOptimizer(0.001)
loaded_model.compile(loss='mse',
              optimizer=optimizer,
              metrics=['mae'])

# Continue training the reloaded model for two more epochs. The fit must run
# on loaded_model (not the in-memory `model`) and on the train_x / train_y
# arrays built earlier; `train_data` / `train_labels` are not defined
# anywhere in this notebook.
history = loaded_model.fit(train_x, train_y, epochs=2,
                           validation_split=0.2, verbose=1, batch_size=128)

## Making Prediction

In [0]:
# Explicit column types keep the memory footprint of the large CSV small.
# killPoints and winPoints are Elo-like point totals that, per the
# competition's data description, reach the thousands, so they need uint16:
# uint8 tops out at 255 and would wrap around or be rejected by the parser.
# The same mapping is used for the training file, hence the winPlacePerc
# entry.
dtypes = {
        'Id'                : 'str',
        'groupId'           : 'str',
        'matchId'           : 'str',
        'assists'           : 'uint8',
        'boosts'            : 'uint8',
        'damageDealt'       : 'float32',
        'DBNOs'             : 'uint8',
        'headshotKills'     : 'uint8',
        'heals'             : 'uint8',
        'killPlace'         : 'uint8',
        'killPoints'        : 'uint16',
        'kills'             : 'uint8',
        'killStreaks'       : 'uint8',
        'longestKill'       : 'float32',
        'maxPlace'          : 'uint8',
        'matchDuration'     : 'uint32',
        'matchType'         : 'str',
        'numGroups'         : 'uint8',
        'rankPoints'        : 'int64',
        'revives'           : 'uint8',
        'rideDistance'      : 'float32',
        'roadKills'         : 'uint8',
        'swimDistance'      : 'float32',
        'teamKills'         : 'uint8',
        'vehicleDestroys'   : 'uint8',
        'walkDistance'      : 'float32',
        'weaponsAcquired'   : 'uint8',
        'winPoints'         : 'uint16',
        'winPlacePerc'      : 'float32',
}

# Load the test set with the compact dtypes and report its size.
df_tests = pd.read_csv('test_V2.csv', dtype=dtypes)
print('Testing rows: ' + str(len(df_tests)))

In [0]:
# Flag each test row by whether its rankPoints value is non-negative, then
# preview the frame.
# NOTE(review): 'Status' becomes a regular column of df_tests, so any later
# code that derives feature columns from df_tests.columns will pick it up.
df_tests['Status'] = df_tests['rankPoints'] >= 0
print(df_tests.head())

In [0]:
import pickle
gdrive = '/content/gdrive/My Drive/Colab Notebooks/Kaggle_PUBG/'
modelName = 'Kaggle_PUBG_model_new_001'

# Using new API, filter for rankPoints < 0
df_tests = df_tests[df_tests['rankPoints'] < 0]
mean_columns = list(df_tests.columns)
mean_columns.remove('rankPoints')
# 'Status' is a helper flag that an earlier cell may have added to df_tests.
# The training features were built without it, so leaving it in would create
# extra feature columns that do not line up with the saved normalization
# statistics or with the model's input width.
if 'Status' in mean_columns:
  mean_columns.remove('Status')

# # Using old API, filter for rankPoints >= 0
# df_tests = df_tests[df_tests['rankPoints'] >= 0]
# print(len(df_tests.columns), df_tests.columns)
# mean_columns = list(df_tests.columns)
# mean_columns.remove('killPoints')
# mean_columns.remove('winPoints')

# Identifier / match-level columns are kept as-is and are not aggregated.
# The test file has no winPlacePerc, so it is not listed here.
match_columns = ['Id', 'matchId','groupId', 'maxPlace', 'numGroups']
for column in match_columns:
  mean_columns.remove(column)
print(len(mean_columns), mean_columns)
# longestKill is excluded from aggregation, as in the training cell.
mean_columns.remove('longestKill')
# For the test set the "target" is the Id each prediction is reported under.
target = 'Id'
print(mean_columns)
print(match_columns)
print(target)

# Base frame holding only the match_columns.
# NOTE(review): since 'Id' is included, drop_duplicates presumably leaves one
# row per player — confirm.
df_full = df_tests[match_columns].drop_duplicates()
print(len(df_full.columns), df_full.columns)
df_groups = df_tests.groupby(['matchId','groupId'])
# df_groups is kept; the name df_tests is released before the aggregations.
del df_tests
gc.collect()

# Group & match mean data
# Per-group mean of every stat column (NaN -> 0), suffixed with '_mean'.
df_groups_mean = df_groups[mean_columns].mean().fillna(0).add_suffix('_mean').reset_index()
# Percentile rank of each group within its match, suffixed '_match_pct' and
# attached to df_groups_mean by index alignment.
df_match_pct = df_groups_mean.groupby(['matchId']).rank(pct=True).add_suffix('_match_pct')
df_groups_mean[list(df_match_pct.columns)] = df_match_pct
# Match-level mean and std over the group-level columns built above (this
# includes the '_match_pct' columns that were just added).
df_match_mean = df_groups_mean.groupby(['matchId']).mean().fillna(0).add_suffix('_match_mean').reset_index()
df_match_std = df_groups_mean.groupby(['matchId']).std().fillna(0).add_suffix('_match_std').reset_index()
# Attach group-level features by (matchId, groupId), match-level by matchId.
df_full = df_full.merge(df_groups_mean,how='left',on=['matchId','groupId'])
df_full = df_full.merge(df_match_mean,how='left',on=['matchId'])
df_full = df_full.merge(df_match_std,how='left',on=['matchId'])
# Release the intermediate frames.
del df_match_pct
del df_groups_mean
del df_match_mean
del df_match_std
gc.collect()
print(len(df_full.columns), df_full.columns)

# Group std and count data
# Per-group std of every stat column (NaN -> 0), suffixed '_std', plus the
# number of rows in each group as 'group_size'.
df_groups_std = df_groups[mean_columns].std().fillna(0).add_suffix('_std').reset_index()
df_groups_size = df_groups.size().reset_index(name='group_size')
df_full = df_full.merge(df_groups_std,how='left',on=['matchId','groupId'])
df_full = df_full.merge(df_groups_size,how='left',on=['matchId','groupId'])
del df_groups
del df_groups_std
del df_groups_size
gc.collect()
print(df_full.head())
print(list(df_full.columns))

# Construct tests x
# Every column of df_full that is not one of the match_columns is a feature;
# the Id column (target) is kept separately to label the predictions.
feature_columns = list(df_full.columns)
print(len(feature_columns), feature_columns)
for item in match_columns:
  feature_columns.remove(item)
print(len(feature_columns), feature_columns)
tests_x = df_full[feature_columns].values
tests_z = df_full[target].values
print(tests_x.shape, tests_z.shape)
del df_full
gc.collect()

# Normalize the test data with the statistics saved at training time.
# NOTE: pickle.load can execute arbitrary code; only load parameter files
# that this notebook wrote itself.
with open(gdrive + modelName + '_params.pkl', 'rb') as f:
    train_mean, train_std = pickle.load(f)

# Fail early with a readable message if the test features do not line up
# with the training features.
if tests_x.shape[1] != len(train_mean):
  raise ValueError('Test data has %d features but the saved normalization '
                   'expects %d' % (tests_x.shape[1], len(train_mean)))

# Columns that were constant during training have std 0. Dividing by 1
# instead keeps their values finite; otherwise the division yields inf, which
# nan_to_num would turn into a huge finite number.
safe_std = train_std.copy()
safe_std[safe_std == 0] = 1
tests_x = (tests_x - train_mean) / safe_std
tests_x = np.nan_to_num(tests_x)

# Make prediction
model = keras.models.load_model(gdrive + modelName + '.h5')
tests_y = model.predict(tests_x)
print(tests_y.shape)

# Construct Prediction DataFrame and Save Result
# The model returns one column per output unit; flatten it to a 1-D series.
tests_result_df = pd.DataFrame()
tests_result_df['Id'] = tests_z
tests_result_df['winPlacePerc'] = tests_y.ravel()
print(tests_result_df.head())
print(len(tests_result_df))
tests_result_df.to_csv(gdrive + modelName + '_result.csv', index=False)