[View in Colaboratory](https://colab.research.google.com/github/mondchopers/kaggle_PUBG/blob/master/Exploration.ipynb)

# PUBG Competition Data Exploration

## Loading Kaggle API and Download data to Colab Storage

In [1]:
# Install kaggle API, go to my Google Drive and get API key so that I can
# access dataset
# https://medium.com/@move37timm/d18645f93648

!pip install kaggle
from googleapiclient.discovery import build
import io, os
from googleapiclient.http import MediaIoBaseDownload
from google.colab import auth
auth.authenticate_user()
drive_service = build('drive', 'v3')
results = drive_service.files().list(
        q="name = 'kaggle.json'", fields="files(id)").execute()
kaggle_api_key = results.get('files', [])
filename = "/content/.kaggle/kaggle.json"
os.makedirs(os.path.dirname(filename), exist_ok=True)
request = drive_service.files().get_media(fileId=kaggle_api_key[0]['id'])
fh = io.FileIO(filename, 'wb')
downloader = MediaIoBaseDownload(fh, request)
done = False
while done is False:
    status, done = downloader.next_chunk()
    print("Download %d%%." % int(status.progress() * 100))
os.chmod(filename, 600)

# Optional piece of code: sometime just need to put the key in the right place
!mkdir ~/.kaggle
!cp /content/.kaggle/kaggle.json ~/.kaggle/kaggle.json

Collecting kaggle
[?25l  Downloading https://files.pythonhosted.org/packages/c6/78/832b9a9ec6b3baf8ec566e1f0a695f2fd08d2c94a6797257a106304bfc3c/kaggle-1.4.7.1.tar.gz (52kB)
[K    100% |████████████████████████████████| 61kB 2.8MB/s 
Collecting python-slugify (from kaggle)
  Downloading https://files.pythonhosted.org/packages/00/ad/c778a6df614b6217c30fe80045b365bfa08b5dd3cb02e8b37a6d25126781/python-slugify-1.2.6.tar.gz
Collecting Unidecode>=0.04.16 (from python-slugify->kaggle)
[?25l  Downloading https://files.pythonhosted.org/packages/59/ef/67085e30e8bbcdd76e2f0a4ad8151c13a2c5bce77c85f8cad6e1f16fb141/Unidecode-1.0.22-py2.py3-none-any.whl (235kB)
[K    100% |████████████████████████████████| 235kB 5.8MB/s 
[?25hBuilding wheels for collected packages: kaggle, python-slugify
  Running setup.py bdist_wheel for kaggle ... [?25l- \ done
[?25h  Stored in directory: /root/.cache/pip/wheels/44/2c/df/22a6eeb780c36c28190faef6252b739fdc47145fd87a6642d4
  Running setup.py bdist_wheel for

In [2]:
# List current active competitions
!kaggle competitions list

# Download PUBG data
!kaggle competitions download -c pubg-finish-placement-prediction -p /content/kaggle

# List down all files from PUBG competition
import os
os.chdir('/content/kaggle')
!ls

# Unzip all files
for file in os.listdir():
  !unzip {file}
!ls

ref                                            deadline             category            reward  teamCount  userHasEntered  
---------------------------------------------  -------------------  ---------------  ---------  ---------  --------------  
digit-recognizer                               2030-01-01 00:00:00  Getting Started  Knowledge       2651           False  
titanic                                        2030-01-01 00:00:00  Getting Started  Knowledge       9861           False  
house-prices-advanced-regression-techniques    2030-01-01 00:00:00  Getting Started  Knowledge       4199            True  
imagenet-object-localization-challenge         2029-12-31 07:00:00  Research         Knowledge         26           False  
pubg-finish-placement-prediction               2019-01-30 23:59:00  Playground            Swag         52            True  
human-protein-atlas-image-classification       2019-01-10 23:59:00  Featured           $37,000        345            True  
two-sigm

In [3]:
import pandas as pd
import numpy as np
import xgboost as xgb
import tensorflow as tf
from tensorflow import keras
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
import gc

print(tf.test.gpu_device_name())
print(tf.__version__)

/device:GPU:0
1.12.0-rc1


## Helper Functions

In [0]:
def cleanup(obj):
  del(obj)
  gc.collect()

## Exploration

In [12]:
dtypes = {
        'Id'                : 'str',
        'groupId'           : 'str',
        'matchId'           : 'str',
        'assists'           : 'uint8',
        'boosts'            : 'uint8',
        'damageDealt'       : 'float32',
        'DBNOs'             : 'uint8',
        'headshotKills'     : 'uint8', 
        'heals'             : 'uint8',    
        'killPlace'         : 'uint8',    
        'killPoints'        : 'uint8',    
        'kills'             : 'uint8',    
        'killStreaks'       : 'uint8',    
        'longestKill'       : 'float32',    
        'maxPlace'          : 'uint8',
        'matchDuration'     : 'uint32',
        'matchType'         : 'str',
        'numGroups'         : 'uint8',
        'rankPoints'        : 'int64',
        'revives'           : 'uint8',    
        'rideDistance'      : 'float32',    
        'roadKills'         : 'uint8',    
        'swimDistance'      : 'float32',    
        'teamKills'         : 'uint8',    
        'vehicleDestroys'   : 'uint8',    
        'walkDistance'      : 'float32',    
        'weaponsAcquired'   : 'uint8',    
        'winPoints'         : 'uint8', 
        'winPlacePerc'      : 'float32' 
}

df_train = pd.read_csv('train_V2.csv',dtype=dtypes)
print('Training rows: ' + str(len(df_train)))

Training rows: 4446966


In [10]:
# Read training data, print out some information from there
trainDf = pd.read_csv('train_V2.csv')
print(trainDf.head(3))
print(len(trainDf))
print(trainDf.dtypes)

               Id         groupId         matchId  assists  boosts  \
0  7f96b2f878858a  4d4b580de459be  a10357fd1a4a91        0       0   
1  eef90569b9d03c  684d5656442f9e  aeb375fc57110c        0       0   
2  1eaf90ac73de72  6a4a42c3245a74  110163d8bb94ae        1       0   

   damageDealt  DBNOs  headshotKills  heals  killPlace      ...       revives  \
0         0.00      0              0      0         60      ...             0   
1        91.47      0              0      0         57      ...             0   
2        68.00      0              0      0         47      ...             0   

   rideDistance  roadKills  swimDistance  teamKills vehicleDestroys  \
0        0.0000          0          0.00          0               0   
1        0.0045          0         11.04          0               0   
2        0.0000          0          0.00          0               0   

   walkDistance  weaponsAcquired  winPoints  winPlacePerc  
0         244.8                1       1466      

In [0]:
# Going through groups and see how many players are inside each group
for i in np.unique(trainDf['groupId'].values):
  print(i, len(trainDf[trainDf['groupId'] == i]))
  if i > 10:
    break
   
print(trainDf[trainDf['groupId'] == 11])

## Using XGB for Regression

In [0]:
# Explicitly choose columns for X and Y axis
XCol = list(trainDf.columns)
XCol.remove('Id')
XCol.remove('groupId')
XCol.remove('matchId')
# Split into train and validation
trainLen = int(0.75 * len(trainDf))
seed = np.random.permutation(len(trainDf))

trainX = trainDf[XCol].iloc[seed[trainLen:]].reset_index(drop=True)
trainY = trainDf['winPlacePerc'].iloc[seed[trainLen:]].reset_index(drop=True)
validX = trainDf[XCol].iloc[seed[:trainLen]].reset_index(drop=True)
validY = trainDf['winPlacePerc'].iloc[seed[:trainLen]].reset_index(drop=True)

In [0]:
model = xgb.XGBRegressor(n_estimators=50, learning_rate=0.01, subsample=0.75,
                           colsample_bytree=1, max_depth=4)
model.fit(trainX, trainY)
print(model.score(validX, validY))

In [0]:
import matplotlib.pyplot as plt
%matplotlib inline

validZ = model.predict(validX)
print(validZ[0], validY.values[0])
plt.scatter(validY.values, validZ-validY.values)

## Stephen's Code

In [15]:
print(df_train[['matchId', 'groupId', 'winPoints']].groupby(['matchId',]).rank().head())

   groupId  winPoints
0     26.5       36.0
1     42.5       46.0
2     41.5       49.5
3     54.0       46.0
4     84.0       49.0


In [35]:
# 'Id'                : 'str',
# 'groupId'           : 'str',
# 'matchId'           : 'str',
# 'assists'           : 'uint8',
# 'boosts'            : 'uint8',
# 'damageDealt'       : 'float32',
# 'DBNOs'             : 'uint8',
# 'headshotKills'     : 'uint8', 
# 'heals'             : 'uint8',    
# 'killPlace'         : 'uint8',    
# 'killPoints'        : 'uint8',    
# 'kills'             : 'uint8',    
# 'killStreaks'       : 'uint8',    
# 'longestKill'       : 'float32',    
# 'maxPlace'          : 'uint8',
# 'matchDuration'     : 'uint32',
# 'matchType'         : 'str',
# 'numGroups'         : 'uint8',
# 'rankPoints'        : 'int64',
# 'revives'           : 'uint8',    
# 'rideDistance'      : 'float32',    
# 'roadKills'         : 'uint8',    
# 'swimDistance'      : 'float32',    
# 'teamKills'         : 'uint8',    
# 'vehicleDestroys'   : 'uint8',    
# 'walkDistance'      : 'float32',    
# 'weaponsAcquired'   : 'uint8',    
# 'winPoints'         : 'uint8', 
# 'winPlacePerc'      : 'float32' 

mean_columns = list(df_train.columns)
k = [mean_columns.remove(x) for x in ['Id', 'matchId','groupId', 'longestKill',
                                      'matchType', 'maxPlace', 'numGroups', 
                                      'winPlacePerc']]
match_columns = ['matchId','groupId', 'maxPlace', 'numGroups', 'winPlacePerc']
target = 'winPlacePerc'

df_groups = df_train.groupby(['matchId','groupId'])
df_full = df_train[match_columns].drop_duplicates()
cleanup(df_train)

df_groups_mean = df_groups[mean_columns].mean().fillna(0).add_suffix('_mean').reset_index()
df_match_perc = df_groups_mean.groupby(['matchId']).rank(pct=True).add_suffix('_match_perc')
df_groups_mean[list(df_match_perc.columns)] = df_match_perc


df_groups_std = df_groups[mean_columns].std().fillna(0).add_suffix('_std').reset_index()
df_groups_size = df_groups.size().reset_index(name='group_size')
print(df_groups_size.head())

df_full = df_full.merge(df_groups_mean,how='left',on=['matchId','groupId'])
df_full = df_full.merge(df_groups_std,how='left',on=['matchId','groupId'])
df_full = df_full.merge(df_groups_size,how='left',on=['matchId','groupId'])
cleanup(df_groups)

df_match_mean = df_groups_mean.groupby(['matchId']).mean().fillna(0).add_suffix('_match_mean').reset_index()
df_match_std = df_groups_mean.groupby(['matchId']).std().fillna(0).add_suffix('_match_std').reset_index()

df_full = df_full.merge(df_match_mean,how='left',on=['matchId'])
df_full = df_full.merge(df_match_std,how='left',on=['matchId'])
cleanup(df_match_mean)
cleanup(df_match_std)

print(df_full.head())
print(list(df_full.columns))

          matchId         groupId  group_size
0  0000a43bce5eec  18b16ec699d8b6           2
1  0000a43bce5eec  236ab9e9c081b9           6
2  0000a43bce5eec  3a6addfa0df938           2
3  0000a43bce5eec  4bf06994bd4c9a           2
4  0000a43bce5eec  4d1bbbc19b9084           2
          matchId         groupId  maxPlace  numGroups  winPlacePerc  \
0  a10357fd1a4a91  4d4b580de459be        28         26        0.4444   
1  aeb375fc57110c  684d5656442f9e        26         25        0.6400   
2  110163d8bb94ae  6a4a42c3245a74        50         47        0.7755   
3  f1f1f4ef412d7e  a930a9c79cd721        31         30        0.1667   
4  6dc8ff871e21e6  de04010b3458dd        97         95        0.1875   

   assists_mean  boosts_mean  damageDealt_mean  DBNOs_mean  \
0           0.0          0.0        102.187500         0.5   
1           0.5          1.5        142.817505         0.5   
2           0.5          1.5        107.300003         0.5   
3           0.0          0.0         32.900

In [1]:
feature_columns = list(df_full.columns)

print(feature_columns)
feature_columns.remove(target)
print(feature_columns)

train_data = df_full[feature_columns].values
train_labels = df_full[target].values

# Shuffling
order = np.argsort(np.random.random(train_labels.shape))
train_data = train_data[order]
train_labels = train_labels[order]

# Normalize Training Data
mean = train_data.mean(axis=0)
std = train_data.std(axis=0)
train_data = (train_data - mean) / std
train_data = np.nan_to_num(train_data)

NameError: ignored