[View in Colaboratory](https://colab.research.google.com/github/mondchopers/kaggle_PUBG/blob/master/Exploration.ipynb)

# PUBG Competition Data Exploration

## Loading the Kaggle API and Downloading Data to Colab Storage

In [0]:
# Install kaggle API, go to my Google Drive and get API key so that I can
# access dataset
# https://medium.com/@move37timm/d18645f93648

!pip install kaggle
from googleapiclient.discovery import build
import io, os
from googleapiclient.http import MediaIoBaseDownload
from google.colab import auth
auth.authenticate_user()
drive_service = build('drive', 'v3')
results = drive_service.files().list(
        q="name = 'kaggle.json'", fields="files(id)").execute()
kaggle_api_key = results.get('files', [])
filename = "/content/.kaggle/kaggle.json"
os.makedirs(os.path.dirname(filename), exist_ok=True)
request = drive_service.files().get_media(fileId=kaggle_api_key[0]['id'])
fh = io.FileIO(filename, 'wb')
downloader = MediaIoBaseDownload(fh, request)
done = False
while done is False:
    status, done = downloader.next_chunk()
    print("Download %d%%." % int(status.progress() * 100))
os.chmod(filename, 600)

# Optional piece of code: sometime just need to put the key in the right place
!mkdir ~/.kaggle
!cp /content/.kaggle/kaggle.json ~/.kaggle/kaggle.json

In [0]:
# List current active competitions
!kaggle competitions list

# Download PUBG data
!kaggle competitions download -c pubg-finish-placement-prediction -p /content/kaggle

# List down all files from PUBG competition
import os
os.chdir('/content/kaggle')
!ls

# Unzip all files
for file in os.listdir():
  !unzip {file}
!ls

In [0]:
import pandas as pd
import numpy as np
import xgboost as xgb

## Helper Functions

## Exploration

In [0]:
# Load the training CSV, then print a three-row preview, the number of rows
# and the dtype of every column
trainDf = pd.read_csv('train.csv')
for summary in (trainDf.head(3), len(trainDf), trainDf.dtypes):
  print(summary)

In [0]:
# Walk the unique group ids (np.unique returns them sorted) and print how many
# rows belong to each one; the loop stops right after printing the first id
# greater than 10
groupIds = trainDf['groupId']
for gid in np.unique(groupIds.values):
  members = trainDf[groupIds == gid]
  print(gid, len(members))
  if gid > 10:
    break

# Show every row of the group whose id is 11
print(trainDf[groupIds == 11])

## Using XGB for Regression

In [0]:
# Explicitly choose columns for X and Y.
# The feature list drops the identifier columns AND the target itself:
# leaving 'winPlacePerc' among the features would let the model read the
# answer it is supposed to predict.
XCol = [col for col in trainDf.columns
        if col not in ('Id', 'groupId', 'matchId', 'winPlacePerc')]

# Split into train (75 %) and validation (25 %).
# `seed` holds a shuffled array of row positions (the name is kept from the
# original cell); a fixed RandomState makes the split reproducible across
# kernel restarts.
# NOTE(review): rows are split individually; if rows sharing a groupId also
# share a target value, a group-wise split may be more appropriate - confirm.
trainLen = int(0.75 * len(trainDf))
seed = np.random.RandomState(42).permutation(len(trainDf))

# First `trainLen` shuffled rows -> training set, the remainder -> validation
trainX = trainDf[XCol].iloc[seed[:trainLen]].reset_index(drop=True)
trainY = trainDf['winPlacePerc'].iloc[seed[:trainLen]].reset_index(drop=True)
validX = trainDf[XCol].iloc[seed[trainLen:]].reset_index(drop=True)
validY = trainDf['winPlacePerc'].iloc[seed[trainLen:]].reset_index(drop=True)

In [0]:
# Hyper-parameters for the gradient-boosted regressor, kept in one place
xgbParams = dict(
    n_estimators=50,
    learning_rate=0.01,
    subsample=0.75,
    colsample_bytree=1,
    max_depth=4,
)

# Fit on the training split, then print the estimator's score on the
# validation split
model = xgb.XGBRegressor(**xgbParams)
model.fit(trainX, trainY)
validScore = model.score(validX, validY)
print(validScore)

In [0]:
import matplotlib.pyplot as plt
%matplotlib inline

# Predict on the validation split and compare the first prediction with its
# true value as a quick sanity check
validZ = model.predict(validX)
print(validZ[0], validY.values[0])

# Residual plot: x = true target, y = prediction minus truth. Axis labels and
# a zero reference line make the sign and size of the error readable without
# consulting the code.
residuals = validZ - validY.values
fig, ax = plt.subplots()
ax.scatter(validY.values, residuals)
ax.axhline(0, color='black', linewidth=1)
ax.set(xlabel='Actual winPlacePerc',
       ylabel='Residual (predicted - actual)',
       title='Validation residuals')
plt.show()