In [None]:
import gc
import time
import random
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [None]:
def reload_train():
    """Load the training CSV and drop every match that has a missing target.

    Returns
    -------
    pandas.DataFrame
        Rows of ``train_V2.csv`` whose ``matchId`` does not belong to any
        match containing a NaN ``winPlacePerc``.
    """
    gc.collect()
    df = pd.read_csv('../input/pubg-finish-placement-prediction/train_V2.csv')
    # Collect *all* matches with a missing target. Indexing ``.values[0]`` would
    # raise IndexError when nothing is missing and would only remove the first
    # affected match when there are several.
    nan_match_ids = df.loc[df['winPlacePerc'].isna(), 'matchId'].unique()
    return df[~df['matchId'].isin(nan_match_ids)]

In [None]:
def reload_test():
    """Read the competition test set from disk and return it as a DataFrame.

    A garbage-collection pass runs first, before the CSV is read.
    """
    gc.collect()
    return pd.read_csv('../input/pubg-finish-placement-prediction/test_V2.csv')

In [None]:
# Load the training data with reload_train() into the notebook-wide `train` frame.
train=reload_train()

In [None]:
# (rows, columns) of the loaded training frame
train.shape

In [None]:
# Column names available in the training frame
train.columns

In [None]:
# Preview the first rows
train.head()

# Kills

In [None]:
# Author's observation: the 99th percentile of kills is 7, the maximum recorded is 72
kills_per_player = train['kills']
print(kills_per_player.quantile(0.99))
print(kills_per_player.max())

In [None]:
# Number of players per kill count.
# Pass the Series as `x=` explicitly: the meaning of the first positional
# argument of countplot differs between seaborn releases.
sns.countplot(x=train['kills'].sort_values())
plt.show()

In [None]:
df = train.copy()
# Group everything above the 99th percentile of kills into a single '8+' bucket.
# Only the 'kills' column is relabelled; `df.loc[mask] = '8+'` (no column) would
# overwrite every column of the matching rows.
too_many_kills = df['kills'] > df['kills'].quantile(0.99)
# Convert to strings first so the '8+' label fits the column dtype.
df['kills'] = df['kills'].astype('str')
df.loc[too_many_kills, 'kills'] = '8+'

In [None]:
# most people have 0 kills
# Pass the Series as `x=` explicitly so the call does not depend on how a given
# seaborn release interprets the first positional argument.
sns.countplot(x=df['kills'].astype('str').sort_values())
plt.show()

In [None]:
# Players without a kill may still have dealt damage: histogram of their damageDealt
zero_kill_mask = train['kills'] == 0
df = train[zero_kill_mask]
plt.hist(df['damageDealt'])
plt.show()

In [None]:
# Percentage of all players who finished first (winPlacePerc == 1) with 0 kills.
# `df` is expected to hold the zero-kill players at this point.
zero_kill_winners = df[df['winPlacePerc'] == 1]
x = (len(zero_kill_winners) / len(train)) * 100
print(x)

In [None]:
kills = train.copy()
# Bucket kill counts. The last bin is open-ended (np.inf): a fixed upper edge of
# 60 would turn players with more kills into NaN and silently drop them from the plot.
kills['killsCategories'] = pd.cut(
    kills['kills'],
    [-1, 0, 2, 5, 10, np.inf],
    labels=['0_kills', '1-2_kills', '3-5_kills', '6-10_kills', '10+_kills'],
)

plt.figure(figsize=(15, 8))
sns.boxplot(x="killsCategories", y="winPlacePerc", data=kills)
plt.show()

In [None]:
# Box plot of winPlacePerc per kill count, with the top 1% grouped as '8+'.
# The frame is rebuilt from `train` here: when the notebook is run top to bottom,
# `df` holds only the zero-kill players at this point, which would give a single box
# (and assigning into that filtered slice triggers a SettingWithCopyWarning).
df = train[['kills', 'winPlacePerc']].copy()
too_many_kills = df['kills'] > df['kills'].quantile(0.99)
df['kills'] = df['kills'].astype('str')
df.loc[too_many_kills, 'kills'] = '8+'
sns.boxplot(x="kills", y="winPlacePerc", data=df, order=sorted(df['kills'].unique()))
plt.show()

## Runners

In [None]:
# Work on a copy of train; print the 99th percentile of walkDistance
df=train.copy()
print(df['walkDistance'].quantile(0.99))

In [None]:
# Distribution of walkDistance after cutting off the top 1%
walk_cutoff = df['walkDistance'].quantile(0.99)
df = df[df['walkDistance'] < walk_cutoff]
plt.figure(figsize=(15, 10))
sns.distplot(df['walkDistance'])
plt.show()

In [None]:
# Walk distance against final placement.
# plt.show() renders the figure; plt.plot() with no arguments only adds an
# empty line and echoes a list repr as the cell output.
plt.scatter(train['walkDistance'], train['winPlacePerc'])
plt.show()

## Drivers

In [None]:
# Work on a copy of train; print the 90th percentile of rideDistance
df=train.copy()
print(df['rideDistance'].quantile(0.9))

In [None]:
# Distribution of rideDistance after cutting off the top 10%
ride_cutoff = df['rideDistance'].quantile(0.9)
df = df[df['rideDistance'] < ride_cutoff]
plt.figure(figsize=(15, 10))
sns.distplot(df['rideDistance'])
plt.show()

In [None]:
## Number of players with 0 ride distance, and their share of all training rows
## (author's observation: about 74% don't ride).
## The count is taken from `df`, the share is relative to len(train).
print(len(df[df['rideDistance'] == 0]))
print(len(df[df['rideDistance']==0])/len(train))

In [None]:
## Author's observation: rideDistance and winPlacePerc are correlated.
## x axis: winPlacePerc, y axis: rideDistance
plt.scatter(train["winPlacePerc"], train["rideDistance"])
plt.show()

## Vehicle destroy

In [None]:
## Author's interpretation: destroying vehicles indicates a skilled player,
## and destroying even a single vehicle increases the chances of winning.
## Point plot of winPlacePerc for each vehicleDestroys value.
plt.figure(figsize =(15,8))
sns.pointplot(x='vehicleDestroys',y='winPlacePerc',data=train)
plt.grid()
plt.show()

## Swimmers

In [None]:
# Work on a copy of train; print the 99th percentile of swimDistance
df=train.copy()
print(df['swimDistance'].quantile(0.99))

In [None]:
## almost no one swims
## Distribution of swimDistance after cutting off the top 1%
swim_cutoff = df['swimDistance'].quantile(0.99)
df = df[df['swimDistance'] < swim_cutoff]
plt.figure(figsize=(15, 10))
sns.distplot(df['swimDistance'])
plt.show()

In [None]:
## there are 3 maps, one of which has no water
df = train.copy()
# Bucket swim distance. The last bin is open-ended (np.inf) rather than tied to a
# hard-coded maximum, so no row can fall outside the bins and turn into NaN.
df['swimDistance'] = pd.cut(
    df['swimDistance'],
    [-1, 0, 5, 20, np.inf],
    labels=['0m', '1-5m', '6-20m', '20m+'],
)
plt.figure(figsize=(15, 8))
sns.boxplot(x="swimDistance", y="winPlacePerc", data=df)
plt.show()

## Healers

In [None]:
# 99th percentile and maximum of heals and boosts, computed on a copy of train
df=train.copy()
print(df['heals'].quantile(0.99))
print(df['boosts'].quantile(0.99))
print(df['heals'].max())
print(df['boosts'].max())

In [None]:
## Author's observation: heals and boosts have a great effect on placement.
## Keep rows below the 99th percentile of heals, then below the 99th percentile
## of boosts (the second quantile is computed on the already-filtered frame).
df = df[df['heals'] < df['heals'].quantile(0.99)]
df = df[df['boosts'] < df['boosts'].quantile(0.99)]
plt.figure(figsize =(12,8))
# red: heals; default colour: boosts (both drawn on the same axes)
sns.pointplot(x='heals',y='winPlacePerc',data=df,color='red')
sns.pointplot(x='boosts',y='winPlacePerc',data=df)
plt.xlabel('Number of heal/boost items')
plt.ylabel('Win Percentage')
plt.grid()
plt.show()

In [None]:
# x axis: winPlacePerc, y axis: heals
plt.scatter(train["winPlacePerc"], train["heals"])
plt.show()

In [None]:
# x axis: winPlacePerc, y axis: boosts
plt.scatter(train["winPlacePerc"], train["boosts"])
plt.show()

## Match type => solos,duos and squads

In [None]:
# Approximate the match type from the number of groups in the match:
# more than 50 groups -> solos, 26-50 -> duos, 25 or fewer -> squads.
num_groups = train['numGroups']
solos = train[num_groups > 50]
duos = train[(num_groups > 25) & (num_groups <= 50)]
squads = train[num_groups <= 25]
# Share of training rows in each bucket, in percent (solos, duos, squads)
for subset in (solos, duos, squads):
    print(100 * len(subset) / len(train))

In [None]:
## Author's observation: in squads, the number of kills matters less.
## red: solos, black: duos, default colour: squads
plt.figure(figsize =(20,10))
sns.pointplot(x='kills',y='winPlacePerc',data=solos,color='red')
sns.pointplot(x='kills',y='winPlacePerc',data=duos,color='black')
sns.pointplot(x='kills',y='winPlacePerc',data=squads)
plt.xlabel('Number of kills')
plt.ylabel('Win Percentage')
plt.grid()
plt.show()

In [None]:
## DBNOs are knock-downs that teammates can revive, so there are no knocks in solos.
## Similarly, assists (a teammate's involvement in a kill) cannot happen in solos.

plt.figure(figsize=(20, 10))
# DBNOs: red = duos, blue = squads
sns.pointplot(x='DBNOs', y='winPlacePerc', data=duos, color='red')
sns.pointplot(x='DBNOs', y='winPlacePerc', data=squads, color='blue')
# assists: orange = duos, black = squads
sns.pointplot(x='assists', y='winPlacePerc', data=duos, color='orange')
sns.pointplot(x='assists', y='winPlacePerc', data=squads, color='black')
# revives: pink = duos, brown = squads
sns.pointplot(x='revives', y='winPlacePerc', data=duos, color='pink')
sns.pointplot(x='revives', y='winPlacePerc', data=squads, color='brown')
# Axis label typo fixed ("Assits" -> "Assists")
plt.xlabel('Number of DBNOs/Assists/Revives')
plt.ylabel('Win Percentage')
plt.grid()
plt.show()

## Correlation between features

In [None]:
## Author's observation: the highest positive correlation is with walkDistance,
## the highest negative one with killPlace.
# Restrict to numeric columns before correlating: train also holds string columns
# (Id, groupId, matchId, matchType), and DataFrame.corr() no longer skips
# non-numeric columns silently on recent pandas.
cm = train.select_dtypes(include='number').corr()
plt.figure(figsize=(15, 15))
sns.heatmap(cm, annot=True, linewidths=.5, fmt='.1f')
plt.show()

In [None]:
## top 5 features most positively correlated with winPlacePerc
## (k = 6 because the target itself is always the first entry)
k = 6
plt.figure(figsize=(11, 11))
# Correlate numeric columns only; string identifier columns make
# DataFrame.corr() fail on recent pandas.
numeric_train = train.select_dtypes(include='number')
cols = numeric_train.corr().nlargest(k, 'winPlacePerc')['winPlacePerc'].index
pcm = np.corrcoef(train[cols].values.T)
sns.heatmap(pcm, cbar=True, annot=True, square=True, fmt='.2f', yticklabels=cols.values, xticklabels=cols.values)
plt.show()

In [None]:
# Pairwise scatter plots of the target and a handful of selected features.
cols = ['winPlacePerc', 'walkDistance', 'boosts', 'weaponsAcquired', 'damageDealt', 'killPlace']
# `height` is the current name of the per-facet size argument (formerly `size`).
sns.pairplot(train[cols], height=2.5)
plt.show()



## Feature engineering

In [None]:
## all 100 players don't actually play
train['playersJoined'] = train.groupby('matchId')['matchId'].transform('count')
train.head()

In [None]:
## When there are 100 players in a match it may be easier to find and kill someone than when there are 90.
## Kills and damage are therefore normalised by the number of players who joined the match.
## A kill in a 100-player match scores 1; in a 90-player match it scores (100-90)/100 + 1 = 1.1

In [None]:
# Scaling factor: 1 for a match with 100 players, growing by 0.01 for every missing player
norm = (100-train['playersJoined'])/100 + 1
train['killsNorm'] = train['kills'] * norm
train['damageDealtNorm'] = train['damageDealt'] * norm
# Show rows 5-7 to compare raw and normalised values
train[['playersJoined', 'kills', 'killsNorm', 'damageDealt', 'damageDealtNorm']][5:8]

In [None]:
# Combined item usage and combined distance travelled (walk + ride + swim)
train['healsAndBoosts'] = train['heals']+train['boosts']
train['totalDistance'] = train['walkDistance']+train['rideDistance']+train['swimDistance']

In [None]:
## boosts make the player run faster
## heals don't make the player run faster, but they help to stay out of the zone and loot more

In [None]:
# Items used per unit of walk distance. The +1 in the denominator avoids
# division by zero for players who never walked.
walk_plus_one = train['walkDistance'] + 1
# Assign the filled result back to the column: `train[col].fillna(0, inplace=True)`
# operates on a temporary Series and does not update `train` under pandas
# Copy-on-Write.
train['boostsPerWalkDistance'] = (train['boosts'] / walk_plus_one).fillna(0)
train['healsPerWalkDistance'] = (train['heals'] / walk_plus_one).fillna(0)
train['healsAndBoostsPerWalkDistance'] = (train['healsAndBoosts'] / walk_plus_one).fillna(0)
train[['walkDistance', 'boosts', 'boostsPerWalkDistance', 'heals', 'healsPerWalkDistance', 'healsAndBoosts', 'healsAndBoostsPerWalkDistance']][40:45]

In [None]:
# Kills per unit of walk distance (+1 avoids division by zero).
# The filled Series is assigned back instead of calling fillna(inplace=True) on
# a column selection, which does not update `train` under pandas Copy-on-Write.
train['killsPerWalkDistance'] = (train['kills'] / (train['walkDistance'] + 1)).fillna(0)
# The ten rows with the highest kills-per-distance ratio
train[['kills', 'walkDistance', 'rideDistance', 'killsPerWalkDistance', 'winPlacePerc']].sort_values(by='killsPerWalkDistance').tail(10)

In [None]:
## STRANGE
## heals>0 but walkdistance=0
## boosts>0 but walkdistance=0
## kills>0 but walkdistance=0

In [None]:
## CHEATERS
## 0 walkdistance, many kills, mostly win percent=1

In [None]:
# Team size inferred from the number of groups in the match:
# more than 50 groups -> 1 (solo), 26-50 -> 2 (duo), 25 or fewer -> 4 (squad).
# The previous `i>25 & i<=50` parsed as `i > (25 & i) <= 50` because `&` binds
# tighter than the comparisons, which mislabelled squads as duos.
num_groups = train['numGroups']
train['team'] = np.where(num_groups > 50, 1, np.where(num_groups > 25, 2, 4))

In [None]:
# Shape after adding the engineered columns above
train.shape   ## 10 features created

## Effective feature engineering

In [None]:
# Reload train from disk, discarding the columns added in the previous section
train=reload_train()

In [None]:
# Work on a copy; identifier columns and matchType are excluded from the feature list
df = train.copy()
cols_to_drop = ['Id', 'groupId', 'matchId', 'matchType']
cols_to_fit = list(filter(lambda name: name not in cols_to_drop, df.columns))

## Correlation

In [None]:
# Correlation matrix of the columns in cols_to_fit, drawn as an annotated heatmap
corr = df[cols_to_fit].corr()
plt.figure(figsize=(30,20))
sns.heatmap( corr,annot=True,square=True,xticklabels=corr.columns.values,yticklabels=corr.columns.values )
plt.show()

In [None]:
# players_joined = number of rows sharing the same groupId (i.e. the size of the
# group), attached to every row of that group via a left merge.
agg = df.groupby(['groupId']).size().to_frame('players_joined')
df = df.merge(agg,how='left',on='groupId')
df.head()

In [None]:
# Ratio features. Undefined ratios (0/0 -> NaN) and, for killPlace/maxPlace,
# divisions by zero (-> inf) are set to 0.
# The cleaned Series is assigned back to the column: calling
# fillna/replace(..., inplace=True) on `df[col]` acts on a temporary Series and
# leaves `df` unchanged under pandas Copy-on-Write.
df['headshotKills_over_kills'] = (df['headshotKills'] / df['kills']).fillna(0)
df['killPlace_over_maxPlace'] = (df['killPlace'] / df['maxPlace']).fillna(0).replace(np.inf, 0)

In [None]:
# Correlation of the new features with the target, next to killPlace and walkDistance
corr = df[['killPlace', 'walkDistance', 'players_joined', 'headshotKills_over_kills', 'killPlace_over_maxPlace', 'winPlacePerc']].corr()
plt.figure(figsize=(12,8))
sns.heatmap( corr,annot=True,square=True,xticklabels=corr.columns.values,yticklabels=corr.columns.values )
plt.show()

## Score gain

In [None]:
def train_test_split(df, test_size=0.1, random_state=None):
    """Split ``df`` into train and test parts without splitting any match.

    Whole matches (distinct ``matchId`` values) are sampled into the train
    part; every remaining match goes to the test part.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``matchId`` column.
    test_size : float, default 0.1
        Fraction of distinct matches to hold out.
    random_state : int or None, default None
        Seed for a reproducible split. With ``None`` the module-level
        ``random`` generator is used, as before.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)``.
    """
    match_ids = df['matchId'].unique().tolist()
    train_size = int(len(match_ids) * (1 - test_size))
    sampler = random if random_state is None else random.Random(random_state)
    train_match_ids = sampler.sample(match_ids, train_size)

    in_train = df['matchId'].isin(train_match_ids)
    # `~` is the boolean-mask negation operator; unary minus is not meant for booleans.
    return df[in_train], df[~in_train]

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression

In [None]:
def run_experiment(preprocess):
    """Score one feature-engineering step with a linear regression baseline.

    The training data is reloaded, ``matchType`` is dropped, ``preprocess`` is
    applied, and a ``LinearRegression`` is fitted on 90% of the matches.

    Parameters
    ----------
    preprocess : callable
        Takes the training DataFrame and returns the DataFrame to fit on.

    Returns
    -------
    float
        Mean absolute error on the held-out 10% of matches.
    """
    target = 'winPlacePerc'
    non_features = ['Id', 'groupId', 'matchId', target]

    data = reload_train().drop(columns=['matchType'])
    data = preprocess(data)

    feature_cols = [name for name in data.columns if name not in non_features]
    fit_part, val_part = train_test_split(data, 0.1)

    regressor = LinearRegression()
    regressor.fit(fit_part[feature_cols], fit_part[target])

    predictions = regressor.predict(val_part[feature_cols])
    return mean_absolute_error(val_part[target], predictions)

In [None]:
def run_experiments(preprocesses):
    """Run ``run_experiment`` for each preprocessing function and tabulate the results.

    Parameters
    ----------
    preprocesses : iterable of callables
        Each is passed to ``run_experiment``; its ``__name__`` labels the row.

    Returns
    -------
    pandas.DataFrame
        Columns ``name``, ``score`` and ``execution time``, sorted by ascending score.
    """
    rows = []
    for step in preprocesses:
        started = time.time()
        mae = run_experiment(step)
        elapsed = time.time() - started

        rows.append({
            'name': step.__name__,
            'score': mae,
            'execution time': f'{round(elapsed, 2)}s',
        })
        gc.collect()

    table = pd.DataFrame(rows, columns=['name', 'score', 'execution time'])
    return table.sort_values(by='score')

In [None]:
def original(df):
    """Baseline preprocessing step: return ``df`` unchanged."""
    return df

def items(df):
    """Add an ``items`` column (heals + boosts) to ``df`` in place and return it."""
    df['items'] = df['heals'].add(df['boosts'])
    return df

def players_in_team(df):
    """Return a copy of ``df`` with a ``players_in_team`` column.

    The value is the number of rows sharing the row's ``groupId``.
    """
    team_sizes = df.groupby(['groupId']).size()
    size_frame = team_sizes.to_frame('players_in_team')
    return df.merge(size_frame, how='left', on=['groupId'])

def norm_kills(df):
    """Add ``players_in_team`` and a scaled kill count ``killsNorm``.

    ``killsNorm = kills * ((100 - players_in_team) / 100 + 1)``.
    """
    # NOTE(review): the scale uses the size of the player's group, whereas the
    # exploratory section of this notebook scales by players per match - confirm
    # which one is intended.
    df = players_in_team(df)
    scale = 1 + (100 - df['players_in_team']) / 100
    df['killsNorm'] = df['kills'].mul(scale)
    return df
    
def norm_damage(df):
    """Add ``players_in_team`` and a scaled damage value ``damageDealtNorm``.

    ``damageDealtNorm = damageDealt * ((100 - players_in_team) / 100 + 1)``.
    """
    # NOTE(review): the scale uses the size of the player's group, whereas the
    # exploratory section of this notebook scales by players per match - confirm
    # which one is intended.
    df = players_in_team(df)
    scale = 1 + (100 - df['players_in_team']) / 100
    df['damageDealtNorm'] = df['damageDealt'].mul(scale)
    return df

def total_distance(df):
    """Add ``total_distance`` (ride + swim + walk) to ``df`` in place and return it."""
    ride_and_swim = df['rideDistance'].add(df['swimDistance'])
    df['total_distance'] = ride_and_swim.add(df['walkDistance'])
    return df

def headshotKills_over_kills(df):
    """Add ``headshotKills_over_kills`` (headshotKills / kills) to ``df`` and return it.

    Undefined ratios (0 / 0 -> NaN) are set to 0.
    """
    ratio = df['headshotKills'] / df['kills']
    # Assign the filled Series back to the column. Calling
    # ``df[col].fillna(0, inplace=True)`` acts on a temporary Series and leaves
    # ``df`` unchanged under pandas Copy-on-Write.
    df['headshotKills_over_kills'] = ratio.fillna(0)
    return df

def killPlace_over_maxPlace(df):
    """Add ``killPlace_over_maxPlace`` (killPlace / maxPlace) to ``df`` and return it.

    Undefined ratios (NaN) and divisions by zero (inf) are set to 0.
    """
    ratio = df['killPlace'] / df['maxPlace']
    # Assign the cleaned Series back to the column. The chained
    # ``df[col].fillna/replace(..., inplace=True)`` form acts on a temporary
    # Series and leaves ``df`` unchanged under pandas Copy-on-Write.
    df['killPlace_over_maxPlace'] = ratio.fillna(0).replace(np.inf, 0)
    return df

def walkDistance_over_heals(df):
    """Add ``walkDistance_over_heals`` (walkDistance / heals) to ``df`` and return it.

    Undefined ratios (NaN) and divisions by zero (inf) are set to 0.
    """
    ratio = df['walkDistance'] / df['heals']
    # Assign the cleaned Series back to the column. The chained
    # ``df[col].fillna/replace(..., inplace=True)`` form acts on a temporary
    # Series and leaves ``df`` unchanged under pandas Copy-on-Write.
    df['walkDistance_over_heals'] = ratio.fillna(0).replace(np.inf, 0)
    return df

def walkDistance_over_boosts(df):
    """Add ``walkDistance_over_boosts`` (walkDistance / boosts) to ``df`` and return it.

    Undefined ratios (NaN) and divisions by zero (inf) are set to 0.
    """
    ratio = df['walkDistance'] / df['boosts']
    # Assign the cleaned Series back to the column. The chained
    # ``df[col].fillna/replace(..., inplace=True)`` form acts on a temporary
    # Series and leaves ``df`` unchanged under pandas Copy-on-Write.
    df['walkDistance_over_boosts'] = ratio.fillna(0).replace(np.inf, 0)
    return df

def walkDistance_over_kills(df):
    """Add ``walkDistance_over_kills`` (walkDistance / kills) to ``df`` and return it.

    Undefined ratios (NaN) and divisions by zero (inf) are set to 0.
    """
    ratio = df['walkDistance'] / df['kills']
    # Assign the cleaned Series back to the column. The chained
    # ``df[col].fillna/replace(..., inplace=True)`` form acts on a temporary
    # Series and leaves ``df`` unchanged under pandas Copy-on-Write.
    df['walkDistance_over_kills'] = ratio.fillna(0).replace(np.inf, 0)
    return df

def teamwork(df):
    """Add a ``teamwork`` column (assists + revives) to ``df`` in place and return it."""
    df['teamwork'] = df['assists'].add(df['revives'])
    return df

In [None]:
# Score each single-feature preprocessing step against the `original` baseline;
# the resulting table is sorted by validation MAE (lower is better).
run_experiments([
    original,
    items,
    players_in_team,
    norm_kills,
    norm_damage,
    total_distance,
    headshotKills_over_kills,
    killPlace_over_maxPlace,
    walkDistance_over_heals,
    walkDistance_over_boosts,
    walkDistance_over_kills,
    teamwork
])

In [None]:
def _team_aggregate(df, how, suffix, rank_within_match=False):
    """Shared implementation of the ``*_by_team`` feature builders.

    Every column except ``Id``, ``groupId``, ``matchId`` and ``winPlacePerc`` is
    aggregated per (matchId, groupId) with ``how`` and merged back onto ``df``;
    the aggregated columns get ``suffix`` appended to their name.

    Parameters
    ----------
    df : pandas.DataFrame
        Player-level rows containing ``matchId`` and ``groupId``.
    how : str
        Name of the groupby aggregation ('min', 'max', 'sum', 'median', 'mean').
    suffix : str
        Suffix for the aggregated columns in the returned frame.
    rank_within_match : bool, default False
        If True, replace each team aggregate by its percentile rank among the
        teams of the same match.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.
    """
    cols_to_drop = ['Id', 'groupId', 'matchId', 'winPlacePerc']
    features = [col for col in df.columns if col not in cols_to_drop]
    agg = df.groupby(['matchId', 'groupId'])[features].agg(how)
    if rank_within_match:
        agg = agg.groupby('matchId')[features].rank(pct=True)
    return df.merge(agg, suffixes=['', suffix], how='left', on=['matchId', 'groupId'])


def min_by_team(df):
    """Append the per-team minimum of every feature as ``<feature>_min``."""
    return _team_aggregate(df, 'min', '_min')


def max_by_team(df):
    """Append the per-team maximum of every feature as ``<feature>_max``."""
    return _team_aggregate(df, 'max', '_max')


def sum_by_team(df):
    """Append the per-team sum of every feature as ``<feature>_sum``."""
    return _team_aggregate(df, 'sum', '_sum')


def median_by_team(df):
    """Append the per-team median of every feature as ``<feature>_median``."""
    return _team_aggregate(df, 'median', '_median')


def mean_by_team(df):
    """Append the per-team mean of every feature as ``<feature>_mean``."""
    return _team_aggregate(df, 'mean', '_mean')


def rank_by_team(df):
    """Append the within-match percentile rank of each team's mean as ``<feature>_mean_rank``."""
    return _team_aggregate(df, 'mean', '_mean_rank', rank_within_match=True)

In [None]:
# Score each team-level aggregation against the `original` baseline;
# the resulting table is sorted by validation MAE (lower is better).
run_experiments([
    original,
    min_by_team,
    max_by_team,
    sum_by_team,
    median_by_team,
    mean_by_team,
    rank_by_team
])

## Feature importance of tree model

In [None]:
from lightgbm import LGBMRegressor

In [None]:
# Fresh copy of the training data for the tree model, without matchType
df = reload_train()
df.drop(columns=['matchType'], inplace=True)

target = 'winPlacePerc'
# Identifier columns and the target are excluded from the feature list
cols_to_drop = ['Id', 'groupId', 'matchId', 'matchType', target]
cols_to_fit = [col for col in df.columns if col not in cols_to_drop]
# NOTE: this rebinds the notebook-wide name `train` to the 90% training split
train, val = train_test_split(df, 0.1)

In [None]:
# Settings passed to LGBMRegressor as keyword arguments
params = dict(
    n_estimators=200,
    learning_rate=0.3,
    num_leaves=30,
    objective='regression_l2',
    metric='mae',
    verbose=-1,
)

In [None]:
# Fit LightGBM on the training split, evaluating MAE on the validation split.
# NOTE(review): newer LightGBM releases appear to have replaced the `verbose`
# argument of fit() with logging callbacks - confirm against the installed version.
model = LGBMRegressor(**params)
model.fit(
    train[cols_to_fit], train[target],
    eval_set=[(val[cols_to_fit], val[target])],
    eval_metric='mae',
    verbose=20,
)

# One row per feature with its importance value, sorted ascending by value
feature_importance = pd.DataFrame(sorted(zip(model.feature_importances_, cols_to_fit)), columns=['Value','Feature'])

In [None]:
# Bar chart of the fitted model's feature importances, most important first
plt.figure(figsize=(15, 8))
sns.barplot(x="Value", y="Feature", data=feature_importance.sort_values(by="Value", ascending=False))
# The importances come from one fitted model, not from an average over
# cross-validation folds, so the title no longer claims otherwise.
plt.title('LightGBM Features')
plt.show()

## Permutation importance

In [None]:
## shuffle (permute) the values of each feature in turn and see how much the score changes
import eli5
from eli5.sklearn import PermutationImportance

In [None]:
# Permutation importance of the fitted model, computed on the validation split
perm = PermutationImportance(model, random_state=42)
perm.fit(val[cols_to_fit], val[target])
eli5.show_weights(perm, feature_names=list(cols_to_fit))

In [None]:
### permutation importance for effective eng