In [57]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
# machine learning libraries
from sklearn.ensemble import RandomForestRegressor

In [58]:
### Load the training set
# Show floats in rendered frames with thousands separators and two decimals.
pd.set_option('display.float_format', '{:,.2f}'.format)
# NOTE(review): absolute, machine-specific path; the notebook only runs where
# this exact file exists.
train_csv_path = '/Users/Matthew/Documents/Documents/Python/PUBG_kaggle-master/train_V2.csv'
train = pd.read_csv(train_csv_path)

In [59]:
### Clean the training set
# Replace spaces in column labels with underscores, then discard every row that
# has at least one missing value.
train.columns = train.columns.str.replace(' ', '_', regex=False)
train = train.dropna()


In [60]:
###Add Desired Columns to training data
# Ratio features. A zero denominator produces +inf (x/0) or NaN (0/0); both are
# replaced with the column mean at the end of this cell. These ratios use the
# raw counts, i.e. they are computed before the maxPlace scaling further down.
train['kills_per_meter'] = train['kills']/train['walkDistance']
train['heals_damage'] = train['heals']/train['damageDealt']
train['boosts_damage'] = train['boosts']/train['damageDealt']
train['DBNO-Kills'] = train['DBNOs']-train['kills']
train['kills*kill%'] = train['kills']*train['killPlace']

# 0/1 indicator flags: 1 when the underlying quantity is strictly positive.
train['vehicle_use'] = (train['rideDistance'] > 0).astype('int64')
train['water_use'] = (train['swimDistance'] > 0).astype('int64')
train['rotten_team'] = (train['teamKills'] > 0).astype('int64')

# Five equal-width bins over this frame's own weaponsAcquired range.
# NOTE(review): the test cell bins its own frame independently, so the same
# label can cover a different value range in train and in test.
train['weaponsbin']=pd.cut(train['weaponsAcquired'],5,labels=[1,2,3,4,5])
train['walkDistance2']=train['walkDistance']**2

# Scale the listed columns by the match's maxPlace value.
vlist=['assists','damageDealt','headshotKills','killPlace','kills','roadKills','DBNO-Kills']
for v in vlist:
    train[v]=train[v]/train['maxPlace']

# Replace non-finite ratio values with the mean of the finite ones.
# The result is assigned back to the column: calling replace/fillna with
# inplace=True on train[t] operates on a temporary selection, which warns on
# recent pandas and does nothing at all under Copy-on-Write.
tlist=['kills_per_meter','heals_damage','boosts_damage']
for t in tlist:
    finite = train[t].replace([np.inf, -np.inf], np.nan)
    m = finite.mean()  # mean() skips NaN, so only finite values contribute
    train[t] = finite.fillna(m)


In [61]:
### Remove the columns that are not used as model inputs (training set)
droplist=['Id', 'groupId', 'matchId', 'DBNOs', 'killStreaks','numGroups', 'rankPoints', 'rideDistance','swimDistance', 'teamKills', 'vehicleDestroys', 'weaponsAcquired']
# Drop them all in a single call; a missing label raises KeyError.
train = train.drop(columns=droplist)

In [62]:
### Collect the match types present in the training set
# matchType2 is matchType with the hyphens removed.
train['matchType2']=train['matchType'].str.replace('-','')
# Distinct values, in order of first appearance in the frame.
match_type = list(train['matchType2'].unique())

In [63]:
### Split the training set into one frame per match type
# train_list[i] holds the rows whose matchType2 equals match_type[i], with both
# match-type columns removed.
train_list = [
    train.loc[train['matchType2'] == type_name].drop(columns=['matchType', 'matchType2'])
    for type_name in match_type
]

In [64]:
### Load the test set
# Show floats in rendered frames with thousands separators and two decimals.
pd.set_option('display.float_format', '{:,.2f}'.format)
# NOTE(review): absolute, machine-specific path; the notebook only runs where
# this exact file exists.
test_csv_path = '/Users/Matthew/Documents/Documents/Python/PUBG_kaggle-master/test_V2.csv'
test = pd.read_csv(test_csv_path)
# Print column names, dtypes and memory usage.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1934174 entries, 0 to 1934173
Data columns (total 28 columns):
Id                 object
groupId            object
matchId            object
assists            int64
boosts             int64
damageDealt        float64
DBNOs              int64
headshotKills      int64
heals              int64
killPlace          int64
killPoints         int64
kills              int64
killStreaks        int64
longestKill        float64
matchDuration      int64
matchType          object
maxPlace           int64
numGroups          int64
rankPoints         int64
revives            int64
rideDistance       float64
roadKills          int64
swimDistance       float64
teamKills          int64
vehicleDestroys    int64
walkDistance       float64
weaponsAcquired    int64
winPoints          int64
dtypes: float64(5), int64(19), object(4)
memory usage: 413.2+ MB


In [65]:
### Clean the test set
# Replace spaces in column labels with underscores, then discard every row that
# has at least one missing value.
test.columns = test.columns.str.replace(' ', '_', regex=False)
test = test.dropna()
# Display the resulting column labels.
test.columns

Index(['Id', 'groupId', 'matchId', 'assists', 'boosts', 'damageDealt', 'DBNOs',
       'headshotKills', 'heals', 'killPlace', 'killPoints', 'kills',
       'killStreaks', 'longestKill', 'matchDuration', 'matchType', 'maxPlace',
       'numGroups', 'rankPoints', 'revives', 'rideDistance', 'roadKills',
       'swimDistance', 'teamKills', 'vehicleDestroys', 'walkDistance',
       'weaponsAcquired', 'winPoints'],
      dtype='object')

In [66]:
###Add Desired Columns to test data
# Ratio features. A zero denominator produces +inf (x/0) or NaN (0/0); both are
# replaced with the column mean at the end of this cell. These ratios use the
# raw counts, i.e. they are computed before the maxPlace scaling further down.
test['kills_per_meter'] = test['kills']/test['walkDistance']
test['heals_damage'] = test['heals']/test['damageDealt']
test['boosts_damage'] = test['boosts']/test['damageDealt']
test['DBNO-Kills'] = test['DBNOs']-test['kills']
test['kills*kill%'] = test['kills']*test['killPlace']

# 0/1 indicator flags: 1 when the underlying quantity is strictly positive.
test['vehicle_use'] = (test['rideDistance'] > 0).astype('int64')
test['water_use'] = (test['swimDistance'] > 0).astype('int64')
test['rotten_team'] = (test['teamKills'] > 0).astype('int64')

# Five equal-width bins over this frame's own weaponsAcquired range.
# NOTE(review): the training cell bins its own frame independently, so the same
# label can cover a different value range in train and in test.
test['weaponsbin']=pd.cut(test['weaponsAcquired'],5,labels=[1,2,3,4,5])
test['walkDistance2']=test['walkDistance']**2

# Scale the listed columns by the match's maxPlace value.
vlist=['assists','damageDealt','headshotKills','killPlace','kills','roadKills','DBNO-Kills']
for v in vlist:
    test[v]=test[v]/test['maxPlace']

# Replace non-finite ratio values with the mean of the finite ones.
# The result is assigned back to the column: calling replace/fillna with
# inplace=True on test[t] operates on a temporary selection, which warns on
# recent pandas and does nothing at all under Copy-on-Write.
tlist=['kills_per_meter','heals_damage','boosts_damage']
for t in tlist:
    finite = test[t].replace([np.inf, -np.inf], np.nan)
    m = finite.mean()  # mean() skips NaN, so only finite values contribute
    test[t] = finite.fillna(m)


In [67]:
### Remove the columns that are not used as model inputs (test set)
droplist=['Id', 'groupId', 'matchId', 'DBNOs', 'killStreaks','numGroups', 'rankPoints', 'rideDistance','swimDistance', 'teamKills', 'vehicleDestroys','weaponsAcquired']
# Drop them all in a single call; a missing label raises KeyError.
# NOTE(review): 'Id' is dropped here, so later cells can only refer to test
# rows through the frame's index.
test = test.drop(columns=droplist)

In [68]:
### Collect the match types present in the test set
# matchType2 is matchType with the hyphens removed.
test['matchType2']=test['matchType'].str.replace('-','')
# Distinct values, in order of first appearance in the frame.
# NOTE(review): this rebinds match_type, which until now held the training
# set's match types in the training set's order.
match_type = list(test['matchType2'].unique())

In [69]:
### Split the test set into one frame per match type
# test_list[i] holds the rows whose matchType2 equals match_type[i], with both
# match-type columns removed.
test_list = [
    test.loc[test['matchType2'] == type_name].drop(columns=['matchType', 'matchType2'])
    for type_name in match_type
]

In [70]:
###Create random jungle (better than forest)
# Fit one random forest per match type, report how well it fits its own
# training rows, and predict the matching test rows.
#
# train_list and test_list were each built in the order in which the match
# types first appear in their own frame, so pairing them by list position can
# train on one match type and predict (and label the printout with) another.
# Rebuild the name -> frame mapping for each side and pair by name instead.
train_types = list(train['matchType2'].unique())
test_types = list(test['matchType2'].unique())
assert len(train_types) == len(train_list), 'train_list is out of sync with train'
assert len(test_types) == len(test_list), 'test_list is out of sync with test'
train_by_type = dict(zip(train_types, train_list))
test_by_type = dict(zip(test_types, test_list))

# Fixed seed so that re-running the notebook reproduces the same forests.
random_forest = RandomForestRegressor(n_estimators=5, random_state=0)
predicted_parts = []
for type_name, test_part in test_by_type.items():
    if type_name not in train_by_type:
        # Nothing to learn from for this match type.
        print(type_name, ' has no training rows; skipped')
        continue
    train_part = train_by_type[type_name]
    X_train = train_part.drop('winPlacePerc', axis=1)
    Y_train = train_part['winPlacePerc']
    # Same feature columns, in the same order, as the training matrix.
    X_test = test_part[X_train.columns]
    random_forest.fit(X_train, Y_train)
    # For a regressor, score() is the R^2 coefficient; it is measured here on
    # the rows the model was fitted on, so it says nothing about generalization.
    random_forest_train_acc = random_forest.score(X_train, Y_train)
    print(type_name, ' random_forest training acuracy= ',random_forest_train_acc)
    feature_importances = pd.DataFrame(random_forest.feature_importances_, index = X_train.columns,
                                    columns=['importance']).sort_values('importance', ascending=False)
    # Two most and two least important features.
    print(feature_importances.head(2), feature_importances.tail(2))
    # Keep this match type's predictions, labelled with the test frame's index,
    # before the estimator is refitted for the next match type.
    predicted_parts.append(
        pd.Series(random_forest.predict(X_test), index=X_test.index, name='winPlacePerc')
    )

# Predictions for every test row that had a model, in test-frame row order.
if predicted_parts:
    predicted_all = pd.concat(predicted_parts).sort_index()
else:
    predicted_all = pd.Series(dtype='float64', name='winPlacePerc')
print('done')

squadfpp  random_forest training acuracy=  0.97880835076054
               importance
walkDistance2        0.54
walkDistance         0.16             importance
roadKills         0.00
weaponsbin        0.00
duofpp  random_forest training acuracy=  0.983921900725244
               importance
walkDistance         0.40
walkDistance2        0.30             importance
roadKills         0.00
weaponsbin        0.00
squad  random_forest training acuracy=  0.9902763537627867
               importance
walkDistance2        0.44
walkDistance         0.33             importance
weaponsbin        0.00
revives           0.00
solofpp  random_forest training acuracy=  0.9754331830866874
               importance
walkDistance2        0.49
killPlace            0.21             importance
roadKills         0.00
weaponsbin        0.00
solo  random_forest training acuracy=  0.9865368883053511
               importance
walkDistance2        0.72
killPlace            0.17             importance
roadKills     

In [74]:
### Generate predictions (CSV export is left commented out)
# NOTE(review): random_forest and X_test are whatever the training loop left in
# them, i.e. the model and test rows of the final match type only.
last_type_predictions = random_forest.predict(X_test)
# The frame gets a fresh 0..n-1 index and a single column labelled 0; it does
# not carry X_test's index.
predicted = pd.DataFrame(last_type_predictions)
#predicted.to_csv('path to save to', sep='\t')
predicted.head()

Unnamed: 0,0
0,0.55
1,0.83
2,0.27
3,0.27
4,0.28


In [None]:
# Debugging leftover, kept commented out: selects the rows of X_train that
# contain no NaN, +inf or -inf in any column.
#X_train[~X_train.isin([np.nan, np.inf, -np.inf]).any(1)]