In [None]:
from pyspark.sql import SparkSession
import numpy as np

spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

import pyspark
from pyspark.ml import feature, regression, Pipeline
from pyspark.sql import functions as fn, Row
from pyspark import sql
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.classification import LogisticRegression

import matplotlib.pyplot as plt
import pandas as pd


In [117]:
##############################################################################
##########       Spark creat and transform data             #################
##############################################################################
## Spark Method ##
Full_data = spark.read.csv('train_V2.csv', sep=',', inferSchema=True, header=True)
# seperate match type
update_fun = (fn.when(fn.col('matchType').contains('solo'), 'solo').when(fn.col('matchType').contains('duo' or 'crash'), 'duo')
                .otherwise('squad'))
Full_data = Full_data.withColumn('matchType', update_fun)
##############################################################################
###### collect solo match
New_df_solo = Full_data.filter(Full_data.matchType == 'solo')
columns = ['Id','groupId','matchId','matchType']
New_df_solo = New_df_solo.select([col for col in New_df_solo.columns if col not in columns])
pj_sp_df_solo = New_df_solo.sample(withReplacement=False, fraction=0.1, seed=3)
##############################################################################
###### collect team match
New_df_team  = Full_data.select('*').groupby('groupId').agg(fn.sum('damageDealt').alias('total_team_damage'),
                      fn.sum('kills').alias('total_team_kills'),
                      fn.sum('killPoints').alias('team_kill_points'),
                      fn.avg('killPlace').alias('team_kill_rank'),
                      fn.avg('rankPoints').alias('team_normal_rank'),
                      fn.sum('revives').alias('team_revives'),
                      fn.sum('boosts').alias('team_boosts'),
                      fn.sum('assists').alias('total_assists'),
                      fn.sum('DBNOs').alias('team_DBNOs'),
                      fn.sum(Full_data.rideDistance + Full_data.walkDistance + Full_data.swimDistance).alias('totalDistance'))
New_df_team  = New_df_team.join(Full_data,New_df_team .groupId == Full_data.groupId)



In [118]:
##
columns = ['Id','groupId','matchId', 'roadKills','numGroups','rideDistance','walkDistance','swimDistance','kills','killPints','killPlace','rankPoints','revives','boosts','assists','DBNOs']
New_df_team = New_df_team.select([col for col in New_df_team.columns if col not in columns])

################################################################################
pj_sp_df_team = New_df_team.sample(withReplacement=False, fraction=0.1, seed=3)
##
#withColumn('solo',(fn.col('matchType') == 'solo').cast('int')).\
pj_sp_df_team = pj_sp_df_team.\
                    withColumn('duo',(fn.col('matchType') == 'duo').cast('int')).\
                    withColumn('squad',(fn.col('matchType') == 'squad').cast('int'))
pj_sp_df_team = pj_sp_df_team.drop('matchType')

##############################################################################



In [150]:
##############################################################
#Logit
##############################################################
#split the data set into five categories in 
rankfun = fn.when(fn.col('winPlacePerc')>=0.8,1).\
    when(fn.col('winPlacePerc')>=0.6,2).\
    when(fn.col('winPlacePerc')>=0.4,3).\
    when(fn.col('winPlacePerc')>=0.2,4).\
    when(fn.col('winPlacePerc')>=0,5)

In [121]:
pj_sp_df_solo = pj_sp_df_solo.withColumn('rank',rankfun).drop('winPlacePerc')
pj_sp_df_team = pj_sp_df_team.withColumn('rank',rankfun).drop('winPlacePerc')

In [133]:
from pyspark.ml.classification import LogisticRegression
#####################################################################
#### Multinomial logistic regression Model
training_df, validation_df, testing_df = pj_sp_df_solo.randomSplit([0.6, 0.3, 0.1], seed=0)
def logitReg(reg,elastic,training=training_df, validate=validation_df ):
    inputcol = training_df.columns
    inputcol.remove('rank')
    logitReg = LogisticRegression().\
        setMaxIter(10).\
        setLabelCol('rank').\
        setFeaturesCol('scaledFeatures').\
        setRegParam(reg).\
        setElasticNetParam(elastic)

    pipe_log=Pipeline(stages=[
        feature.VectorAssembler(inputCols=inputcol, outputCol='features'),
        feature.StandardScaler(withMean=True, inputCol='features', outputCol='scaledFeatures'),
        logitReg
        ]).fit(training_df)
    logSummary = pipe_log.stages[-1].summary
    accuracy = logSummary.accuracy
    falsePositiveRate = logSummary.weightedFalsePositiveRate
    truePositiveRate = logSummary.weightedTruePositiveRate
    fMeasure = logSummary.weightedFMeasure()
    precision = logSummary.weightedPrecision
    recall = logSummary.weightedRecall
    return(print("Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"      % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall)))


In [134]:
##############################################################
#Multinomial logistic regression for solo players
logitReg(0,0)

Accuracy: 0.7101302059285253
FPR: 0.07148595686560526
TPR: 0.7101302059285253
F-measure: 0.7093123366304922
Precision: 0.7090564520219402
Recall: 0.7101302059285253


In [135]:
logitReg(0.1,0)

Accuracy: 0.5847954566441962
FPR: 0.10630865907765012
TPR: 0.5847954566441962
F-measure: 0.5637513492301548
Precision: 0.5683321306081848
Recall: 0.5847954566441962


In [137]:
logitReg(0.1,0.5)

Accuracy: 0.5341444270015698
FPR: 0.1203301004949466
TPR: 0.5341444270015698
F-measure: 0.49393156272789857
Precision: 0.5197929283448036
Recall: 0.5341444270015698


In [139]:
logitReg(0.2,0)

Accuracy: 0.5527518699787607
FPR: 0.11595787169559929
TPR: 0.5527518699787607
F-measure: 0.5207064231584007
Precision: 0.5350630774086994
Recall: 0.5527518699787607


In [140]:
logitReg(0.2,0.5)

Accuracy: 0.4631083202511774
FPR: 0.14247544754878932
TPR: 0.46310832025117743
F-measure: 0.36554435373598715
Precision: 0.3403376800912541
Recall: 0.46310832025117743


In [141]:
logitReg(0.3,0)

Accuracy: 0.5326207406039338
FPR: 0.12223378482119703
TPR: 0.5326207406039338
F-measure: 0.4922864275803533
Precision: 0.5186116200689332
Recall: 0.5326207406039338


In [138]:
logitReg(0.3,0.5)

Accuracy: 0.459853172038046
FPR: 0.14667228255571435
TPR: 0.459853172038046
F-measure: 0.3392477276204374
Precision: 0.29084765995329287
Recall: 0.459853172038046


In [142]:
##############################################################
#Multinomial logistic regression for team player
training_df, validation_df, testing_df = pj_sp_df_team.randomSplit([0.6, 0.3, 0.1], seed=0)

In [143]:
logitReg(0,0)

Accuracy: 0.6121119547149467
FPR: 0.09578470859908182
TPR: 0.6121119547149467
F-measure: 0.6058559521843826
Precision: 0.6017630344012748
Recall: 0.6121119547149467


In [144]:
logitReg(0.1,0)

Accuracy: 0.5394933884421282
FPR: 0.12177519476988757
TPR: 0.5394933884421282
F-measure: 0.5055745983219475
Precision: 0.5050234520134471
Recall: 0.5394933884421282


In [145]:
logitReg(0.1,0.5)

Accuracy: 0.4922316066909265
FPR: 0.1398877932962714
TPR: 0.4922316066909266
F-measure: 0.40791933577845646
Precision: 0.4518839319135191
Recall: 0.4922316066909266


In [146]:
logitReg(0.2,0)

Accuracy: 0.5082326250056157
FPR: 0.13436617580083582
TPR: 0.5082326250056157
F-measure: 0.45714382430033074
Precision: 0.46611514562777995
Recall: 0.5082326250056157


In [147]:
logitReg(0.2,0.5)

Accuracy: 0.44944367072495023
FPR: 0.15997203041329314
TPR: 0.4494436707249502
F-measure: 0.3213524338339403
Precision: 0.26122577247813444
Recall: 0.4494436707249502


In [148]:
logitReg(0.3,0)

Accuracy: 0.4890793237192447
FPR: 0.14283811920418474
TPR: 0.4890793237192446
F-measure: 0.4233778030521245
Precision: 0.44253737407379384
Recall: 0.4890793237192446


In [149]:
logitReg(0.3,0.5)

Accuracy: 0.43306827200982373
FPR: 0.1714439436190311
TPR: 0.43306827200982373
F-measure: 0.2868102322504982
Precision: 0.24320377816828365
Recall: 0.43306827200982373
