<a href="https://colab.research.google.com/github/lukaszlewickii/spark-labs/blob/main/classification/classification_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=49c7af8cd78174e08c554943a3c3c745302f3270714edf807ad17009a10942b0
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [19]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import when, col, monotonically_increasing_id
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, DecisionTreeClassifier, LinearSVC
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [3]:
spark = SparkSession.builder.appName("BTTS_Prediction").getOrCreate()

In [6]:
season_18_19_data = spark.read.csv('/content/england-premier-league-matches-2018-to-2019-stats.csv', header=True)

In [7]:
season_18_19_data.columns

['timestamp',
 'date_GMT',
 'status',
 'attendance',
 'home_team_name',
 'away_team_name',
 'referee',
 'Game Week',
 'Pre-Match PPG (Home)',
 'Pre-Match PPG (Away)',
 'home_ppg',
 'away_ppg',
 'home_team_goal_count',
 'away_team_goal_count',
 'total_goal_count',
 'total_goals_at_half_time',
 'home_team_goal_count_half_time',
 'away_team_goal_count_half_time',
 'home_team_goal_timings',
 'away_team_goal_timings',
 'home_team_corner_count',
 'away_team_corner_count',
 'home_team_yellow_cards',
 'home_team_red_cards',
 'away_team_yellow_cards',
 'away_team_red_cards',
 'home_team_first_half_cards',
 'home_team_second_half_cards',
 'away_team_first_half_cards',
 'away_team_second_half_cards',
 'home_team_shots',
 'away_team_shots',
 'home_team_shots_on_target',
 'away_team_shots_on_target',
 'home_team_shots_off_target',
 'away_team_shots_off_target',
 'home_team_fouls',
 'away_team_fouls',
 'home_team_possession',
 'away_team_possession',
 'Home Team Pre-Match xG',
 'Away Team Pre-Match 

In [15]:
#feature selection
selected_features = ['home_team_name', 'away_team_name', 'home_team_goal_count', 'away_team_goal_count',
                     'home_team_shots_on_target', 'away_team_shots_on_target',]

In [20]:
#adding btts column
df = season_18_19_data.select(selected_features).withColumn("btts", when((col("home_team_goal_count") > 0) & (col("away_team_goal_count") > 0), 1).otherwise(0))\
                                                .withColumn('match_id', monotonically_increasing_id())

In [21]:
df.show()

+--------------------+--------------------+--------------------+--------------------+-------------------------+-------------------------+----+--------+
|      home_team_name|      away_team_name|home_team_goal_count|away_team_goal_count|home_team_shots_on_target|away_team_shots_on_target|btts|match_id|
+--------------------+--------------------+--------------------+--------------------+-------------------------+-------------------------+----+--------+
|   Manchester United|      Leicester City|                   2|                   1|                        7|                        5|   1|       0|
|    Newcastle United|   Tottenham Hotspur|                   1|                   2|                        3|                        6|   1|       1|
|     AFC Bournemouth|        Cardiff City|                   2|                   0|                        5|                        2|   0|       2|
|              Fulham|      Crystal Palace|                   0|                   2|   

In [23]:
#converting dtypes
df = df.withColumn("home_team_goal_count", df["home_team_goal_count"].cast("int"))
df = df.withColumn("away_team_goal_count", df["away_team_goal_count"].cast("int"))
df = df.withColumn("home_team_shots_on_target", df["home_team_shots_on_target"].cast("int"))
df = df.withColumn("away_team_shots_on_target", df["away_team_shots_on_target"].cast("int"))

In [24]:
#feature vector
assembler = VectorAssembler(inputCols=['home_team_goal_count', 'away_team_goal_count',
                                       'home_team_shots_on_target', 'away_team_shots_on_target'], outputCol='btts_prediction')

In [26]:
df = assembler.transform(df)

In [27]:
(train_data, test_data) = df.randomSplit([0.8, 0.2], seed=42)

In [31]:
#models initialization
logistic_regression = LogisticRegression(labelCol="btts", featuresCol="btts_prediction", maxIter=10, regParam=0.3, elasticNetParam=0.8)
linear_svc = LinearSVC(labelCol="btts", featuresCol="btts_prediction", maxIter=10, regParam=0.1)
decision_tree = DecisionTreeClassifier(labelCol="btts", featuresCol="btts_prediction", maxDepth=5)
random_forest = RandomForestClassifier(labelCol="btts", featuresCol="btts_prediction", numTrees=10)

In [32]:
#pipelines
pipeline_lr = Pipeline(stages=[logistic_regression])
pipeline_svc = Pipeline(stages=[linear_svc])
pipeline_dt = Pipeline(stages=[decision_tree])
pipeline_rf = Pipeline(stages=[random_forest])

In [33]:
model_lr = pipeline_lr.fit(train_data)
model_svc = pipeline_svc.fit(train_data)
model_dt = pipeline_dt.fit(train_data)
model_rf = pipeline_rf.fit(train_data)

In [34]:
#predictions
predictions_lr = model_lr.transform(test_data)
predictions_svc = model_svc.transform(test_data)
predictions_dt = model_dt.transform(test_data)
predictions_rf = model_rf.transform(test_data)

In [35]:
#evaluation
evaluator = BinaryClassificationEvaluator(labelCol="btts")

accuracy_lr = evaluator.evaluate(predictions_lr)
accuracy_svc = evaluator.evaluate(predictions_svc)
accuracy_dt = evaluator.evaluate(predictions_dt)
accuracy_rf = evaluator.evaluate(predictions_rf)

#accuracy scores
print("Logistic Regression Accuracy: {:.2f}%".format(accuracy_lr * 100))
print("Linear SVC Accuracy: {:.2f}%".format(accuracy_svc * 100))
print("Decision Tree Accuracy: {:.2f}%".format(accuracy_dt * 100))
print("Random Forest Accuracy: {:.2f}%".format(accuracy_rf * 100))

#for notice: just a template, not necessary working atm :P

Logistic Regression Accuracy: 50.00%
Linear SVC Accuracy: 86.67%
Decision Tree Accuracy: 100.00%
Random Forest Accuracy: 100.00%
