# Import packages

In [1]:
!pip install numpy # if necessary 

Collecting numpy
  Downloading numpy-1.23.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)
[K     |████████████████████████████████| 17.1 MB 7.6 MB/s eta 0:00:01     |███████████████████████████▍    | 14.6 MB 6.3 MB/s eta 0:00:01     |██████████████████████████████▎ | 16.2 MB 7.6 MB/s eta 0:00:01
[?25hInstalling collected packages: numpy
Successfully installed numpy-1.23.4


In [2]:
import warnings
warnings.filterwarnings('ignore')

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

# Connection to Spark instance

In [3]:
conf = pyspark.SparkConf().setMaster('spark://172.18.0.22:7077')
spark = SparkSession \
    .builder.config(conf=conf) \
    .appName("Python") \
    .getOrCreate()
print('Submitted application!')

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/11/08 16:57:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Submitted application!


# Read csv files from container
Please download the six files "ts_data_block_$i$.csv", $i={1,...,6}$, from GoogleDrive and upload it to this container before.
You can find the file here https://drive.google.com/drive/folders/1yvU4RxPRLoy-KmEMYfy-Ibn0SjIPHD5_?usp=share_link

In [4]:
# Create an empty RDD
emp_RDD = spark.sparkContext.emptyRDD()
columns = StructType([StructField('datetime', DateType(), False),
                       StructField('pulse', FloatType(), False),
                       StructField('category', StringType(), False),
                       StructField('ts_number', StringType(), False)])

df = spark.createDataFrame(data=emp_RDD, schema=columns)

for i in range(1, 7):

    df_temp = spark.read.format("csv") \
        .option('header', True) \
        .option('multiLine', True) \
        .option('inferSchema', True) \
        .load(f"file:////data/ts_data_block_{i}.csv")
    
    print(f'Read data block number {i}.')
    
    df = df.union(df_temp)

print('All data blocks read and concatenated.')

                                                                                

Read data block number 1.


                                                                                

Read data block number 2.


                                                                                

Read data block number 3.


                                                                                

Read data block number 4.


                                                                                

Read data block number 5.


[Stage 11:>                                                         (0 + 1) / 1]

Read data block number 6.
All data blocks read and concatenated.


                                                                                

In [5]:
df.show(5)

[Stage 12:>                                                         (0 + 1) / 1]

+-------------------+-----+-----------+---------+
|           datetime|pulse|   category|ts_number|
+-------------------+-----+-----------+---------+
|2021-01-01 00:00:00| 80.0|non_athlete|      1_1|
|2021-01-02 00:00:00| 84.0|non_athlete|      1_1|
|2021-01-03 00:00:00| 79.0|non_athlete|      1_1|
|2021-01-04 00:00:00| 83.0|non_athlete|      1_1|
|2021-01-05 00:00:00| 78.0|non_athlete|      1_1|
+-------------------+-----+-----------+---------+
only showing top 5 rows



                                                                                

In [6]:
# print(f'Dataframe consists of {df.count()} rows.')

# Applying Machine Learning Classification Model on time series data.

## Feature Engineering

### Groupby time series data by the columns 'ts_number' and 'category' and apply descriptive statistic measures on the column 'pulse' (analogue to pandas' describe() method)

In [7]:
# Define function for calculation descriptive statistics.
def groupby_describe(df, groupby_col, stat_col):
    out = df.groupby(groupby_col).agg(
        F.mean(stat_col).alias("mean"),
        F.stddev(stat_col).alias("std"),
        F.min(stat_col).alias("min"),
        F.expr(f"percentile({stat_col}, array(0.25))")[0].alias("low_quart"),
        F.expr(f"percentile({stat_col}, array(0.5))")[0].alias("median"),
        F.expr(f"percentile({stat_col}, array(0.75))")[0].alias("up_quart"),
        F.max(stat_col).alias("max"),
    )
    return out

df_stats = groupby_describe(df, ['ts_number', 'category'], 'pulse')

In [8]:
df_stats.show(5)

[Stage 15:>                                                         (0 + 1) / 1]

+---------+-----------+-----------------+------------------+----+---------+------+--------+----+
|ts_number|   category|             mean|               std| min|low_quart|median|up_quart| max|
+---------+-----------+-----------------+------------------+----+---------+------+--------+----+
| 10000_11|    athlete|62.51111111111111|3.2987721394429537|54.0|     60.0|  62.5|    65.0|71.0|
|  10000_9|pro_athlete|52.56666666666667|2.4588752338681874|46.0|     51.0|  52.0|    54.0|59.0|
| 10001_31|    athlete|65.82222222222222| 6.180029049380432|51.0|     62.0|  66.5|    71.0|79.0|
|  10002_2|    athlete|63.48888888888889| 2.366642928614402|59.0|     62.0|  63.5|    65.0|70.0|
| 10002_38|    athlete|58.93333333333333| 2.021235577172282|54.0|     58.0|  59.0|    60.0|64.0|
+---------+-----------+-----------------+------------------+----+---------+------+--------+----+
only showing top 5 rows



                                                                                

In [9]:
#print(f'Dataframe consists of {df_stats.count()} rows.')

### Encoding "category" column and store as new column 'Target'

In [10]:
catEncoder = StringIndexer(inputCol='category', outputCol='Target').fit(df_stats)
df_stats = catEncoder.transform(df_stats)

                                                                                

In [11]:
df_stats.show(5)

[Stage 24:>                                                         (0 + 1) / 1]

+---------+-----------+-----------------+------------------+----+---------+------+--------+----+------+
|ts_number|   category|             mean|               std| min|low_quart|median|up_quart| max|Target|
+---------+-----------+-----------------+------------------+----+---------+------+--------+----+------+
| 10000_11|    athlete|62.51111111111111|3.2987721394429537|54.0|     60.0|  62.5|    65.0|71.0|   0.0|
|  10000_9|pro_athlete|52.56666666666667|2.4588752338681874|46.0|     51.0|  52.0|    54.0|59.0|   2.0|
| 10001_31|    athlete|65.82222222222222| 6.180029049380432|51.0|     62.0|  66.5|    71.0|79.0|   0.0|
|  10002_2|    athlete|63.48888888888889| 2.366642928614402|59.0|     62.0|  63.5|    65.0|70.0|   0.0|
| 10002_38|    athlete|58.93333333333333| 2.021235577172282|54.0|     58.0|  59.0|    60.0|64.0|   0.0|
+---------+-----------+-----------------+------------------+----+---------+------+--------+----+------+
only showing top 5 rows



                                                                                

### Transform features to a vector

In [12]:
print("Transform features to vector and store as 'features':")
required_features = ['mean', 'std', 'min', 'low_quart', 'median', 'up_quart', 'max']

vec_assembler = VectorAssembler(inputCols=required_features, outputCol='features')
df_stats_vec = vec_assembler.transform(df_stats)

Transform features to vector and store as 'features':


In [13]:
df_stats_vec.show(5)

[Stage 27:>                                                         (0 + 1) / 1]

+---------+-----------+-----------------+------------------+----+---------+------+--------+----+------+--------------------+
|ts_number|   category|             mean|               std| min|low_quart|median|up_quart| max|Target|            features|
+---------+-----------+-----------------+------------------+----+---------+------+--------+----+------+--------------------+
| 10000_11|    athlete|62.51111111111111|3.2987721394429537|54.0|     60.0|  62.5|    65.0|71.0|   0.0|[62.5111111111111...|
|  10000_9|pro_athlete|52.56666666666667|2.4588752338681874|46.0|     51.0|  52.0|    54.0|59.0|   2.0|[52.5666666666666...|
| 10001_31|    athlete|65.82222222222222| 6.180029049380432|51.0|     62.0|  66.5|    71.0|79.0|   0.0|[65.8222222222222...|
|  10002_2|    athlete|63.48888888888889| 2.366642928614402|59.0|     62.0|  63.5|    65.0|70.0|   0.0|[63.4888888888888...|
| 10002_38|    athlete|58.93333333333333| 2.021235577172282|54.0|     58.0|  59.0|    60.0|64.0|   0.0|[58.9333333333333...|


                                                                                

### Split data set into training (70%) and test data set (30%)

In [14]:
train_df, test_df = df_stats_vec.randomSplit([0.7, 0.3], seed=12345)

In [15]:
train_df.show(5)

[Stage 30:>                                                         (0 + 1) / 1]

+---------+-----------+-----------------+------------------+----+---------+------+--------+----+------+--------------------+
|ts_number|   category|             mean|               std| min|low_quart|median|up_quart| max|Target|            features|
+---------+-----------+-----------------+------------------+----+---------+------+--------+----+------+--------------------+
| 10000_11|    athlete|62.51111111111111|3.2987721394429537|54.0|     60.0|  62.5|    65.0|71.0|   0.0|[62.5111111111111...|
|  10000_9|pro_athlete|52.56666666666667|2.4588752338681874|46.0|     51.0|  52.0|    54.0|59.0|   2.0|[52.5666666666666...|
| 10001_31|    athlete|65.82222222222222| 6.180029049380432|51.0|     62.0|  66.5|    71.0|79.0|   0.0|[65.8222222222222...|
|  10002_2|    athlete|63.48888888888889| 2.366642928614402|59.0|     62.0|  63.5|    65.0|70.0|   0.0|[63.4888888888888...|
| 10002_38|    athlete|58.93333333333333| 2.021235577172282|54.0|     58.0|  59.0|    60.0|64.0|   0.0|[58.9333333333333...|


                                                                                

In [16]:
#print(f"Number of train dataset: {train_df.count()}")
#print(f"Number of test  dataset: {test_df.count()}")

## Apply Multinomial Logistic Regression as Classification Model

### Apply Logistic Regression model based on training data set and predict category on test data set

In [17]:
lr = LogisticRegression(featuresCol='features', labelCol='Target')

In [18]:
lr_model = lr.fit(train_df)



### Provide prediction based on  trained model.

In [19]:
y_pred = lr_model.transform(test_df)

### Show prediction vs. true values on test data set.

In [20]:
df_target_vs_prediction = y_pred.select('Target', 'prediction')

In [21]:
df_target_vs_prediction.show(10)

[Stage 375:>                                                        (0 + 1) / 1]

+------+----------+
|Target|prediction|
+------+----------+
|   1.0|       1.0|
|   1.0|       1.0|
|   2.0|       2.0|
|   2.0|       2.0|
|   0.0|       0.0|
|   0.0|       0.0|
|   1.0|       1.0|
|   2.0|       2.0|
|   0.0|       0.0|
|   1.0|       1.0|
+------+----------+
only showing top 10 rows



                                                                                

## Model Evaluation

### Model evaluation measures

### Confusion matrix

In [22]:
metrics = MulticlassMetrics(df_target_vs_prediction.rdd.map(tuple))
print(metrics.confusionMatrix().toArray())



[[135481.  10054.   3546.]
 [ 10317. 140633.      0.]
 [  5488.      0. 145935.]]


                                                                                

### Evaluation measures

In [23]:
multi_evaluator_acc = MulticlassClassificationEvaluator(labelCol='Target', metricName='accuracy')
print(f'Prediction Accuracy: {multi_evaluator_acc.evaluate(y_pred)}')
multi_evaluator_prec = MulticlassClassificationEvaluator(labelCol='Target', metricName='precisionByLabel')
print(f'Prediction Precision: {multi_evaluator_prec.evaluate(y_pred)}')
multi_evaluator_rec = MulticlassClassificationEvaluator(labelCol='Target', metricName='recallByLabel')
print(f'Prediction Recall: {multi_evaluator_rec.evaluate(y_pred)}')
multi_evaluator_f1 = MulticlassClassificationEvaluator(labelCol='Target', metricName='f1')
print(f'Prediction F1-Score: {multi_evaluator_f1.evaluate(y_pred)}')

                                                                                

Prediction Accuracy: 0.9348660107120548


                                                                                

Prediction Precision: 0.9087744246416377


                                                                                

Prediction Recall: 0.8955289980566609




Prediction F1-Score: 0.9347111814071491


                                                                                