# PySpark - Tarea 2

### Practice 1, Task 2

We will work with the Heterogeneity Dataset for Human Activity Recognition (HHAR), which contains motion sensor data from smartphones and smartwatches.

In [22]:
# One-time setup: uncomment to install the dependencies into the kernel's environment.
# %pip install pyspark
# %pip install findspark

In [23]:
import pyspark
import findspark
# NOTE(review): findspark.init() presumably locates the local Spark installation;
# it runs after `import pyspark` here — confirm the intended order.
findspark.init()

from pyspark.sql import SparkSession
# NOTE(review): this wildcard import is what brings LongType and DoubleType (used by the
# schema cell) into scope; it also makes the four explicit type imports below redundant.
from pyspark.sql.types import*
from pyspark.sql.types import StructField
from pyspark.sql.types import StructType
from pyspark.sql.types import StringType
from pyspark.sql.types import IntegerType
# Spark column functions, referenced as F.<name> throughout the notebook
from pyspark.sql import functions as F

In [24]:
# Start the Spark session used by every cell below (reuses a running one if present)
spark = (
    SparkSession.builder
    .appName("Merve_Homework")
    .getOrCreate()
)

## Statistics of Each File

Create a DataFrame for each provided file, defining the schema.

In [25]:
# Explicit schema shared by the four headerless sensor CSV files; every field is nullable.
# NOTE(review): in the sample output column1 looks like a row index, column2/column3 like
# timestamps and column9 like a device id — confirm against the dataset description.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("column1", IntegerType()),
        ("column2", LongType()),
        ("column3", LongType()),
        ("x", DoubleType()),
        ("y", DoubleType()),
        ("z", DoubleType()),
        ("usuario", StringType()),
        ("modelo", StringType()),
        ("column9", StringType()),
        ("clase", StringType()),
    ]
])

In [26]:
# Load the phone accelerometer CSV with the explicit schema: comma-separated, no header
# row, and the literal string "null" is read as a missing value
df = (
    spark.read
    .schema(schema)
    .option("sep", ",")
    .option("header", False)
    .option("nullValue", "null")
    .csv("small_data/Phones_accelerometer.csv")
)

In [27]:
# Print the column names and types of the phone accelerometer DataFrame
df.printSchema()

root
 |-- column1: integer (nullable = true)
 |-- column2: long (nullable = true)
 |-- column3: long (nullable = true)
 |-- x: double (nullable = true)
 |-- y: double (nullable = true)
 |-- z: double (nullable = true)
 |-- usuario: string (nullable = true)
 |-- modelo: string (nullable = true)
 |-- column9: string (nullable = true)
 |-- clase: string (nullable = true)



In [28]:
# Print a tabular preview of the phone accelerometer DataFrame
df.show()

+-------+-------------+-------------------+-------------------+------------------+-----------------+-------+------+--------+-----+
|column1|      column2|            column3|                  x|                 y|                z|usuario|modelo| column9|clase|
+-------+-------------+-------------------+-------------------+------------------+-----------------+-------+------+--------+-----+
|      0|1424696633908|1424696631913248572|          -5.958191|         0.6880646|         8.135345|      a|nexus4|nexus4_1|stand|
|      1|1424696633909|1424696631918283972|           -5.95224|         0.6702118|         8.136536|      a|nexus4|nexus4_1|stand|
|      2|1424696633918|1424696631923288855|         -5.9950867|0.6535491999999999|         8.204376|      a|nexus4|nexus4_1|stand|
|      3|1424696633919|1424696631928385290|         -5.9427185|0.6761626999999999|         8.128204|      a|nexus4|nexus4_1|stand|
|      4|1424696633929|1424696631933420691| -5.991516000000001|        0.64164734| 

For each file, generate one record per user, model, and movement class, containing the following statistics of the x, y and z readings:

- Mean

- Standard deviation

- Maximum and minimum values of the motion sequence.

In [29]:
# Per-group summary statistics (mean, standard deviation, minimum, maximum)
def calc_statistic(df, column_names, value_columns=("x", "y", "z"), group_columns=None):
    """
    Print and return per-group summary statistics of a Spark DataFrame.

    The DataFrame is first narrowed to ``column_names`` and shown. It is then grouped by
    ``group_columns`` and, for every column in ``value_columns``, the mean, standard
    deviation (``F.stddev``, Spark's sample standard deviation), minimum and maximum are
    computed, each rounded to 3 decimal places. The aggregated DataFrame is shown and returned.

    :param df: The DataFrame (Spark DataFrame) to be used. It is not modified.
    :param column_names: The columns (in List form) to keep before aggregating. Must contain
        every value column and every grouping column.
    :param value_columns: The numeric columns to summarise. Defaults to ``("x", "y", "z")``.
    :param group_columns: The columns to group by. Defaults to the last three entries of
        ``column_names``.
    :return: A DataFrame with one row per group: the grouping columns followed by
        ``mean_<col>``, ``stddev_<col>``, ``min_<col>`` and ``max_<col>`` for each value
        column (all means first, then all standard deviations, minimums and maximums).
    """
    # Keep the historical default: the grouping keys are the last three selected columns
    if group_columns is None:
        group_columns = column_names[-3:]
    group_columns = list(group_columns)

    # Work on a new name so the caller's DataFrame reference is never rebound
    selected = df.select(column_names)

    # Show the selected columns
    print("The selected columns are: ")
    selected.show()

    # Statistic-major order keeps the output layout mean_*, stddev_*, min_*, max_*
    statistics = (
        ("mean", F.mean),
        ("stddev", F.stddev),
        ("min", F.min),
        ("max", F.max),
    )
    aggregations = [
        F.round(aggregate(column), 3).alias(f"{label}_{column}")
        for label, aggregate in statistics
        for column in value_columns
    ]
    stats = selected.groupBy(group_columns).agg(*aggregations)

    # Show the aggregated statistics
    print("The statistics of the selected columns are: ")
    stats.show()

    return stats

In [30]:
# Statistics per (usuario, modelo, clase) for the phone accelerometer readings
print("Phones_accelerometer File:")
calc_statistic(df=df, column_names=["x", "y", "z", "usuario", "modelo", "clase"])

Phones_accelerometer File:
The selected columns are: 
+-------------------+------------------+-----------------+-------+------+-----+
|                  x|                 y|                z|usuario|modelo|clase|
+-------------------+------------------+-----------------+-------+------+-----+
|          -5.958191|         0.6880646|         8.135345|      a|nexus4|stand|
|           -5.95224|         0.6702118|         8.136536|      a|nexus4|stand|
|         -5.9950867|0.6535491999999999|         8.204376|      a|nexus4|stand|
|         -5.9427185|0.6761626999999999|         8.128204|      a|nexus4|stand|
| -5.991516000000001|        0.64164734|         8.135345|      a|nexus4|stand|
|          -5.965332|         0.6297455|         8.128204|      a|nexus4|stand|
| -5.991516000000001|0.6356963999999999|          8.16272|      a|nexus4|stand|
|          -5.915344|        0.63093567|         8.105591|      a|nexus4|stand|
|          -5.984375|         0.6940155|         8.067505|      a|

DataFrame[usuario: string, modelo: string, clase: string, mean_x: double, mean_y: double, mean_z: double, stddev_x: double, stddev_y: double, stddev_z: double, min_x: double, min_y: double, min_z: double, max_x: double, max_y: double, max_z: double]

In [31]:
# Load the phone gyroscope CSV with the shared schema: comma-separated, no header row,
# and the literal string "null" is read as a missing value
df2 = (
    spark.read
    .schema(schema)
    .option("sep", ",")
    .option("header", False)
    .option("nullValue", "null")
    .csv("small_data/Phones_gyroscope.csv")
)
# Print the selected columns and their statistics per (usuario, modelo, clase)
print("Phones_gyroscope File:")
calc_statistic(df=df2, column_names=["x", "y", "z", "usuario", "modelo", "clase"])

Phones_gyroscope File:
The selected columns are: 


+--------------------+--------------------+--------------------+-------+------+-----+
|                   x|                   y|                   z|usuario|modelo|clase|
+--------------------+--------------------+--------------------+-------+------+-----+
|         0.013748169|-6.25610350000000...|        -0.023376465|      a|nexus4|stand|
|0.014816283999999999|       -0.0016937256|         -0.02230835|      a|nexus4|stand|
|           0.0158844|       -0.0016937256|        -0.021240234|      a|nexus4|stand|
|         0.016952515|        -0.003829956|         -0.02017212|      a|nexus4|stand|
|           0.0158844|-0.00703430180000...|         -0.02017212|      a|nexus4|stand|
|0.014816283999999999|        -0.009170532|         -0.02230835|      a|nexus4|stand|
|        0.0116119385|        -0.013442993|         -0.02230835|      a|nexus4|stand|
|        0.0116119385|        -0.014511108|        -0.021240234|      a|nexus4|stand|
|0.009475708000000001|        -0.016647339|         -0

DataFrame[usuario: string, modelo: string, clase: string, mean_x: double, mean_y: double, mean_z: double, stddev_x: double, stddev_y: double, stddev_z: double, min_x: double, min_y: double, min_z: double, max_x: double, max_y: double, max_z: double]

In [32]:
# Load the watch accelerometer CSV with the shared schema: comma-separated, no header row,
# and the literal string "null" is read as a missing value
df3 = (
    spark.read
    .schema(schema)
    .option("sep", ",")
    .option("header", False)
    .option("nullValue", "null")
    .csv("small_data/Watch_accelerometer.csv")
)
# Print the selected columns and their statistics per (usuario, modelo, clase)
print("Watch_accelerometer File:")
calc_statistic(df=df3, column_names=["x", "y", "z", "usuario", "modelo", "clase"])

Watch_accelerometer File:
The selected columns are: 
+-----------+----------+-----------+-------+------+-----+
|          x|         y|          z|usuario|modelo|clase|
+-----------+----------+-----------+-------+------+-----+
| -0.5650316| -9.572019|-0.61411273|      a|  gear|stand|
|-0.83258367| -9.713276|-0.60693014|      a|  gear|stand|
| -1.0181342| -9.935339|-0.54408234|      a|  gear|stand|
| -1.2228385|-10.142437| -0.5662287|      a|  gear|stand|
| -1.5771804|-10.480618|-0.40282443|      a|  gear|stand|
| -2.1643584|-10.920552|-0.18375498|      a|  gear|stand|
|     -2.973|-11.063007| 0.21188685|      a|  gear|stand|
| -3.8881836| -11.08276|  0.6847417|      a|  gear|stand|
| -4.8919525|-10.890625|    1.01574|      a|  gear|stand|
| -12.600683| -7.674015| -1.1791444|      a|  gear|stand|
|  -9.214086|-4.5567646|  0.2172738|      a|  gear|stand|
|  -9.214086|-4.5567646|  0.2172738|      a|  gear|stand|
|  -9.240421| -4.104859| 0.22325931|      a|  gear|stand|
|  -9.273342|-3.729

DataFrame[usuario: string, modelo: string, clase: string, mean_x: double, mean_y: double, mean_z: double, stddev_x: double, stddev_y: double, stddev_z: double, min_x: double, min_y: double, min_z: double, max_x: double, max_y: double, max_z: double]

In [33]:
# Load the watch gyroscope CSV with the shared schema: comma-separated, no header row,
# and the literal string "null" is read as a missing value
df4 = (
    spark.read
    .schema(schema)
    .option("sep", ",")
    .option("header", False)
    .option("nullValue", "null")
    .csv("small_data/Watch_gyroscope.csv")
)
# Print the selected columns and their statistics per (usuario, modelo, clase)
print("Watch_gyroscope File:")
calc_statistic(df=df4, column_names=["x", "y", "z", "usuario", "modelo", "clase"])

Watch_gyroscope File:
The selected columns are: 
+------------+-------------+------------+-------+------+-----+
|           x|            y|           z|usuario|modelo|clase|
+------------+-------------+------------+-------+------+-----+
| -0.16218652| -0.022104237|  0.05965481|      a|  gear|stand|
| -0.18322548|  -0.06178534| 0.012516857|      a|  gear|stand|
| -0.18082865|  -0.10865697|-0.036485307|      a|  gear|stand|
| -0.14780544|  -0.15792546| -0.09853696|      a|  gear|stand|
|  0.18216023|  -0.32357407| -0.27723506|      a|  gear|stand|
| 0.075101145|  -0.25566345|   -0.216515|      a|  gear|stand|
|  0.24660872|  -0.44554687| -0.31798145|      a|  gear|stand|
| -0.14780544|  -0.15792546| -0.09853696|      a|  gear|stand|
|  -0.0561927|   -0.2082592|  -0.1587244|      a|  gear|stand|
|  -2.0383835|  -0.39468047|   1.1475562|      a|  gear|stand|
|   0.7145261|    -0.536627|  -1.2319783|      a|  gear|stand|
|   0.7145261|    -0.536627|  -1.2319783|      a|  gear|stand|
|  0.8

DataFrame[usuario: string, modelo: string, clase: string, mean_x: double, mean_y: double, mean_z: double, stddev_x: double, stddev_y: double, stddev_z: double, min_x: double, min_y: double, min_z: double, max_x: double, max_y: double, max_z: double]

#### Concatenate DataFrames with join and union

In [34]:
# Combine the phone accelerometer (df) and gyroscope (df2) readings with an inner join
selected_columns = ["x", "y", "z", "usuario", "modelo", "clase"]
phone_join_keys = ["usuario", "modelo", "clase"]
# NOTE(review): the join keys are not unique per row, so every accelerometer row is matched
# with every gyroscope row of the same (usuario, modelo, clase) group — the printed output
# repeats one gyroscope reading across many rows. Confirm this is the intended result.
phones_joined = df.select(selected_columns).join(
    df2.select(selected_columns),
    on=phone_join_keys,
    how="inner",
)
# After the join the columns are: keys, accelerometer x/y/z, gyroscope x/y/z.
# Rename them positionally so the two sensors' axes can be told apart.
phone_column_names = phone_join_keys + [
    f"{axis}_{sensor}"
    for sensor in ("accelerometer", "gyroscope")
    for axis in ("x", "y", "z")
]
df_phones_concat = phones_joined.toDF(*phone_column_names)
print("Concatenated Phones_accelerometer and Phones_gyroscope DataFrames: ")
df_phones_concat.show()

Concatenated Phones_accelerometer and Phones_gyroscope DataFrames: 
+-------+------+-----+------------------+------------------+-----------------+-----------+--------------------+------------+
|usuario|modelo|clase|   x_accelerometer|   y_accelerometer|  z_accelerometer|x_gyroscope|         y_gyroscope| z_gyroscope|
+-------+------+-----+------------------+------------------+-----------------+-----------+--------------------+------------+
|      a|nexus4|stand|        -6.0462646|         1.0082245|7.952057000000001|0.013748169|-6.25610350000000...|-0.023376465|
|      a|nexus4|stand|        -6.0272217|        0.99394226|7.929443400000001|0.013748169|-6.25610350000000...|-0.023376465|
|      a|nexus4|stand|        -6.0593567|0.9915619000000001|7.967529300000001|0.013748169|-6.25610350000000...|-0.023376465|
|      a|nexus4|stand|         -6.034363|0.9832306000000001|7.960388000000001|0.013748169|-6.25610350000000...|-0.023376465|
|      a|nexus4|stand|         -6.085541|1.00703430000000

In [35]:
# Combine the watch accelerometer (df3) and gyroscope (df4) readings with an inner join
watch_join_keys = ["usuario", "modelo", "clase"]
# NOTE(review): the join keys are not unique per row, so every accelerometer row is matched
# with every gyroscope row of the same (usuario, modelo, clase) group — the printed output
# repeats one gyroscope reading across many rows. Confirm this is the intended result.
watch_joined = df3.select(selected_columns).join(
    df4.select(selected_columns),
    on=watch_join_keys,
    how="inner",
)
# After the join the columns are: keys, accelerometer x/y/z, gyroscope x/y/z.
# Rename them positionally so the two sensors' axes can be told apart.
watch_column_names = watch_join_keys + [
    f"{axis}_{sensor}"
    for sensor in ("accelerometer", "gyroscope")
    for axis in ("x", "y", "z")
]
df_watch_concat = watch_joined.toDF(*watch_column_names)
print("Concatenated Watch_accelerometer and Watch_gyroscope DataFrames: ")
df_watch_concat.show()

Concatenated Watch_accelerometer and Watch_gyroscope DataFrames: 
+-------+------+-----+---------------+---------------+---------------+-----------+------------+-----------+
|usuario|modelo|clase|x_accelerometer|y_accelerometer|z_accelerometer|x_gyroscope| y_gyroscope|z_gyroscope|
+-------+------+-----+---------------+---------------+---------------+-----------+------------+-----------+
|      a|  gear|stand|      -9.197326|      -3.554791|     -1.0360907|-0.16218652|-0.022104237| 0.05965481|
|      a|  gear|stand|       -9.20391|     -3.5541923|      -1.055843|-0.16218652|-0.022104237| 0.05965481|
|      a|  gear|stand|      -9.199121|      -3.554791|     -1.0468647|-0.16218652|-0.022104237| 0.05965481|
|      a|  gear|stand|      -9.233239|     -3.5488055|     -1.0313023|-0.16218652|-0.022104237| 0.05965481|
|      a|  gear|stand|      -9.236232|      -3.537433|     -1.0456676|-0.16218652|-0.022104237| 0.05965481|
|      a|  gear|stand|      -9.240421|     -3.4913447|     -1.0588357|

In [36]:
# Stack the phone and watch rows into a single DataFrame (duplicates are kept).
# unionByName matches columns by name, so the result stays correct even if the two inputs
# ever list their columns in a different order; plain union() pairs columns by position only.
df_concat = df_phones_concat.unionByName(df_watch_concat)
print("Concatenated Phones_accelerometer, Phones_gyroscope, Watch_accelerometer and Watch_gyroscope DataFrames: ")
df_concat.show()

Concatenated Phones_accelerometer, Phones_gyroscope, Watch_accelerometer and Watch_gyroscope DataFrames: 
+-------+------+-----+------------------+------------------+-----------------+-----------+--------------------+------------+
|usuario|modelo|clase|   x_accelerometer|   y_accelerometer|  z_accelerometer|x_gyroscope|         y_gyroscope| z_gyroscope|
+-------+------+-----+------------------+------------------+-----------------+-----------+--------------------+------------+
|      a|nexus4|stand|        -6.0462646|         1.0082245|7.952057000000001|0.013748169|-6.25610350000000...|-0.023376465|
|      a|nexus4|stand|        -6.0272217|        0.99394226|7.929443400000001|0.013748169|-6.25610350000000...|-0.023376465|
|      a|nexus4|stand|        -6.0593567|0.9915619000000001|7.967529300000001|0.013748169|-6.25610350000000...|-0.023376465|
|      a|nexus4|stand|         -6.034363|0.9832306000000001|7.960388000000001|0.013748169|-6.25610350000000...|-0.023376465|
|      a|nexus4|sta