In [21]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00507/wisdm-dataset.zip

--2020-06-21 15:38:05--  https://archive.ics.uci.edu/ml/machine-learning-databases/00507/wisdm-dataset.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 309713877 (295M) [application/x-httpd-php]
Saving to: ‘wisdm-dataset.zip.1’


2020-06-21 15:38:08 (84.4 MB/s) - ‘wisdm-dataset.zip.1’ saved [309713877/309713877]



In [22]:
! unzip wisdm-dataset.zip

Archive:  wisdm-dataset.zip
replace wisdm-dataset/WISDM-dataset-description.pdf? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [23]:
! ls wisdm-dataset/raw

phone  watch


In [27]:
import os

In [28]:
!pip install pyspark==2.4.5



In [29]:
try:
    from pyspark import SparkContext, SparkConf
    from pyspark.sql import SparkSession
except ImportError as e:
    printmd('<<<<<!!!!! Please restart your kernel after installing Apache Spark !!!!!>>>>>')

In [30]:
sc = SparkContext.getOrCreate(SparkConf().setMaster("local[*]"))

spark = SparkSession \
    .builder \
    .getOrCreate()

In [31]:
#OUR SCHEMA IS: subjectid,target,timestamp,x,y,z
from pyspark.sql.types import StructType, StructField, IntegerType,StringType,DoubleType
schema = StructType([
    StructField("subjectid",IntegerType(),True),
    StructField("activity",StringType(),True),
    StructField("timestamp",DoubleType(),True),
    StructField("x",DoubleType(),True),
    StructField("y",DoubleType(),True),
    StructField("z",StringType(),True)])#sets z value to string because last char is a ';', will remove in pandas later.


In [32]:
imported_df = None

from pyspark.sql.functions import lit #allows strings/literal to apache spark dataframes
from pyspark.sql.functions import regexp_replace
from pyspark.sql.functions import expr

raw_path = 'wisdm-dataset/raw/'
file_list_filtered = ['phone','watch']
devices = ['accel','gyro']

for category in file_list_filtered:
    for device in devices:
        path = raw_path+category+'/'+device
        data_files = os.listdir(path)
        
        for data_file in data_files:
            print(path+'/'+data_file)
            temp_df = spark.read.csv(path+'/'+data_file, schema=schema)
            temp_df = temp_df.withColumn('type',lit(category))
            temp_df = temp_df.withColumn('device',lit(device))
            
            if imported_df is None:
                imported_df = temp_df
            else:
                imported_df = imported_df.union(temp_df)
imported_df.printSchema()

wisdm-dataset/raw/phone/accel/data_1627_accel_phone.txt
wisdm-dataset/raw/phone/accel/data_1646_accel_phone.txt
wisdm-dataset/raw/phone/accel/data_1635_accel_phone.txt
wisdm-dataset/raw/phone/accel/data_1616_accel_phone.txt
wisdm-dataset/raw/phone/accel/data_1629_accel_phone.txt
wisdm-dataset/raw/phone/accel/data_1623_accel_phone.txt
wisdm-dataset/raw/phone/accel/data_1606_accel_phone.txt
wisdm-dataset/raw/phone/accel/data_1609_accel_phone.txt
wisdm-dataset/raw/phone/accel/data_1630_accel_phone.txt
wisdm-dataset/raw/phone/accel/data_1620_accel_phone.txt
wisdm-dataset/raw/phone/accel/data_1639_accel_phone.txt
wisdm-dataset/raw/phone/accel/data_1648_accel_phone.txt
wisdm-dataset/raw/phone/accel/data_1615_accel_phone.txt
wisdm-dataset/raw/phone/accel/data_1602_accel_phone.txt
wisdm-dataset/raw/phone/accel/data_1643_accel_phone.txt
wisdm-dataset/raw/phone/accel/data_1622_accel_phone.txt
wisdm-dataset/raw/phone/accel/data_1650_accel_phone.txt
wisdm-dataset/raw/phone/accel/data_1604_accel_ph

In [33]:
df = imported_df.withColumn('z', regexp_replace('z', ';', '')) #removes ';' from imported z column
df.withColumn("z", expr("CAST(z AS DOUBLE)"))

df.show()

+---------+--------+-------------------+------------+----------+----------+-----+------+
|subjectid|activity|          timestamp|           x|         y|         z| type|device|
+---------+--------+-------------------+------------+----------+----------+-----+------+
|     1627|       A|4.42607245307544E14|   4.5457916| -8.881348| -9.197556|phone| accel|
|     1627|       A|4.42607265449146E14|-0.114730835|-12.393082| -8.104095|phone| accel|
|     1627|       A|4.42607285590747E14|    1.510315|-13.205139| -6.772949|phone| accel|
|     1627|       A|4.42607305732349E14|   2.7474518|-11.763794|-4.9213257|phone| accel|
|     1627|       A|4.42607325873951E14|    3.040924| -9.194626|-5.7395935|phone| accel|
|     1627|       A|4.42607346015552E14|   4.1273346| -9.052307|-5.5689545|phone| accel|
|     1627|       A|4.42607366157154E14|    4.469742| -9.601379| -5.090271|phone| accel|
|     1627|       A|4.42607386298755E14|   2.2313538| -9.543732|-3.5087433|phone| accel|
|     1627|       A|4

In [35]:
df.describe().show()

+-------+------------------+--------+--------------------+-------------------+-------------------+------------------+--------+--------+
|summary|         subjectid|activity|           timestamp|                  x|                  y|                 z|    type|  device|
+-------+------------------+--------+--------------------+-------------------+-------------------+------------------+--------+--------+
|  count|          15630426|15630426|            15630426|           15630426|           15630426|          15630426|15630426|15630426|
|   mean|1626.2665032290226|    null|5.904472966529589E14|0.12915970569615745|-1.6078491830922514|  0.55704841220647|    null|    null|
| stddev|14.788169916235487|    null|6.034456984771618E14|  4.792883900025532| 5.0252669309156754|3.9709000621404074|    null|    null|
|    min|              1600|       A|    2.68102711042E11|          -78.47761|          -78.47761|     -0.0010000169|   phone|   accel|
|    max|              1650|       S|2.724599224

In [25]:
ls

[0m[01;34mclassifyactivities.parquet[0m/  wisdm-dataset.zip
[01;34mwisdm-dataset[0m/               wisdm-dataset.zip.1


In [18]:
ls classifyactivities.parquet/

part-00000-f963ecf1-3b5a-417b-99e3-2e540e9372d1-c000.snappy.parquet  _SUCCESS
