# DSE 230 - Final Project Notebook - Biometric Data - Classification

In [1]:
### package requirements
# numpy v.


In [2]:
import pandas as pd
import numpy as np
from scipy.io import arff
from glob import glob
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.classification import LogisticRegression, LinearSVC, OneVsRest, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [3]:
# go get data

#!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00507/wisdm-dataset.zip
#!unzip -o -j "wisdm-dataset.zip" "wisdm-dataset/arff_files/phone/accel/data_1600_accel_phone.arff"
#!unzip -o -j "wisdm-dataset.zip" "wisdm-dataset/arff_files/*"

In [4]:
glob('data_*_accel_watch.arff')

['data_1632_accel_watch.arff',
 'data_1621_accel_watch.arff',
 'data_1619_accel_watch.arff',
 'data_1639_accel_watch.arff',
 'data_1644_accel_watch.arff',
 'data_1602_accel_watch.arff',
 'data_1603_accel_watch.arff',
 'data_1635_accel_watch.arff',
 'data_1631_accel_watch.arff',
 'data_1638_accel_watch.arff',
 'data_1627_accel_watch.arff',
 'data_1606_accel_watch.arff',
 'data_1604_accel_watch.arff',
 'data_1650_accel_watch.arff',
 'data_1618_accel_watch.arff',
 'data_1640_accel_watch.arff',
 'data_1605_accel_watch.arff',
 'data_1613_accel_watch.arff',
 'data_1615_accel_watch.arff',
 'data_1622_accel_watch.arff',
 'data_1642_accel_watch.arff',
 'data_1623_accel_watch.arff',
 'data_1608_accel_watch.arff',
 'data_1612_accel_watch.arff',
 'data_1607_accel_watch.arff',
 'data_1637_accel_watch.arff',
 'data_1620_accel_watch.arff',
 'data_1646_accel_watch.arff',
 'data_1601_accel_watch.arff',
 'data_1617_accel_watch.arff',
 'data_1641_accel_watch.arff',
 'data_1648_accel_watch.arff',
 'data_1

In [5]:
def load_arffs(wildcard_path):
    arff_list = glob(wildcard_path)
    df = pd.DataFrame()
    for file in arff_list:
        data = arff.loadarff(file)
        df = pd.concat([df, pd.DataFrame(data[0])])
    print(df.shape)
    return df

In [6]:
def make_dataframes():   
    
    global df_accel_watch, df_accel_phone, df_gyro_watch, df_gyro_phone
    
    df_accel_watch = load_arffs('data_*_accel_watch.arff')
    df_accel_phone = load_arffs('data_*_accel_phone.arff')
    df_gyro_watch = load_arffs('data_*_gyro_watch.arff')
    df_gyro_phone = load_arffs('data_*_gyro_phone.arff')

In [7]:
def data_cleanup(df):
    df.columns = [col.replace('"','') for col in df.columns.tolist()]
    df['ACTIVITY'] = [x.decode('utf-8') for x in df['ACTIVITY']]
    df['class'] = [x.decode('utf-8') for x in df['class']]
    df['label'] = df['ACTIVITY'].astype('category').cat.codes.astype('int')
    return df

In [8]:
def run_cleanup():
    
    global df_accel_watch, df_accel_phone, df_gyro_watch, df_gyro_phone
    
    df_accel_watch = data_cleanup(df_accel_watch)
    df_accel_phone = data_cleanup(df_accel_phone)
    df_gyro_watch = data_cleanup(df_gyro_watch)
    df_gyro_phone = data_cleanup(df_gyro_phone)

In [9]:
def add_data_descriptive_columns():
    df_accel_watch['device_type'], df_accel_watch['sensor_type'] = ('watch', 'accelerometer')
    df_accel_phone['device_type'], df_accel_phone['sensor_type'] = ('phone', 'accelerometer')
    df_gyro_watch['device_type'], df_gyro_watch['sensor_type'] = ('watch', 'gyroscope')
    df_gyro_phone['device_type'], df_gyro_phone['sensor_type'] = ('phone', 'gyroscope')

In [10]:
def spark_preprocessing(spark_df):
    # remove na values
    before_drop = spark_df.count()
    spark_df = spark_df.dropna(how='any')
    after_drop = spark_df.count()

    print('{} rows dropped for having NA values'.format(before_drop - after_drop))

    # create feature vector column
    features = spark_df.columns[1:-4]
    assembler = VectorAssembler(inputCols=features, outputCol='featureVector')
    spark_df = assembler.transform(spark_df)

    # split data into test and training
    train_df, test_df = spark_df.randomSplit([.8,.2], seed=42)

    # scale input data
    scaler = StandardScaler(inputCol='featureVector', outputCol='featureVector_scaled', withStd=True, withMean=True)
    scalerModel = scaler.fit(train_df)

    train_df = scalerModel.transform(train_df)
    test_df = scalerModel.transform(test_df)
    
    return train_df, test_df

In [11]:
make_dataframes()
run_cleanup()
add_data_descriptive_columns()

(18211, 93)
(23074, 93)
(16533, 93)
(17281, 93)


In [12]:
df_gyro_phone.columns

Index(['ACTIVITY', 'X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9',
       'Y0', 'Y1', 'Y2', 'Y3', 'Y4', 'Y5', 'Y6', 'Y7', 'Y8', 'Y9', 'Z0', 'Z1',
       'Z2', 'Z3', 'Z4', 'Z5', 'Z6', 'Z7', 'Z8', 'Z9', 'XAVG', 'YAVG', 'ZAVG',
       'XPEAK', 'YPEAK', 'ZPEAK', 'XABSOLDEV', 'YABSOLDEV', 'ZABSOLDEV',
       'XSTANDDEV', 'YSTANDDEV', 'ZSTANDDEV', 'XVAR', 'YVAR', 'ZVAR', 'XMFCC0',
       'XMFCC1', 'XMFCC2', 'XMFCC3', 'XMFCC4', 'XMFCC5', 'XMFCC6', 'XMFCC7',
       'XMFCC8', 'XMFCC9', 'XMFCC10', 'XMFCC11', 'XMFCC12', 'YMFCC0', 'YMFCC1',
       'YMFCC2', 'YMFCC3', 'YMFCC4', 'YMFCC5', 'YMFCC6', 'YMFCC7', 'YMFCC8',
       'YMFCC9', 'YMFCC10', 'YMFCC11', 'YMFCC12', 'ZMFCC0', 'ZMFCC1', 'ZMFCC2',
       'ZMFCC3', 'ZMFCC4', 'ZMFCC5', 'ZMFCC6', 'ZMFCC7', 'ZMFCC8', 'ZMFCC9',
       'ZMFCC10', 'ZMFCC11', 'ZMFCC12', 'XYCOS', 'XZCOS', 'YZCOS', 'XYCOR',
       'XZCOR', 'YZCOR', 'RESULTANT', 'class', 'label', 'device_type',
       'sensor_type'],
      dtype='object')

In [13]:
# combine all data frames and convert to numpy array

df = pd.concat([df_accel_watch, df_accel_phone, df_gyro_watch, df_gyro_phone])
df.shape

(75099, 96)

In [14]:
# Initialize Spark Session
conf = pyspark.SparkConf().setAll([
    ('spark.master', 'local[*]'),
    ('spark.app.name', 'PySpark DSE230 Final')])

try: 
    if type(spark) == 'pyspark.sql.session.SparkSession':
        spark.stop()
        spark = SparkSession.builder.config(conf=conf).getOrCreate()
except:
    spark = SparkSession.builder.config(conf=conf).getOrCreate()

    print (spark.version, pyspark.version.__version__)

3.1.1 3.1.1


In [15]:
# convert Pandas dataframe to Spark dataframe

spark_df = spark.createDataFrame(df)

#from pyspark.ml.feature import StringIndexer
#stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
#si_model = stringIndexer.fit(df)
#td = si_model.transform(df)


In [16]:
# separate data into individual descriptives data frames

spark_df_accel_watch = spark_df.select('*').where((spark_df.device_type == 'watch') & (spark_df.sensor_type == 'accelerometer'))
spark_df_gyro_watch = spark_df.select('*').where((spark_df.device_type == 'watch') & (spark_df.sensor_type == 'gyroscope'))
spark_df_accel_phone = spark_df.select('*').where((spark_df.device_type == 'phone') & (spark_df.sensor_type == 'accelerometer'))
spark_df_gyro_phone = spark_df.select('*').where((spark_df.device_type == 'phone') & (spark_df.sensor_type == 'gyroscope'))

In [17]:
# conduct data pre-processing in Spark

train_df_accel_watch, test_df_accel_watch = spark_preprocessing(spark_df_accel_watch)
train_df_gyro_watch, test_df_gyro_watch = spark_preprocessing(spark_df_gyro_watch)
train_df_accel_phone, test_df_accel_phone = spark_preprocessing(spark_df_accel_phone)
train_df_gyro_phone, test_df_gyro_phone = spark_preprocessing(spark_df_gyro_phone)

0 rows dropped for having NA values
0 rows dropped for having NA values
0 rows dropped for having NA values
0 rows dropped for having NA values


In [18]:
train_df_accel_watch.show(1)

+--------+---+---+---+---+---+----+----+----+----+---+----+----+----+----+---+---+---+---+---+---+-----+-----+-----+-----+---+---+---+---+---+---+-------+--------+--------+-----+-------+-------+---------+---------+---------+---------+---------+---------+--------+-------+------+-------+-------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+-------+--------+--------+--------+--------+--------+--------+--------+---------+---------+--------+--------+---------+--------+---------+-----+-----+-----------+-------------+--------------------+--------------------+
|ACTIVITY| X0| X1| X2| X3| X4|  X5|  X6|  X7|  X8| X9|  Y0|  Y1|  Y2|  Y3| Y4| Y5| Y6| Y7| Y8| Y9|   Z0|   Z1|   Z2|   Z3| Z4| Z5| Z6| Z7| Z8| Z9|   XAVG|    YAVG|    ZAVG|XPEAK|  YPEAK|  ZPEAK|XABSOLDEV|YABSOLDEV|ZABSOLDEV|XS

In [20]:
# Classification - One v. Rest - Logistic Regression

train_data = train_df_accel_watch
test_data = test_df_accel_watch
classifier = LogisticRegression(maxIter=10000)
feature_col = 'featureVector_scaled'
label_col = 'label'
evaluation_metric = 'accuracy'

train_data = train_data.select(feature_col, label_col). \
                        withColumnRenamed(feature_col, 'features'). \
                        withColumnRenamed(label_col, 'label')
test_data = test_data.select(feature_col, label_col). \
                      withColumnRenamed(feature_col, 'features'). \
                      withColumnRenamed(label_col, 'label')

#train_data = train_data.select('*').where((train_data.label == 'A') | (train_data.label == 'B'))
%% time

ovr = OneVsRest(classifier=classifier)
model = ovr.fit(train_data)
eval_data = model.transform(test_data)

evaluator = MulticlassClassificationEvaluator(metricName=evaluation_metric)
evaluator.evaluate(eval_data)

0.7010596765197992

In [21]:
classifier=LinearSVC(maxIter=100)

%% time

ovr = OneVsRest(classifier=classifier)
model = ovr.fit(train_data)
eval_data = model.transform(test_data)

evaluator = MulticlassClassificationEvaluator(metricName=evaluation_metric)
evaluator.evaluate(eval_data)

0.660345789180145

In [None]:
https://spark.apache.org/docs/2.2.0/api/python/pyspark.ml.html#pyspark.ml.classification.RandomForestClassifier

In [None]:
train_data.show(1)

In [None]:
# train model - classifer

In [None]:
# generate predictions and score

In [None]:
# visualize prediction / model performance

In [None]:
# load data into dataframe for cleanup

data = arff.loadarff('data_1600_accel_phone.arff')
df = pd.DataFrame(data[0])
df.shape

In [None]:
# data cleanup in Pandas

df.columns = [col.replace('"','') for col in df.columns.tolist()]
df['ACTIVITY'] = [x.decode('utf-8') for x in df['ACTIVITY']]
df['class'] = [x.decode('utf-8') for x in df['class']]

df.head()

In [None]:
print(df.iloc[0,:].tolist())

In [None]:
df.iloc[0,1:31].sum()

In [None]:
df.columns

In [None]:


arr = df.to_array()

In [None]:
#df.ACTIVITY.decode('utf-8')
#[x.replace("b'", "") for x in df.ACTIVITY]
[x.decode('utf-8') for x in df.ACTIVITY]

In [None]:
type(df.ACTIVITY[1])

In [None]:
data[0]