In [1]:
pip install plotly

You should consider upgrading via the 'pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install missingno

You should consider upgrading via the 'pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import plotly.graph_objects as go
import plotly.express as px
import numpy as np

In [4]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql import types

In [5]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

# Spark-on-Kubernetes configuration: master URL and application name first,
# then every remaining setting from one ordered table.
sparkConf = SparkConf()
sparkConf.setMaster("k8s://https://kubernetes.default.svc.cluster.local:443")
sparkConf.setAppName("spark")

spark_settings = {
    # Executor image and the host name executors use to reach the driver.
    "spark.kubernetes.container.image": "docker.io/iscp/pyspark-executor:latest",
    "spark.driver.host": "sn-deployment.spark.svc.cluster.local",
    # Resources: four single-core executors, 1g of memory for driver and executors.
    "spark.executor.instances": "4",
    "spark.executor.cores": "1",
    "spark.driver.memory": "1g",
    "spark.executor.memory": "1g",
    # Namespace, plus the `snpvc` claim mounted at /home/volumes on driver and executors.
    "spark.kubernetes.namespace": "spark",
    "spark.kubernetes.driver.volumes.persistentVolumeClaim.snpvc.mount.path": "/home/volumes",
    "spark.kubernetes.driver.volumes.persistentVolumeClaim.snpvc.options.claimName": "snpvc",
    "spark.kubernetes.executor.volumes.persistentVolumeClaim.snpvc.mount.path": "/home/volumes",
    "spark.kubernetes.executor.volumes.persistentVolumeClaim.snpvc.options.claimName": "snpvc",
    'spark.sql.adaptive.enabled': 'True',
    'spark.kubernetes.driver.pod.name': 'sn-deployment-0',
}
for setting_name, setting_value in spark_settings.items():
    sparkConf.set(setting_name, setting_value)

# Dynamic allocation is currently disabled; settings kept for reference.
# sparkConf.set('spark.dynamicAllocation.enabled', 'True')
# sparkConf.set('spark.dynamicAllocation.shuffleTracking.enabled', 'True')
# sparkConf.set('spark.dynamicAllocation.executorIdleTimeout', '2min')
# sparkConf.set('spark.dynamicAllocation.minExecutors', '1')
# sparkConf.set('spark.dynamicAllocation.maxExecutors', '1')
# sparkConf.set('spark.dynamicAllocation.schedulerBacklogTimeout', '1m')
# sparkConf.set('spark.shuffle.service.enabled', 'True')

# Create (or reuse) the session and expose its SparkContext.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()
sc = spark.sparkContext

KeyboardInterrupt: 

age: age in years
sex: sex (1 = male; 0 = female)
cp: chest pain type
        -- Value 1: typical angina
        -- Value 2: atypical angina
        -- Value 3: non-anginal pain
        -- Value 4: asymptomatic
trestbps: resting blood pressure (in mm Hg on admission to the hospital)
chol: serum cholesterol in mg/dl
fbs: fasting blood sugar > 120 mg/dl (1 = true; 0 = false)
restecg: resting electrocardiographic results
        -- Value 0: normal
        -- Value 1: having ST-T wave abnormality (T wave inversions and/or ST
                    elevation or depression of > 0.05 mV)
        -- Value 2: showing probable or definite left ventricular hypertrophy
                    by Estes' criteria
thalach: maximum heart rate achieved
exang: exercise induced angina (1 = yes; 0 = no)
oldpeak: ST depression induced by exercise relative to rest
slope: the slope of the peak exercise ST segment
        -- Value 1: upsloping
        -- Value 2: flat
        -- Value 3: downsloping
ca: number of major vessels (0-3) colored by fluoroscopy
thal: 3 = normal; 6 = fixed defect; 7 = reversible defect
num: diagnosis of heart disease (angiographic disease status)
        -- Value 0: < 50% diameter narrowing
        -- Value 1: > 50% diameter narrowing
        (in any major vessel; attributes 59 through 68 of the original dataset are vessels)


In [None]:
# id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
# id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982154846191406,40.767936706542969,-73.964630126953125,40.765602111816406,N,455
# id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415344238281,40.738563537597656,-73.999481201171875,40.731151580810547,N,663

In [None]:
# DDL-formatted schema string passed to spark.read.csv.
# Fixes: the separator after `passenger_count INT` was missing, and DATETIME
# is not a Spark SQL type (timestamps are declared as TIMESTAMP).
# NOTE(review): these are taxi-trip columns, while later cells reference
# heart-disease columns (ca, thal, num, ...) - confirm which dataset
# 'train.csv' holds and adjust this schema accordingly.
schema = """id STRING, vendor_id INT, pickup_datetime TIMESTAMP,
    dropoff_datetime TIMESTAMP, passenger_count INT,
    pickup_longitude FLOAT, pickup_latitude FLOAT,
    dropoff_longitude FLOAT, dropoff_latitude FLOAT,
    store_and_fwd_flag STRING, trip_duration INT"""

In [None]:
# Load the CSV with the explicit schema; the first line of the file is the header.
csv_read_options = dict(schema=schema, header=True, samplingRatio=.1)
df = spark.read.csv('train.csv', **csv_read_options)

In [None]:
# Print the first five rows as a text table.
df.show(5)

In [None]:
# Fetch the first two rows to the driver as Row objects.
df.head(2)

In [None]:
# Show column names, types and nullability as a tree.
df.printSchema()

In [None]:
# Shape of the frame as (row count, column count); count() triggers a Spark job.
df.count(),len(df.columns)

In [None]:
# Summary statistics of the raw frame, before duplicates are removed.
df.summary().show()

### Remove Duplicate Rows

In [None]:
# df_d: the raw frame with fully identical rows collapsed to one.
df_d = df.distinct()

### Get Summary of data

In [None]:
# Summary statistics again, now on the de-duplicated frame.
df_d.summary().show()

In [None]:
# List of (column name, type string) pairs for the de-duplicated frame.
df_d.dtypes

### Check how many unique values per column

In [None]:
# Collect the raw (not de-duplicated) Spark frame to the driver as a pandas DataFrame.
pd_df = df.toPandas()

In [None]:
# pandas view: number of distinct non-null values per column.
pd_df.nunique()

In [None]:
# Spark view of the same check: one countDistinct aggregate per column,
# all evaluated in a single select.
list_of_nunique = list(map(f.countDistinct, df_d.columns))
df_d.select(list_of_nunique).show()

### Validate columns that have bad data

In [None]:
# List the distinct values stored in 'ca' to spot invalid entries.
df_d.select('ca').distinct().show()

In [None]:
# Frequency of each 'ca' value.
df_d.groupBy('ca').count().show()

In [None]:
# Show the rows whose 'ca' value is the '?' placeholder.
df_d.where(f.col('ca') == '?').show()

In [None]:
# Turn the '?' placeholder in 'ca' into a real null, then re-check the value counts.
df_d = df_d.replace(to_replace="?", value=None, subset="ca")
ca_value_counts = df_d.groupby('ca').count()
ca_value_counts.show()

In [None]:
# Same clean-up for 'thal': '?' becomes null, then the value counts are re-checked.
df_d = df_d.replace(to_replace="?", value=None, subset="thal")
thal_value_counts = df_d.groupby("thal").count()
thal_value_counts.show()

### Find Null counts

#### List Comprehension

In [None]:
# One aggregate per column: when() yields a non-null literal only for null
# cells, so count() over it gives the number of nulls in that column.
list_col = []
for column_name in df_d.columns:
    null_marker = f.when(f.col(column_name).isNull(), column_name)
    list_col.append(f.count(null_marker).alias(column_name))

df_d.select(list_col).show()

#### Map

In [None]:
def get_null_cnt(c):
    """Return an aggregate column, aliased `c`, counting the null cells of column `c`."""
    null_marker = f.when(f.col(c).isNull(), c)
    null_total = f.count(null_marker)
    return null_total.alias(c)

# Same null count as the list-comprehension version, built with map().
map_of_nulls = map(get_null_cnt, df_d.columns)
list_null_cnt = [*map_of_nulls]

df_d.select(list_null_cnt).show()

### Get condition counts for all columns: check 0 values

In [None]:
def get_zero_cnt(c):
    """Return an aggregate column, aliased `c`, counting rows where column `c` equals '0'.

    Named for what it counts: the original cell reused the name
    `get_null_cnt`, silently replacing the null-counting helper with this
    zero-counting logic for every cell executed afterwards.
    """
    return f.count(f.when(f.col(c) == '0', c)).alias(c)

# One zero-count aggregate per column, evaluated in a single select.
list_zero_cnt = [get_zero_cnt(c) for c in df_d.columns]

df_d.select(list_zero_cnt).show()

### Fill nulls with Median

In [None]:
# Impute the nulls of 'ca' and 'thal' with each column's own median.
#
# Two fixes over the original cell:
#   * na.fill() is restricted with `subset`; without it the single median
#     value was written into the nulls of every numeric column of the frame.
#   * approxQuantile uses relativeError=0 (exact), matching the IQR cell;
#     with 0.25 the "median" could be any value between the 25th and 75th
#     percentiles.

df_d = df_d.withColumn('ca', f.col('ca').cast(types.IntegerType()))

ca_median = df_d.approxQuantile("ca", [0.5], 0)

df_d = df_d.na.fill(ca_median[0], subset=['ca'])

########################################

df_d = df_d.withColumn('thal', f.col('thal').cast(types.IntegerType()))

thal_median = df_d.approxQuantile("thal", [0.5], 0)

df_d = df_d.na.fill(thal_median[0], subset=['thal'])

########################################

# Re-count nulls per column to confirm the imputation.
list_col = [f.count(f.when(f.col(c).isNull(),c)).alias(c) for c in df_d.columns]

df_d.select(list_col).show()


### Get Mean for filling

In [None]:
# Mean of 'ca' as a plain Python value: the aggregate yields a single row
# with a single field.
ca_mean_rows = df_d.agg(f.avg("ca")).collect()
ca_mean_rows[0][0]

# df_d = df_d.replace(None,f.mean())

### Get summary again

In [None]:
# Basic statistics of the cleaned frame after imputation.
df_d.describe().show()

### Replace features with names for easier to understand plots

In [None]:
# df_p: copy of the cleaned frame with coded categorical columns replaced by
# readable labels for plotting.
#
# Fixes against the data dictionary in this notebook:
#   * exang: 1 = yes, 0 = no    (labels were inverted)
#   * fbs:   1 = true, 0 = false (labels were inverted)
#   * thal:  3 = normal          (was labelled 'fixed_defect')
df_p = df_d

# Binary target: any non-zero 'num' counts as disease.
df_p = df_p.withColumn('target', f.col('num').cast("string"))
df_p = df_p.replace(['1', '2', '3', '4'], "Disease", subset='target')
df_p = df_p.replace('0', "No_Disease", subset='target')

# Keys are the string form each column takes after cast('string'):
# '1.0'-style for the columns read as floating point, '3'-style for 'thal',
# which an earlier cell cast to integer.
label_maps = {
    'sex': {'1.0': "Male", '0.0': "Female"},
    'cp': {'1.0': 'typical_angina', '2.0': "atypical_angina",
           '3.0': 'non-anginal pain', '4.0': 'asymptomatic'},
    'exang': {'1.0': 'True', '0.0': 'False'},
    'fbs': {'1.0': 'True', '0.0': 'False'},
    'slope': {'1.0': 'upsloping', '2.0': 'flat', '3.0': 'downsloping'},
    'thal': {'3': 'normal', '6': 'fixed_defect', '7': "reversable_defect"},
}
for column_name, labels in label_maps.items():
    df_p = df_p.withColumn(column_name, f.col(column_name).cast('string'))
    df_p = df_p.replace(labels, subset=[column_name])

df_p.show()

df_p.dtypes


### Find Outliers

In [None]:
# Collect the cleaned, still numerically coded frame to pandas for plotting.
pd_df = df_d.toPandas()

In [None]:
# One box plot per column on a 2x7 grid, each with its own axes.
# `subplots` must be the boolean True: the string 'True' is not a bool, and
# newer pandas versions treat a non-bool value as an iterable of column
# groups, which is rejected for kind='box'.
pd_df.plot(kind='box', subplots=True, layout=(2, 7), sharex=False, sharey=False, figsize=(20, 10));

In [None]:
# fig = px.box(pd_df, x='target', y='chol')
# fig.show()

In [None]:
# Collect the labelled frame to pandas and compare cholesterol between the
# two target groups.
pd_df = df_p.toPandas()
sns.boxplot(data=pd_df, x='target', y='chol', width=1)

#### Create the IQR bounds
https://blog.zhaytam.com/2019/07/15/outliers-detection-in-pyspark-2-interquartile-range/

In [None]:
## Demonstrates zip and dict comprehensions: curly braces build a dictionary of
## first element : second element. Here the value is the (name, type) pair from
## df_p.dtypes; in the next cell it becomes the bounds computed by approxQuantile.
## The if clause keeps only the numerical datatypes.
numeric_types = ('double', 'int')
{name: pair for name, pair in zip(df_p.columns, df_p.dtypes) if pair[1] in numeric_types}

In [None]:
### Dictionary method: a dictionary of dictionaries, {column: {'q1': ..., 'q3': ...}},
### for nested addressing later. More complex, but gives better variable naming.
dict_bounds = {}
for column_name, column_type in df_p.dtypes:
    if column_type not in ('double', 'int'):
        continue  # quantiles only make sense for the numeric columns
    quartiles = df_p.approxQuantile(column_name, [0.25, .75], 0)
    dict_bounds[column_name] = dict(zip(['q1', 'q3'], quartiles))
dict_bounds

In [None]:
# Extend each column's entry with the outlier fences:
# min = q1 - 1.5 * IQR and max = q3 + 1.5 * IQR.
for bounds in dict_bounds.values():
    iqr = bounds['q3'] - bounds['q1']
    bounds['min'] = bounds['q1'] - iqr*1.5
    bounds['max'] = bounds['q3'] + iqr*1.5

dict_bounds

#### Apply bounds and create new column with outliers flagged

In [None]:
# Add a '<column>_outlier' flag per bounded column: the string 'True' when the
# value lies outside [min, max], 'False' otherwise.
for column_name, bounds in dict_bounds.items():
    outside_fences = ~f.col(column_name).between(bounds['min'], bounds['max'])
    flag = f.when(outside_fences, 'True').otherwise('False')
    df_p = df_p.withColumn(column_name+'_outlier', flag)

df_p.show()


#### Verify Outliers

In [None]:
# Inspect the rows flagged as cholesterol outliers.
chol_is_outlier = f.col('chol_outlier') == True
df_p.where(chol_is_outlier).show()

#### Count and Remove Outliers

In [None]:
# Number of flagged rows per '*outlier' column (one Spark count job per column).
outlier_flag_columns = [name for name in df_p.columns if name.endswith('outlier')]
{name: df_p.where(f.col(name) == True).count() for name in outlier_flag_columns}

In [None]:
# df_nout: rows that are not flagged as an outlier in any of the listed
# columns. Successive filters are equivalent to AND-ing the conditions.
outlier_flags = [
    'age_outlier',
    "trestbps_outlier",
    "chol_outlier",
    "restecg_outlier",
    "thalach_outlier",
    "oldpeak_outlier",
    "ca_outlier",
    "num_outlier",
]
df_nout = df_p
for flag_name in outlier_flags:
    df_nout = df_nout.where(f.col(flag_name) == False)
df_nout.show()