## Install pyspark

In [1]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.1.2.tar.gz (212.4 MB)
[K     |████████████████████████████████| 212.4 MB 67 kB/s 
[?25hCollecting py4j==0.10.9
  Downloading py4j-0.10.9-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 57.1 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.2-py2.py3-none-any.whl size=212880768 sha256=090ec67c4320c6ea13bda829efa0a6ed1f26379382f362bc77ca2bf0ff247521
  Stored in directory: /root/.cache/pip/wheels/a5/0a/c1/9561f6fecb759579a7d863dcd846daaa95f598744e71b02c77
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.2


In [2]:
import pyspark
import numpy as np
import pandas as pd
import glob
from sklearn.datasets import load_wine
# Report the installed PySpark version so the run can be reproduced.
print(pyspark.__version__)

3.1.2


## Sample dataset

In [3]:
# Fetch the wine dataset bundle and unpack the pieces used to build the frame.
bundle = load_wine()
data, columns, target = bundle.data, bundle.feature_names, bundle.target

# Build the feature frame and append the class label as a trailing `target` column.
sample_df = pd.DataFrame(data, columns=columns).assign(target=target)
sample_df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0


In [4]:
# Write the frame to CSV, leaving out the pandas index column.
csv_path = 'dataset.csv'
sample_df.to_csv(csv_path, index=False)

# List the CSV files in the working directory to confirm the file exists.
glob.glob('./*.csv')

['./dataset.csv']

## Create Session

In [5]:
from pyspark.sql import SparkSession

# Start (or reuse) a session on the 'local' master under the application name "Karma".
spark = (
    SparkSession.builder
    .master('local')
    .appName("Karma")
    .getOrCreate()
)

In [6]:
# Evaluate the session object so the notebook displays its representation.
spark

## Read dataset

In [7]:
# Read the CSV with the reader's default options: columns are named _c0, _c1, ...
# and the header line is returned as an ordinary data row (see the output).
df = spark.read.csv('./dataset.csv')
df.show()

+-------+----------+----+-----------------+---------+-------------+----------+--------------------+---------------+---------------+----+--------------------+-------+------+
|    _c0|       _c1| _c2|              _c3|      _c4|          _c5|       _c6|                 _c7|            _c8|            _c9|_c10|                _c11|   _c12|  _c13|
+-------+----------+----+-----------------+---------+-------------+----------+--------------------+---------------+---------------+----+--------------------+-------+------+
|alcohol|malic_acid| ash|alcalinity_of_ash|magnesium|total_phenols|flavanoids|nonflavanoid_phenols|proanthocyanins|color_intensity| hue|od280/od315_of_di...|proline|target|
|  14.23|      1.71|2.43|             15.6|    127.0|          2.8|      3.06|                0.28|           2.29|           5.64|1.04|                3.92| 1065.0|     0|
|   13.2|      1.78|2.14|             11.2|    100.0|         2.65|      2.76|                0.26|           1.28|           4.38|1.05

In [8]:
# Configure the reader to take column names from the first line of the file.
header_reader = spark.read.option('header', 'true')
df = header_reader.csv('./dataset.csv')
df.show()

+-------+----------+----+-----------------+---------+-------------+----------+--------------------+---------------+---------------+----+----------------------------+-------+------+
|alcohol|malic_acid| ash|alcalinity_of_ash|magnesium|total_phenols|flavanoids|nonflavanoid_phenols|proanthocyanins|color_intensity| hue|od280/od315_of_diluted_wines|proline|target|
+-------+----------+----+-----------------+---------+-------------+----------+--------------------+---------------+---------------+----+----------------------------+-------+------+
|  14.23|      1.71|2.43|             15.6|    127.0|          2.8|      3.06|                0.28|           2.29|           5.64|1.04|                        3.92| 1065.0|     0|
|   13.2|      1.78|2.14|             11.2|    100.0|         2.65|      2.76|                0.26|           1.28|           4.38|1.05|                         3.4| 1050.0|     0|
|  13.16|      2.36|2.67|             18.6|    101.0|          2.8|      3.24|                 

In [9]:
# Print the schema.
# Without schema inference every column is read as string, even though the values
# are numeric; the numeric columns should be double.
df.printSchema()

root
 |-- alcohol: string (nullable = true)
 |-- malic_acid: string (nullable = true)
 |-- ash: string (nullable = true)
 |-- alcalinity_of_ash: string (nullable = true)
 |-- magnesium: string (nullable = true)
 |-- total_phenols: string (nullable = true)
 |-- flavanoids: string (nullable = true)
 |-- nonflavanoid_phenols: string (nullable = true)
 |-- proanthocyanins: string (nullable = true)
 |-- color_intensity: string (nullable = true)
 |-- hue: string (nullable = true)
 |-- od280/od315_of_diluted_wines: string (nullable = true)
 |-- proline: string (nullable = true)
 |-- target: string (nullable = true)



In [10]:
# Re-read the file with a header row and let Spark infer each column's type.
df = spark.read.csv(
    './dataset.csv',
    header=True,
    inferSchema=True,
)
df.printSchema()

root
 |-- alcohol: double (nullable = true)
 |-- malic_acid: double (nullable = true)
 |-- ash: double (nullable = true)
 |-- alcalinity_of_ash: double (nullable = true)
 |-- magnesium: double (nullable = true)
 |-- total_phenols: double (nullable = true)
 |-- flavanoids: double (nullable = true)
 |-- nonflavanoid_phenols: double (nullable = true)
 |-- proanthocyanins: double (nullable = true)
 |-- color_intensity: double (nullable = true)
 |-- hue: double (nullable = true)
 |-- od280/od315_of_diluted_wines: double (nullable = true)
 |-- proline: double (nullable = true)
 |-- target: integer (nullable = true)



## Slicing

### Column

In [11]:
# Project one column and preview its first five rows.
single = df.select(df['alcohol'])
single.show(n=5)

+-------+
|alcohol|
+-------+
|  14.23|
|   13.2|
|  13.16|
|  14.37|
|  13.24|
+-------+
only showing top 5 rows



In [12]:
# Project two columns (given as separate arguments) and preview five rows.
multiple = df.select("alcohol", "malic_acid")
multiple.show(n=5)

+-------+----------+
|alcohol|malic_acid|
+-------+----------+
|  14.23|      1.71|
|   13.2|      1.78|
|  13.16|      2.36|
|  14.37|      1.95|
|  13.24|      2.59|
+-------+----------+
only showing top 5 rows



## Describe

In [13]:
# Build the summary-statistics frame for the dataset, then print it.
summary = df.describe()
summary.show()

+-------+------------------+------------------+------------------+-----------------+------------------+------------------+------------------+--------------------+------------------+-----------------+-------------------+----------------------------+-----------------+------------------+
|summary|           alcohol|        malic_acid|               ash|alcalinity_of_ash|         magnesium|     total_phenols|        flavanoids|nonflavanoid_phenols|   proanthocyanins|  color_intensity|                hue|od280/od315_of_diluted_wines|          proline|            target|
+-------+------------------+------------------+------------------+-----------------+------------------+------------------+------------------+--------------------+------------------+-----------------+-------------------+----------------------------+-----------------+------------------+
|  count|               178|               178|               178|              178|               178|               178|               178| 

## Add Column

In [14]:
# Derive `double_alcohol` as twice the `alcohol` column and attach it to the frame.
doubled_alcohol = df['alcohol'] * 2
df = df.withColumn('double_alcohol', doubled_alcohol)

In [15]:
# Check the schema: `double_alcohol` should now be listed as the last column.
df.printSchema()

root
 |-- alcohol: double (nullable = true)
 |-- malic_acid: double (nullable = true)
 |-- ash: double (nullable = true)
 |-- alcalinity_of_ash: double (nullable = true)
 |-- magnesium: double (nullable = true)
 |-- total_phenols: double (nullable = true)
 |-- flavanoids: double (nullable = true)
 |-- nonflavanoid_phenols: double (nullable = true)
 |-- proanthocyanins: double (nullable = true)
 |-- color_intensity: double (nullable = true)
 |-- hue: double (nullable = true)
 |-- od280/od315_of_diluted_wines: double (nullable = true)
 |-- proline: double (nullable = true)
 |-- target: integer (nullable = true)
 |-- double_alcohol: double (nullable = true)



## Drop Column

In [16]:
# Remove the `double_alcohol` column from the frame.
df = df.drop('double_alcohol')

In [17]:
# Check the schema: `double_alcohol` should no longer be listed.
df.printSchema()

root
 |-- alcohol: double (nullable = true)
 |-- malic_acid: double (nullable = true)
 |-- ash: double (nullable = true)
 |-- alcalinity_of_ash: double (nullable = true)
 |-- magnesium: double (nullable = true)
 |-- total_phenols: double (nullable = true)
 |-- flavanoids: double (nullable = true)
 |-- nonflavanoid_phenols: double (nullable = true)
 |-- proanthocyanins: double (nullable = true)
 |-- color_intensity: double (nullable = true)
 |-- hue: double (nullable = true)
 |-- od280/od315_of_diluted_wines: double (nullable = true)
 |-- proline: double (nullable = true)
 |-- target: integer (nullable = true)



## Rename Column

In [18]:
# Rename the `alcohol` column to its Korean name; all other columns are untouched.
df = df.withColumnRenamed(existing='alcohol', new='알코올')

In [19]:
# Check the schema: the first column should now appear under its new name.
df.printSchema()

root
 |-- 알코올: double (nullable = true)
 |-- malic_acid: double (nullable = true)
 |-- ash: double (nullable = true)
 |-- alcalinity_of_ash: double (nullable = true)
 |-- magnesium: double (nullable = true)
 |-- total_phenols: double (nullable = true)
 |-- flavanoids: double (nullable = true)
 |-- nonflavanoid_phenols: double (nullable = true)
 |-- proanthocyanins: double (nullable = true)
 |-- color_intensity: double (nullable = true)
 |-- hue: double (nullable = true)
 |-- od280/od315_of_diluted_wines: double (nullable = true)
 |-- proline: double (nullable = true)
 |-- target: integer (nullable = true)



## Handling NA

In [20]:
# Drop only the rows in which every column is null.
df = df.dropna(how='all')

In [21]:
# Drop every row that contains at least one null value.
df = df.dropna(how='any')

In [22]:
# Replace nulls with the given value.
# NOTE(review): every column in the schema printed above is double or integer, and
# per the PySpark docs a string fill value is applied only to string columns, so
# this call appears to change nothing here — confirm the intended fill value.
df = df.na.fill("^")

In [23]:
from pyspark.ml.feature import Imputer

# Mean-strategy imputer: reads the '알코올' column and writes the result to a new
# 'inputed_alcohol' column.
imputer = Imputer(
    strategy='mean',
    inputCols=['알코올'],
    outputCols=['inputed_alcohol'],
)

In [24]:
# Fit the imputer on the frame, then apply the fitted model to the same frame.
imputer_model = imputer.fit(df)
df = imputer_model.transform(df)

## Filter

In [25]:
# Keep the rows of class 1 using a SQL-string predicate, then preview '알코올'.
class_one = df.filter("target == 1")
class_one.select("알코올").show(5)

+------+
|알코올|
+------+
| 12.37|
| 12.33|
| 12.64|
| 13.67|
| 12.37|
+------+
only showing top 5 rows



In [26]:
# Same kind of filter, written as a column expression instead of a SQL string.
is_class_two = df['target'] == 2
df.filter(is_class_two).show(5)

+------+----------+----+-----------------+---------+-------------+----------+--------------------+---------------+---------------+----+----------------------------+-------+------+---------------+
|알코올|malic_acid| ash|alcalinity_of_ash|magnesium|total_phenols|flavanoids|nonflavanoid_phenols|proanthocyanins|color_intensity| hue|od280/od315_of_diluted_wines|proline|target|inputed_alcohol|
+------+----------+----+-----------------+---------+-------------+----------+--------------------+---------------+---------------+----+----------------------------+-------+------+---------------+
| 12.86|      1.35|2.32|             18.0|    122.0|         1.51|      1.25|                0.21|           0.94|            4.1|0.76|                        1.29|  630.0|     2|          12.86|
| 12.88|      2.99| 2.4|             20.0|    104.0|          1.3|      1.22|                0.24|           0.83|            5.4|0.74|                        1.42|  530.0|     2|          12.88|
| 12.81|      2.31| 2.4

In [27]:
# Combine two column predicates with `&`; naming each comparison first avoids the
# parentheses that operator precedence would otherwise require.
high_alcohol = df['알코올'] > 12
low_malic_acid = df['malic_acid'] < 2
df.filter(high_alcohol & low_malic_acid).show(5)

+------+----------+----+-----------------+---------+-------------+----------+--------------------+---------------+---------------+----+----------------------------+-------+------+---------------+
|알코올|malic_acid| ash|alcalinity_of_ash|magnesium|total_phenols|flavanoids|nonflavanoid_phenols|proanthocyanins|color_intensity| hue|od280/od315_of_diluted_wines|proline|target|inputed_alcohol|
+------+----------+----+-----------------+---------+-------------+----------+--------------------+---------------+---------------+----+----------------------------+-------+------+---------------+
| 14.23|      1.71|2.43|             15.6|    127.0|          2.8|      3.06|                0.28|           2.29|           5.64|1.04|                        3.92| 1065.0|     0|          14.23|
|  13.2|      1.78|2.14|             11.2|    100.0|         2.65|      2.76|                0.26|           1.28|           4.38|1.05|                         3.4| 1050.0|     0|           13.2|
| 14.37|      1.95| 2.5

## Groupby

In [28]:
# Per-class average of every numeric column (the result columns are named avg(...)).
per_class = df.groupBy('target')
per_class.avg().show()

+------+------------------+------------------+------------------+----------------------+------------------+------------------+------------------+-------------------------+--------------------+--------------------+------------------+---------------------------------+------------------+-----------+--------------------+
|target|       avg(알코올)|   avg(malic_acid)|          avg(ash)|avg(alcalinity_of_ash)|    avg(magnesium)|avg(total_phenols)|   avg(flavanoids)|avg(nonflavanoid_phenols)|avg(proanthocyanins)|avg(color_intensity)|          avg(hue)|avg(od280/od315_of_diluted_wines)|      avg(proline)|avg(target)|avg(inputed_alcohol)|
+------+------------------+------------------+------------------+----------------------+------------------+------------------+------------------+-------------------------+--------------------+--------------------+------------------+---------------------------------+------------------+-----------+--------------------+
|     1|12.278732394366198| 1.932676056338028|