# Introduction to Spark

### Importing the necessary libraries

In [1]:
import os
import re
import json

import warnings
from pprint import pprint

# Data manipulation libraries
import numpy as np
import pandas as pd

# Spark
from pyspark.sql import SparkSession

...

Ellipsis

### Setup and configuration

In [2]:
# Build the notebook's SparkSession, named 'Test0'; getOrCreate() reuses an
# already-running session if one exists instead of starting a second one.
spark = (
    SparkSession.builder
    .appName('Test0')
    .getOrCreate()
)

In [3]:
# Evaluate the session as the cell's last expression so the notebook displays it.
spark

## DQL (Data Query Language)

In [61]:
# Location of the sample CSV, relative to the notebook's working directory.
filepath = r"./datasets/Sample1.csv"

# Read the CSV treating the first line as the header and letting Spark infer
# column types, then print the resulting rows.
df = (
    spark.read
    .option('header', 'true')
    .csv(filepath, inferSchema=True)
)
df.show()

+---------+---+----------+
|     Name|Age|Experience|
+---------+---+----------+
|  Himmler| 23|         2|
|  Hermann| 22|         1|
|     Hans| 21|         0|
|Hellstrom| 25|         3|
+---------+---+----------+



In [32]:
# Show the concrete class of the object returned by the CSV reader.
type(df)

pyspark.sql.classic.dataframe.DataFrame

### Fetching records

In [33]:
# head(5) returns up to five Row objects; each row is unpacked directly into
# its three column values (Name, Age, Experience) and printed comma-separated.
for name, age, exp in df.head(5):
    print(f"{name}, {age}, {exp}")

Himmler, 23, 2
Hermann, 22, 1
Hans, 21, 0
Hellstrom, 25, 3


### Visualizing schema

In [35]:
# Print the schema as a tree: one line per column with its type and nullability.
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [36]:
# List the column names of the DataFrame.
df.columns

['Name', 'Age', 'Experience']

### Selecting columns

In [41]:
# Project each column on its own and print it as a one-column table.
for col_name in df.columns:
    single_column_df = df.select(col_name)
    single_column_df.show()

+---------+
|     Name|
+---------+
|  Himmler|
|  Hermann|
|     Hans|
|Hellstrom|
+---------+

+---+
|Age|
+---+
| 23|
| 22|
| 21|
| 25|
+---+

+----------+
|Experience|
+----------+
|         2|
|         1|
|         0|
|         3|
+----------+



In [43]:
# List (column name, type name) pairs for every column.
df.dtypes

[('Name', 'string'), ('Age', 'int'), ('Experience', 'int')]

In [45]:
# Compute summary statistics (count, mean, stddev, min, max) for every column
# and print them as a table.
summary_df = df.describe()
summary_df.show()

+-------+-------+-----------------+------------------+
|summary|   Name|              Age|        Experience|
+-------+-------+-----------------+------------------+
|  count|      4|                4|                 4|
|   mean|   NULL|            22.75|               1.5|
| stddev|   NULL|1.707825127659933|1.2909944487358056|
|    min|   Hans|               21|                 0|
|    max|Himmler|               25|                 3|
+-------+-------+-----------------+------------------+



## DML (Data Manipulation Language)

### Adding columns with operations
> Adding a new column, **`Experience After 2 Years`**, that holds each `Experience` value plus **2 years**

In [56]:
# withColumn returns a new DataFrame; `df` itself is left unchanged.
# The new column is computed row-wise as Experience + 2.
df_exp_2 = df.withColumn(
    'Experience After 2 Years',
    df['Experience'] + 2,
)
df_exp_2.show()

+---------+---+----------+------------------------+
|     Name|Age|Experience|Experience After 2 Years|
+---------+---+----------+------------------------+
|  Himmler| 23|         2|                       4|
|  Hermann| 22|         1|                       3|
|     Hans| 21|         0|                       2|
|Hellstrom| 25|         3|                       5|
+---------+---+----------+------------------------+



### Dropping columns
> Dropping the **`Experience After 2 Years`** column

In [55]:
# drop() returns a new DataFrame, so the result is bound to its own name
# rather than overwriting df_exp_2. This keeps the cell idempotent on re-run
# and leaves df_exp_2 (which still carries the added column) intact for the
# cells that follow.
df_exp_dropped = df_exp_2.drop('Experience After 2 Years')
df_exp_dropped.show()

+---------+---+----------+
|     Name|Age|Experience|
+---------+---+----------+
|  Himmler| 23|         2|
|  Hermann| 22|         1|
|     Hans| 21|         0|
|Hellstrom| 25|         3|
+---------+---+----------+



### Renaming columns

In [60]:
# Rename the 'Name' column to 'First Name'. withColumnRenamed returns a new
# DataFrame, which is rebound to df_exp_2 and then printed.
df_exp_2 = df_exp_2.withColumnRenamed(
    existing='Name',
    new='First Name',
)
df_exp_2.show()

+----------+---+----------+------------------------+
|First Name|Age|Experience|Experience After 2 Years|
+----------+---+----------+------------------------+
|   Himmler| 23|         2|                       4|
|   Hermann| 22|         1|                       3|
|      Hans| 21|         0|                       2|
| Hellstrom| 25|         3|                       5|
+----------+---+----------+------------------------+



In [59]:
# TODO: TBD