In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
from pyspark.sql.functions import split
from pyspark.sql.functions import *

# Obtain the Spark session for this notebook (reuses an existing one if present,
# otherwise creates it) under the application name "Session5".
spark = (
    SparkSession.builder
    .appName("Session5")
    .getOrCreate()
)

In [2]:
# Load the train schedule records from JSON with the reader's "multiline"
# option switched on, then preview the first 4 rows.
df = (
    spark.read
    .option("multiline", "true")
    .json("train_schedules.json")
)
df.show(4)

+-------+---+---------+------+------------+-------------------+--------------------+------------+
|arrival|day|departure|    id|station_code|       station_name|          train_name|train_number|
+-------+---+---------+------+------------+-------------------+--------------------+------------+
|   None|  1| 07:55:00|302214|          FM|KACHEGUDA FALAKNUMA|Falaknuma Lingamp...|       47154|
|   None|  1| 18:55:00|281458|         TCR|            THRISUR|Thrissur Guruvayu...|       56044|
|   None|  1| 15:05:00|309335|         PBR|          PORBANDAR|Porbandar Muzaffa...|       19269|
|   None|  1| 13:30:00|283774|           R|          RAIPUR JN|  RAIPUR ITWARI PASS|       58205|
+-------+---+---------+------+------------+-------------------+--------------------+------------+
only showing top 4 rows



In [3]:
# Column selection, variant 1: refer to columns by their names as plain strings.
df.select(
    "train_number",
    "station_code",
    "departure",
).show(4)

+------------+------------+---------+
|train_number|station_code|departure|
+------------+------------+---------+
|       47154|          FM| 07:55:00|
|       56044|         TCR| 18:55:00|
|       19269|         PBR| 15:05:00|
|       58205|           R| 13:30:00|
+------------+------------+---------+
only showing top 4 rows



In [98]:
# Column selection, variant 2: refer to columns as attributes of the DataFrame.
# The recorded output is the same as for the string-name variant.
df.select(df.train_number, df.station_code, df.departure).show(4)

+------------+------------+---------+
|train_number|station_code|departure|
+------------+------------+---------+
|       47154|          FM| 07:55:00|
|       56044|         TCR| 18:55:00|
|       19269|         PBR| 15:05:00|
|       58205|           R| 13:30:00|
+------------+------------+---------+
only showing top 4 rows



In [4]:
# Column selection, variant 3: build column expressions with col().
# NOTE(review): station_code is selected twice, so the result has two columns
# with the same name; the earlier variants select departure as the third
# column - confirm whether that was intended here as well.
df.select(
    col("train_number"),
    col("station_code"),
    col("station_code"),
).show(4)

+------------+------------+------------+
|train_number|station_code|station_code|
+------------+------------+------------+
|       47154|          FM|          FM|
|       56044|         TCR|         TCR|
|       19269|         PBR|         PBR|
|       58205|           R|           R|
+------------+------------+------------+
only showing top 4 rows



In [5]:
# Execute the same query using SQL: register the DataFrame as the temporary
# view "schedules" so it can be referenced by name, then project the columns
# and keep the first 4 rows.
# NOTE(review): station_code appears twice in the SELECT list, mirroring the
# col() cell before it - confirm whether the third column should be departure.
df.createOrReplaceTempView("schedules")
query= """
SELECT train_number, station_code, station_code
FROM schedules
LIMIT 4
"""
spark.sql(query).show()

+------------+------------+------------+
|train_number|station_code|station_code|
+------------+------------+------------+
|       47154|          FM|          FM|
|       56044|         TCR|         TCR|
|       19269|         PBR|         PBR|
|       58205|           R|           R|
+------------+------------+------------+



In [13]:
# List the databases known to this session's catalog (name, description, location).
spark.catalog.listDatabases()

[Database(name='default', description='default database', locationUri='file:/home/jovyan/work/spark-warehouse')]

In [14]:
# SQL counterpart of the catalog call: list the available databases.
spark.sql('show databases').show()

+---------+
|namespace|
+---------+
|  default|
+---------+



In [15]:
# Name of the database the session is currently using.
spark.catalog.currentDatabase()

'default'

In [16]:
# List tables and views visible from the 'default' database. The recorded output
# contains the temporary view "schedules", reported with database=None.
spark.catalog.listTables('default')

[Table(name='schedules', database=None, description=None, tableType='TEMPORARY', isTemporary=True)]

In [17]:
# Same listing for the 'global_temp' database. The recorded output again shows
# the session's temporary view "schedules" (isTemporary=True, database=None),
# i.e. the temporary view is reported regardless of the database asked for.
spark.catalog.listTables('global_temp')

[Table(name='schedules', database=None, description=None, tableType='TEMPORARY', isTemporary=True)]

In [None]:
#spark.sql('create database freblogg')

In [18]:
# SQL counterpart of listTables('default'); the temporary view is listed with
# an empty database column.
spark.sql('show tables from default').show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
|        |schedules|       true|
+--------+---------+-----------+



In [19]:
# SQL counterpart of listTables('global_temp'); the recorded output matches the
# listing for 'default' (only the temporary view "schedules").
spark.sql('show tables from global_temp').show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
|        |schedules|       true|
+--------+---------+-----------+



In [6]:
# Print the schema as a tree: column name, data type and nullability.
df.printSchema()

root
 |-- arrival: string (nullable = true)
 |-- day: long (nullable = true)
 |-- departure: string (nullable = true)
 |-- id: long (nullable = true)
 |-- station_code: string (nullable = true)
 |-- station_name: string (nullable = true)
 |-- train_name: string (nullable = true)
 |-- train_number: string (nullable = true)



In [7]:
# Column names of the DataFrame as a plain Python list.
df.columns

['arrival',
 'day',
 'departure',
 'id',
 'station_code',
 'station_name',
 'train_name',
 'train_number']

In [8]:
# Count schedule rows per station name and show the first 6 stations in
# ascending name order. The recorded output includes a blank station name.
(
    df.groupBy("station_name")
    .count()
    .orderBy("station_name")
    .show(6)
)

+------------------+-----+
|      station_name|count|
+------------------+-----+
|                  |    2|
|A-CABIN BONDAMUNDA|   48|
|             ABADA|  182|
|          ABHAIPUR|   56|
|  ABHAYAPURI ASSAM|   32|
|          ABJUGANJ|   14|
+------------------+-----+
only showing top 6 rows



In [9]:
# (column name, type name) pairs for every column.
df.dtypes

[('arrival', 'string'),
 ('day', 'bigint'),
 ('departure', 'string'),
 ('id', 'bigint'),
 ('station_code', 'string'),
 ('station_name', 'string'),
 ('train_name', 'string'),
 ('train_number', 'string')]

In [75]:
# Replace the string columns "departure" and "arrival" with timestamp columns.
# NOTE(review): the source values are time-of-day only (e.g. 07:55:00), and a
# later recorded output shows them rendered with a calendar date
# (2021-04-26 ...), so the result appears to depend on the run date - confirm.
df = (
    df
    .withColumn("departure", to_timestamp("departure"))
    .withColumn("arrival", to_timestamp("arrival"))
)

In [10]:
# Register (or replace) the temporary view "schedules" backed by the current
# DataFrame, then list the view's columns and their data types.
df.createOrReplaceTempView("schedules")
spark.sql("DESCRIBE schedules").show()

+------------+---------+-------+
|    col_name|data_type|comment|
+------------+---------+-------+
|     arrival|   string|   null|
|         day|   bigint|   null|
|   departure|   string|   null|
|          id|   bigint|   null|
|station_code|   string|   null|
|station_name|   string|   null|
|  train_name|   string|   null|
|train_number|   string|   null|
+------------+---------+-------+



In [11]:
# Window functions over the stops of train 12301:
#   - row_number:       sequential number assigned by ROW_NUMBER()
#   - upcoming_arrival: the departure value of the next row, via LEAD(departure, 1)
# NOTE(review): the WHERE clause keeps a single train_number, so
# ORDER BY train_number ties on every row; the row order seen by
# ROW_NUMBER/LEAD is then presumably not guaranteed - consider ordering by a
# column that follows the route (verify against the data).
query= """
SELECT train_number, station_code , station_name, departure, ROW_NUMBER() OVER (ORDER BY train_number) AS row_number, 
        LEAD(departure, 1) OVER (ORDER BY train_number) AS upcoming_arrival
        FROM    schedules
        WHERE train_number= 12301

"""
spark.sql(query).show(20)

+------------+------------+-----------------+---------+----------+----------------+
|train_number|station_code|     station_name|departure|row_number|upcoming_arrival|
+------------+------------+-----------------+---------+----------+----------------+
|       12301|         HWH|        HOWRAH JN| 16:55:00|         1|        16:58:00|
|       12301|         LLH|           LILUAH| 16:58:00|         2|        17:00:00|
|       12301|         BEQ|            BELUR| 17:00:00|         3|        17:01:00|
|       12301|         BLY|            BALLY| 17:01:00|         4|        17:03:00|
|       12301|         BZL|        BELANAGAR| 17:03:00|         5|        17:05:00|
|       12301|        DKAE|          DANKUNI| 17:05:00|         6|        17:07:00|
|       12301|        GBRA|            GOBRA| 17:07:00|         7|        17:10:00|
|       12301|         JOX|       JANAI ROAD| 17:10:00|         8|        17:11:00|
|       12301|        BPAE|         BEGUMPUR| 17:11:00|         9|        17

In [89]:
# Departure as Unix epoch seconds, plus the next row's departure (LEAD) as the
# upcoming arrival time, for train 12301.
#
# UNIX_TIMESTAMP is called without a format argument. The recorded output shows
# timestamp values, so departure is assumed to already be a timestamp column
# here and no parsing pattern is needed. The previous pattern 'Yyyy-mm-dd' was
# also wrong as a date pattern: 'Y' is week-based year and 'mm' is
# minute-of-hour (a calendar date would be 'yyyy-MM-dd').
# The column is aliased so the result has a readable header.
query= """
SELECT train_number, station_code , UNIX_TIMESTAMP(departure) AS departure_epoch,
        LEAD(departure, 1) OVER (ORDER BY train_number) AS upcoming_arrival
        FROM    schedules
        WHERE train_number= 12301

"""
spark.sql(query).show(4)

+------------+------------+-------------------------------------+-------------------+
|train_number|station_code|unix_timestamp(departure, Yyyy-mm-dd)|   upcoming_arrival|
+------------+------------+-------------------------------------+-------------------+
|       12301|         HWH|                           1619436300|2021-04-26 16:58:00|
|       12301|         LLH|                           1619436480|2021-04-26 17:00:00|
|       12301|         BEQ|                           1619436600|2021-04-26 17:01:00|
|       12301|         BLY|                           1619436660|2021-04-26 17:03:00|
+------------+------------+-------------------------------------+-------------------+
only showing top 4 rows



### Window Function

In [130]:
# OVER clause: number the rows of train 12301 with ROW_NUMBER().
# The view is re-registered first so the query runs against the current df.
# NOTE(review): every row kept by the WHERE clause has the same train_number,
# so ORDER BY train_number does not distinguish between rows; the numbering
# order is presumably not guaranteed - verify, or order by a route-related column.
df.createOrReplaceTempView("schedules")
query= """
SELECT train_number, station_code , departure, ROW_NUMBER() OVER (ORDER BY train_number) AS row_number
        FROM schedules
        WHERE train_number= 12301

"""
spark.sql(query).show(5)

+------------+------------+---------+----------+
|train_number|station_code|departure|row_number|
+------------+------------+---------+----------+
|       12301|         HWH| 16:55:00|         1|
|       12301|         LLH| 16:58:00|         2|
|       12301|         BEQ| 17:00:00|         3|
|       12301|         BLY| 17:01:00|         4|
|       12301|         BZL| 17:03:00|         5|
+------------+------------+---------+----------+
only showing top 5 rows



In [132]:
# LEAD clause: alongside the row number, expose the departure value of the
# following row (offset 1) as upcoming_arrival.
# NOTE(review): both windows order by train_number, which is constant after the
# WHERE filter, so which row counts as "next" is presumably not guaranteed -
# verify, or order by a column that follows the route.
query= """
SELECT train_number, station_code , departure, ROW_NUMBER() OVER (ORDER BY train_number) AS row_number,
        LEAD(departure, 1) OVER (ORDER BY train_number) AS upcoming_arrival
        FROM schedules
        WHERE train_number= 12301

"""
spark.sql(query).show(5)

+------------+------------+---------+----------+----------------+
|train_number|station_code|departure|row_number|upcoming_arrival|
+------------+------------+---------+----------+----------------+
|       12301|         HWH| 16:55:00|         1|        16:58:00|
|       12301|         LLH| 16:58:00|         2|        17:00:00|
|       12301|         BEQ| 17:00:00|         3|        17:01:00|
|       12301|         BLY| 17:01:00|         4|        17:03:00|
|       12301|         BZL| 17:03:00|         5|        17:05:00|
+------------+------------+---------+----------+----------------+
only showing top 5 rows



## Doing Basic Statistics

In [88]:
from pyspark.ml.stat import *
from pyspark.ml.linalg import Vectors
from pyspark.sql import Row
# Load the German credit data set from CSV (comma-separated, first line is the header).
# inferSchema=True makes Spark detect column types from the data; without it
# every column is read as a string, so min/max and similar statistics would be
# computed on text rather than on numbers.
credit = spark.read.csv(
    './data/german_credit.csv',
    sep=',',
    header=True,
    inferSchema=True,
)

In [89]:
# Summary statistics: describe() reports count, mean, stddev, min and max for
# each of the selected columns.
num_cols = ['Account Balance','No of dependents']
credit.select(num_cols).describe().show()

+-------+------------------+-------------------+
|summary|   Account Balance|   No of dependents|
+-------+------------------+-------------------+
|  count|              1000|               1000|
|   mean|             2.577|              1.155|
| stddev|1.2576377271108936|0.36208577175319395|
|    min|                 1|                  1|
|    max|                 4|                  2|
+-------+------------------+-------------------+



In [62]:
# Shape of the age distribution: skewness and kurtosis of the "Age (years)"
# column, each computed as an aggregate over the whole DataFrame.
from pyspark.sql.functions import col, skewness, kurtosis
credit.select(skewness("Age (years)"),kurtosis("Age (years)")).show()

+---------------------+---------------------+
|skewness(Age (years))|kurtosis(Age (years))|
+---------------------+---------------------+
|   1.0231743160548064|   0.6114371688367672|
+---------------------+---------------------+



In [None]:
# Correlation
# Compute Pearson and Spearman correlation matrices for a column of feature vectors.
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation

# Four 4-dimensional feature vectors, given in a mix of sparse and dense form.
data = [(Vectors.sparse(4, [(0, 1.0), (3, -2.0)]),),
        (Vectors.dense([4.0, 5.0, 0.0, 3.0]),),
        (Vectors.dense([6.0, 7.0, 0.0, 8.0]),),
        (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]),)]
# Use a dedicated name: rebinding `df` here would overwrite the train-schedule
# DataFrame that the earlier cells rely on.
features_df = spark.createDataFrame(data, ["features"])

# Correlation.corr returns a one-row DataFrame; head() yields that Row and
# element [0] is the correlation matrix. Pearson is used when no method is given.
r1 = Correlation.corr(features_df, "features").head()
print("Pearson correlation matrix:\n" + str(r1[0]))

r2 = Correlation.corr(features_df, "features", "spearman").head()
print("Spearman correlation matrix:\n" + str(r2[0]))

In [None]:
# Chi-Square Test
# Pearson's chi-squared independence test of every feature against the label.
# (This cell previously repeated the correlation example verbatim and did not
# run a chi-square test at all.)
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import ChiSquareTest

# (label, features) rows; both the label and the feature values are treated as
# categorical by the test.
chi_data = [(0.0, Vectors.dense(0.5, 10.0)),
            (0.0, Vectors.dense(1.5, 20.0)),
            (1.0, Vectors.dense(1.5, 30.0)),
            (0.0, Vectors.dense(3.5, 30.0)),
            (0.0, Vectors.dense(3.5, 40.0)),
            (1.0, Vectors.dense(3.5, 40.0))]
# Dedicated names keep the train-schedule DataFrame `df` intact.
chi_df = spark.createDataFrame(chi_data, ["label", "features"])

# ChiSquareTest.test returns a one-row DataFrame with one entry per feature in
# each of pValues, degreesOfFreedom and statistics.
chi_result = ChiSquareTest.test(chi_df, "features", "label").head()
print("pValues: " + str(chi_result.pValues))
print("degreesOfFreedom: " + str(chi_result.degreesOfFreedom))
print("statistics: " + str(chi_result.statistics))

In [None]:
# Chi-square test
# Kolmogorov-Smirnov Test
# Correlation
# Multivariate Gaussian