In [1]:
import pyspark
import pandas as pd
from pyspark.sql import functions as F

In [2]:
# Create the SparkSession used by every later cell (or reuse one that is
# already active, via getOrCreate) under the application name 'Demo'.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName('Demo')
    .getOrCreate()
)

In [3]:
# Echo the session object as the cell's last expression so the notebook
# renders its repr.
spark

In [4]:
# Load train.csv with the reader's default options. No header option is set,
# so the columns get generated names (_c0, _c1, ...) and the file's header
# line is kept as an ordinary data row.
df_pyspark = spark.read.csv(path='train.csv')

In [5]:
# Print the first rows as a text table. n=20 and truncate=True are show()'s
# defaults, spelled out here: at most 20 rows, long cell values cut short.
df_pyspark.show(n=20, truncate=True)

+---------------+-----+--------------+-------+------------------+--------------------+------+
|            _c0|  _c1|           _c2|    _c3|               _c4|                 _c5|   _c6|
+---------------+-----+--------------+-------+------------------+--------------------+------+
|         row_id|cfips|        county|  state|first_day_of_month|microbusiness_den...|active|
|1001_2019-08-01| 1001|Autauga County|Alabama|        2019-08-01|           3.0076818|  1249|
|1001_2019-09-01| 1001|Autauga County|Alabama|        2019-09-01|           2.8848701|  1198|
|1001_2019-10-01| 1001|Autauga County|Alabama|        2019-10-01|           3.0558431|  1269|
|1001_2019-11-01| 1001|Autauga County|Alabama|        2019-11-01|           2.9932332|  1243|
|1001_2019-12-01| 1001|Autauga County|Alabama|        2019-12-01|           2.9932332|  1243|
|1001_2020-01-01| 1001|Autauga County|Alabama|        2020-01-01|             2.96909|  1242|
|1001_2020-02-01| 1001|Autauga County|Alabama|        2020-0

In [6]:
# Re-read the file, this time treating the first line as the column names
# instead of as data.
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv('train.csv')
)

In [7]:
# Print the first rows again; the columns now carry the names taken from
# the CSV header. n=20 and truncate=True are show()'s defaults, made explicit.
df_pyspark.show(n=20, truncate=True)

+---------------+-----+--------------+-------+------------------+---------------------+------+
|         row_id|cfips|        county|  state|first_day_of_month|microbusiness_density|active|
+---------------+-----+--------------+-------+------------------+---------------------+------+
|1001_2019-08-01| 1001|Autauga County|Alabama|        2019-08-01|            3.0076818|  1249|
|1001_2019-09-01| 1001|Autauga County|Alabama|        2019-09-01|            2.8848701|  1198|
|1001_2019-10-01| 1001|Autauga County|Alabama|        2019-10-01|            3.0558431|  1269|
|1001_2019-11-01| 1001|Autauga County|Alabama|        2019-11-01|            2.9932332|  1243|
|1001_2019-12-01| 1001|Autauga County|Alabama|        2019-12-01|            2.9932332|  1243|
|1001_2020-01-01| 1001|Autauga County|Alabama|        2020-01-01|              2.96909|  1242|
|1001_2020-02-01| 1001|Autauga County|Alabama|        2020-02-01|            2.9093256|  1217|
|1001_2020-03-01| 1001|Autauga County|Alabama|    

In [8]:
# Print each column's name, type and nullability as a tree (a rough PySpark
# counterpart of pandas' df.info()). The frame was read without inferSchema,
# so every column is reported as string here.
df_pyspark.printSchema()

root
 |-- row_id: string (nullable = true)
 |-- cfips: string (nullable = true)
 |-- county: string (nullable = true)
 |-- state: string (nullable = true)
 |-- first_day_of_month: string (nullable = true)
 |-- microbusiness_density: string (nullable = true)
 |-- active: string (nullable = true)



In [45]:
# Read the CSV once more with header=True and inferSchema=True, so Spark
# takes the column names from the first line and derives each column's type
# from the data instead of reading everything as string.
df_pyspark = spark.read.csv(
    'train.csv',
    header=True,
    inferSchema=True,
)

In [46]:
# Print the schema again (names, types, nullability) to check the effect of
# inferSchema=True: columns are no longer all reported as string.
df_pyspark.printSchema()

root
 |-- row_id: string (nullable = true)
 |-- cfips: integer (nullable = true)
 |-- county: string (nullable = true)
 |-- state: string (nullable = true)
 |-- first_day_of_month: timestamp (nullable = true)
 |-- microbusiness_density: double (nullable = true)
 |-- active: integer (nullable = true)



In [47]:
# Project the frame down to two columns. There is no action such as .show()
# here, so the cell only echoes the resulting DataFrame's repr, not its rows.
df_pyspark.select('state', 'first_day_of_month')

DataFrame[state: string, first_day_of_month: timestamp]

In [48]:
# Check what select() hands back: the result is itself a DataFrame,
# not a list of column names.
type(df_pyspark.select('state', 'first_day_of_month'))

pyspark.sql.dataframe.DataFrame

In [49]:
# Display the rows of just the two selected columns.
(
    df_pyspark
    .select('state', 'first_day_of_month')
    .show()
)

+-------+-------------------+
|  state| first_day_of_month|
+-------+-------------------+
|Alabama|2019-08-01 00:00:00|
|Alabama|2019-09-01 00:00:00|
|Alabama|2019-10-01 00:00:00|
|Alabama|2019-11-01 00:00:00|
|Alabama|2019-12-01 00:00:00|
|Alabama|2020-01-01 00:00:00|
|Alabama|2020-02-01 00:00:00|
|Alabama|2020-03-01 00:00:00|
|Alabama|2020-04-01 00:00:00|
|Alabama|2020-05-01 00:00:00|
|Alabama|2020-06-01 00:00:00|
|Alabama|2020-07-01 00:00:00|
|Alabama|2020-08-01 00:00:00|
|Alabama|2020-09-01 00:00:00|
|Alabama|2020-10-01 00:00:00|
|Alabama|2020-11-01 00:00:00|
|Alabama|2020-12-01 00:00:00|
|Alabama|2021-01-01 00:00:00|
|Alabama|2021-02-01 00:00:00|
|Alabama|2021-03-01 00:00:00|
+-------+-------------------+
only showing top 20 rows



In [50]:
# Summary statistics (count, mean, stddev, min, max) per column.
# The timestamp column first_day_of_month is not part of the summary.
(
    df_pyspark
    .describe()
    .show()
)

+-------+----------------+------------------+----------------+-------+---------------------+-----------------+
|summary|          row_id|             cfips|          county|  state|microbusiness_density|           active|
+-------+----------------+------------------+----------------+-------+---------------------+-----------------+
|  count|          122265|            122265|          122265| 122265|               122265|           122265|
|   mean|            null| 30376.03763955343|            null|   null|   3.8176705691469994|6442.858217805586|
| stddev|            null|15143.508721389686|            null|   null|    4.991086802428138|33040.01249296402|
|    min|10001_2019-08-01|              1001|Abbeville County|Alabama|                  0.0|                0|
|    max| 9015_2022-10-01|             56045|  Ziebach County|Wyoming|            284.34003|          1167744|
+-------+----------------+------------------+----------------+-------+---------------------+-----------------+



In [54]:
# Append a "next_month" column holding first_day_of_month shifted forward by
# months_to_add calendar months. The new column shows as a plain date
# (no time part) even though the source column is a timestamp.
months_to_add = 1
df_pyspark = df_pyspark.withColumn(
    "next_month",
    F.add_months(F.col("first_day_of_month"), months_to_add),
)

In [55]:
# Print the first rows to inspect the newly added next_month column.
# n=20 and truncate=True are show()'s defaults, made explicit.
df_pyspark.show(n=20, truncate=True)

+---------------+-----+--------------+-------+-------------------+---------------------+------+----------+
|         row_id|cfips|        county|  state| first_day_of_month|microbusiness_density|active|next_month|
+---------------+-----+--------------+-------+-------------------+---------------------+------+----------+
|1001_2019-08-01| 1001|Autauga County|Alabama|2019-08-01 00:00:00|            3.0076818|  1249|2019-09-01|
|1001_2019-09-01| 1001|Autauga County|Alabama|2019-09-01 00:00:00|            2.8848701|  1198|2019-10-01|
|1001_2019-10-01| 1001|Autauga County|Alabama|2019-10-01 00:00:00|            3.0558431|  1269|2019-11-01|
|1001_2019-11-01| 1001|Autauga County|Alabama|2019-11-01 00:00:00|            2.9932332|  1243|2019-12-01|
|1001_2019-12-01| 1001|Autauga County|Alabama|2019-12-01 00:00:00|            2.9932332|  1243|2020-01-01|
|1001_2020-01-01| 1001|Autauga County|Alabama|2020-01-01 00:00:00|              2.96909|  1242|2020-02-01|
|1001_2020-02-01| 1001|Autauga County

In [58]:
# Show the frame without the next_month column. The result is not assigned
# back, so df_pyspark itself still contains the column afterwards.
(
    df_pyspark
    .drop('next_month')
    .show()
)

+---------------+-----+--------------+-------+-------------------+---------------------+------+
|         row_id|cfips|        county|  state| first_day_of_month|microbusiness_density|active|
+---------------+-----+--------------+-------+-------------------+---------------------+------+
|1001_2019-08-01| 1001|Autauga County|Alabama|2019-08-01 00:00:00|            3.0076818|  1249|
|1001_2019-09-01| 1001|Autauga County|Alabama|2019-09-01 00:00:00|            2.8848701|  1198|
|1001_2019-10-01| 1001|Autauga County|Alabama|2019-10-01 00:00:00|            3.0558431|  1269|
|1001_2019-11-01| 1001|Autauga County|Alabama|2019-11-01 00:00:00|            2.9932332|  1243|
|1001_2019-12-01| 1001|Autauga County|Alabama|2019-12-01 00:00:00|            2.9932332|  1243|
|1001_2020-01-01| 1001|Autauga County|Alabama|2020-01-01 00:00:00|              2.96909|  1242|
|1001_2020-02-01| 1001|Autauga County|Alabama|2020-02-01 00:00:00|            2.9093256|  1217|
|1001_2020-03-01| 1001|Autauga County|Al

In [60]:
# Show the frame with row_id renamed to id. The result is not assigned
# back, so df_pyspark keeps the original column name.
(
    df_pyspark
    .withColumnRenamed('row_id', 'id')
    .show()
)

+---------------+-----+--------------+-------+-------------------+---------------------+------+----------+
|             id|cfips|        county|  state| first_day_of_month|microbusiness_density|active|next_month|
+---------------+-----+--------------+-------+-------------------+---------------------+------+----------+
|1001_2019-08-01| 1001|Autauga County|Alabama|2019-08-01 00:00:00|            3.0076818|  1249|2019-09-01|
|1001_2019-09-01| 1001|Autauga County|Alabama|2019-09-01 00:00:00|            2.8848701|  1198|2019-10-01|
|1001_2019-10-01| 1001|Autauga County|Alabama|2019-10-01 00:00:00|            3.0558431|  1269|2019-11-01|
|1001_2019-11-01| 1001|Autauga County|Alabama|2019-11-01 00:00:00|            2.9932332|  1243|2019-12-01|
|1001_2019-12-01| 1001|Autauga County|Alabama|2019-12-01 00:00:00|            2.9932332|  1243|2020-01-01|
|1001_2020-01-01| 1001|Autauga County|Alabama|2020-01-01 00:00:00|              2.96909|  1242|2020-02-01|
|1001_2020-02-01| 1001|Autauga County