### Topics
* PySpark DataFrame
* Reading the Dataset
* Datatypes of the columns
* Selecting Columns & Indexing
* Describe option similar to pandas
* Adding Columns
* Dropping Columns
* Renaming Columns

In [2]:
from pyspark.sql import SparkSession

In [5]:
# Create the notebook's Spark session (getOrCreate hands back an existing one if present).
spark = (
    SparkSession.builder
    .appName('DataFrame')
    .getOrCreate()
)

# Bare expression so the notebook renders the session object as the cell output.
spark

In [9]:
# Note: when reading CSV, Spark treats every column as a string by default.
# Enable the "inferSchema" option to have Spark detect the column datatypes
# from the data instead.

# read the dataset - way no. 01: configure the reader with chained .option() calls
dataset = (
    spark.read
    .option('header', 'true')        # first line of the file holds the column names
    .option('inferSchema', 'true')   # detect integer columns instead of all-string columns
    .csv('test_dataset.csv')
)

dataset.show()

+------+---+----------+
|  Name|Age|Experience|
+------+---+----------+
| Tarek| 23|         5|
| Forid| 24|         8|
| Ridoy| 24|        10|
| Imran| 25|         7|
|Saiful| 27|         4|
+------+---+----------+



In [10]:
# check the schema: column names, datatypes and nullability
dataset.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [28]:
# read the dataset - way no. 02: pass the CSV settings directly to csv() as keyword arguments
dataset = spark.read.csv(
    'test_dataset.csv',
    header=True,
    inferSchema=True,
)

# inspect the result: rows, schema, and the Python type of the object
dataset.show()
dataset.printSchema()
print(type(dataset))

+------+---+----------+
|  Name|Age|Experience|
+------+---+----------+
| Tarek| 23|         5|
| Forid| 24|         8|
| Ridoy| 24|        10|
| Imran| 25|         7|
|Saiful| 27|         4|
+------+---+----------+

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)

<class 'pyspark.sql.dataframe.DataFrame'>


In [16]:
# list the column names (a plain Python list of strings)
dataset.columns

['Name', 'Age', 'Experience']

In [17]:
# fetch the first 3 rows; head(n) returns them as a list of Row objects
dataset.head(3)

[Row(Name='Tarek', Age=23, Experience=5),
 Row(Name='Forid', Age=24, Experience=8),
 Row(Name='Ridoy', Age=24, Experience=10)]

In [18]:
# print the first 3 rows as a formatted table
dataset.show(3)

+-----+---+----------+
| Name|Age|Experience|
+-----+---+----------+
|Tarek| 23|         5|
|Forid| 24|         8|
|Ridoy| 24|        10|
+-----+---+----------+
only showing top 3 rows



In [19]:
# select a single column by name and display its values
dataset.select('Name').show()

+------+
|  Name|
+------+
| Tarek|
| Forid|
| Ridoy|
| Imran|
|Saiful|
+------+



In [20]:
# select several columns at once by passing each column name as its own argument
dataset.select('Name', 'Experience').show()

+------+----------+
|  Name|Experience|
+------+----------+
| Tarek|         5|
| Forid|         8|
| Ridoy|        10|
| Imran|         7|
|Saiful|         4|
+------+----------+



In [22]:
# datatypes: list of (column name, type name) pairs
dataset.dtypes

[('Name', 'string'), ('Age', 'int'), ('Experience', 'int')]

In [24]:
# summary statistics (count, mean, stddev, min, max) for each column
dataset.describe().show()

+-------+-----+----------------+------------------+
|summary| Name|             Age|        Experience|
+-------+-----+----------------+------------------+
|  count|    5|               5|                 5|
|   mean| null|            24.6|               6.8|
| stddev| null|1.51657508881031|2.3874672772626644|
|    min|Forid|              23|                 4|
|    max|Tarek|              27|                10|
+-------+-----+----------------+------------------+



In [36]:
# adding columns: withColumn returns a new DataFrame, so `dataset` itself is unchanged
dataset2 = dataset.withColumn(
    'Experience After 2 years',   # name of the new column
    dataset['Experience'] + 2,    # value: existing experience plus 2
)

# show the rows and confirm the new column's datatype
dataset2.show()
dataset2.printSchema()

+------+---+----------+------------------------+
|  Name|Age|Experience|Experience After 2 years|
+------+---+----------+------------------------+
| Tarek| 23|         5|                       7|
| Forid| 24|         8|                      10|
| Ridoy| 24|        10|                      12|
| Imran| 25|         7|                       9|
|Saiful| 27|         4|                       6|
+------+---+----------+------------------------+

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Experience After 2 years: integer (nullable = true)



In [37]:
# Drop the column added above; drop() returns a new DataFrame without it
dataset3 = dataset2.drop('Experience After 2 years')
dataset3.show()

+------+---+----------+
|  Name|Age|Experience|
+------+---+----------+
| Tarek| 23|         5|
| Forid| 24|         8|
| Ridoy| 24|        10|
| Imran| 25|         7|
|Saiful| 27|         4|
+------+---+----------+



In [40]:
# Rename a column: 'Name' becomes 'New Names'; the other columns are untouched
dataset4 = dataset.withColumnRenamed('Name', 'New Names')
dataset4.show()

+---------+---+----------+
|New Names|Age|Experience|
+---------+---+----------+
|    Tarek| 23|         5|
|    Forid| 24|         8|
|    Ridoy| 24|        10|
|    Imran| 25|         7|
|   Saiful| 27|         4|
+---------+---+----------+

