# Agenda

- InferSchema
- schema
    - printSchema
    - columns
- `describe().show()`
- Select specific columns with `select`
- withColumn
- SQL Integration
    - createOrReplaceTempView

In [2]:
import os

import findspark

# Make the local Spark installation importable from this kernel.
# Prefer the SPARK_HOME environment variable so the notebook runs on other
# machines; fall back to the original install path when it is not set.
findspark.init(os.environ.get('SPARK_HOME', '/home/navin/spark-2.4.6-bin-hadoop2.7'))

In [3]:
from pyspark.sql import SparkSession

In [11]:
# Create the SparkSession for this notebook (or reuse one that already exists),
# registered under the application name 'Basics'.
spark = (
    SparkSession.builder
    .appName('Basics')
    .getOrCreate()
)

# Note: Error when reading a file from the local file system
- The **Py4JJavaError** seen when loading `employee_data.csv` indicates that Spark is trying to access the file through HDFS (Hadoop Distributed File System) instead of the local file system.
- To fix it, explicitly tell Spark to read from the local file system by adding the `file://` prefix to the path.

In [13]:
# Load the employee CSV from the local file system (note the file:// prefix).
# header=True takes the column names from the first row; inferSchema=True lets
# Spark detect each column's data type.
df = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('file:///home/navin/Documents/employee_data.csv')
)

# inferSchema=True

- Spark automatically detects the column data types (e.g., integers, doubles, strings),
- so the schema does not have to be specified manually.

In [14]:
# Print the DataFrame's rows as a text table.
df.show()

+---+--------------+---+-----------+------+
| id|          name|age| department|salary|
+---+--------------+---+-----------+------+
|  1|      John Doe| 30|Engineering| 70000|
|  2|    Jane Smith| 25|  Marketing| 50000|
|  3|   Bob Johnson| 40|      Sales| 60000|
|  4|Alice Williams| 35|Engineering| 80000|
|  5| Charlie Brown| 28|  Marketing| 55000|
|  6|  David Wilson| 45|      Sales| 75000|
|  7|     Eva Davis| 32|         HR| 62000|
|  8|   Frank Moore| 50|Engineering| 90000|
|  9|     Grace Lee| 29|  Marketing| 48000|
| 10|  Hannah White| 33|      Sales| 65000|
+---+--------------+---+-----------+------+



In [16]:
# Print the schema as a tree: each column's name, inferred type and nullability.
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: integer (nullable = true)



In [17]:
# Column names as a plain Python list of strings.
df.columns

['id', 'name', 'age', 'department', 'salary']

In [18]:
# Summary statistics (count, mean, stddev, min, max) for every column;
# mean and stddev come back as null for the string columns.
df.describe().show()

+-------+------------------+--------------+-----------------+-----------+-----------------+
|summary|                id|          name|              age| department|           salary|
+-------+------------------+--------------+-----------------+-----------+-----------------+
|  count|                10|            10|               10|         10|               10|
|   mean|               5.5|          null|             34.7|       null|          65500.0|
| stddev|3.0276503540974917|          null|7.972870666621066|       null|13418.47813858023|
|    min|                 1|Alice Williams|               25|Engineering|            48000|
|    max|                10|      John Doe|               50|      Sales|            90000|
+-------+------------------+--------------+-----------------+-----------+-----------------+



In [19]:
# Referencing a single column yields a Column object, not the data itself.
type(df.age)

pyspark.sql.column.Column

In [25]:
# Fetch the first two rows to the driver as a list of Row objects.
df.head(n=2)

[Row(id=1, name='John Doe', age=30, department='Engineering', salary=70000),
 Row(id=2, name='Jane Smith', age=25, department='Marketing', salary=50000)]

In [30]:
# Index into the returned list to get the first Row.
df.head(n=2)[0]

Row(id=1, name='John Doe', age=30, department='Engineering', salary=70000)

In [29]:
# Each element returned by head() is a Row.
type(df.head(n=2)[0])

pyspark.sql.types.Row

In [31]:
# Select multiple columns. Without an action such as .show(), the cell only
# displays the resulting DataFrame's schema.
df.select('id', 'name')

DataFrame[id: int, name: string]

In [33]:
# Select three columns and print the resulting rows.
df.select('id', 'name', 'age').show()

+---+--------------+---+
| id|          name|age|
+---+--------------+---+
|  1|      John Doe| 30|
|  2|    Jane Smith| 25|
|  3|   Bob Johnson| 40|
|  4|Alice Williams| 35|
|  5| Charlie Brown| 28|
|  6|  David Wilson| 45|
|  7|     Eva Davis| 32|
|  8|   Frank Moore| 50|
|  9|     Grace Lee| 29|
| 10|  Hannah White| 33|
+---+--------------+---+



In [35]:
# Transformation: derive a new column ('newAge') from an existing column.
# withColumn returns a new DataFrame; df itself is not modified.
df.withColumn('newAge', df.age * 2).show()

+---+--------------+---+-----------+------+------+
| id|          name|age| department|salary|newAge|
+---+--------------+---+-----------+------+------+
|  1|      John Doe| 30|Engineering| 70000|    60|
|  2|    Jane Smith| 25|  Marketing| 50000|    50|
|  3|   Bob Johnson| 40|      Sales| 60000|    80|
|  4|Alice Williams| 35|Engineering| 80000|    70|
|  5| Charlie Brown| 28|  Marketing| 55000|    56|
|  6|  David Wilson| 45|      Sales| 75000|    90|
|  7|     Eva Davis| 32|         HR| 62000|    64|
|  8|   Frank Moore| 50|Engineering| 90000|   100|
|  9|     Grace Lee| 29|  Marketing| 48000|    58|
| 10|  Hannah White| 33|      Sales| 65000|    66|
+---+--------------+---+-----------+------+------+



In [37]:
# Show the DataFrame with the 'age' column renamed to 'myNewAge'.
df.withColumnRenamed(existing='age', new='myNewAge').show()

+---+--------------+--------+-----------+------+
| id|          name|myNewAge| department|salary|
+---+--------------+--------+-----------+------+
|  1|      John Doe|      30|Engineering| 70000|
|  2|    Jane Smith|      25|  Marketing| 50000|
|  3|   Bob Johnson|      40|      Sales| 60000|
|  4|Alice Williams|      35|Engineering| 80000|
|  5| Charlie Brown|      28|  Marketing| 55000|
|  6|  David Wilson|      45|      Sales| 75000|
|  7|     Eva Davis|      32|         HR| 62000|
|  8|   Frank Moore|      50|Engineering| 90000|
|  9|     Grace Lee|      29|  Marketing| 48000|
| 10|  Hannah White|      33|      Sales| 65000|
+---+--------------+--------+-----------+------+



# SQL

In [38]:
# Register the DataFrame as the temporary view "emp" so it can be queried with SQL.
df.createOrReplaceTempView(name="emp")

In [44]:
# Query the "emp" view with SQL: keep only the employees whose salary exceeds 60000.
results = spark.sql(
    "SELECT * FROM emp WHERE salary >60000"
)

In [45]:
# Print the rows returned by the SQL query.
results.show()

+---+--------------+---+-----------+------+
| id|          name|age| department|salary|
+---+--------------+---+-----------+------+
|  1|      John Doe| 30|Engineering| 70000|
|  4|Alice Williams| 35|Engineering| 80000|
|  6|  David Wilson| 45|      Sales| 75000|
|  7|     Eva Davis| 32|         HR| 62000|
|  8|   Frank Moore| 50|Engineering| 90000|
| 10|  Hannah White| 33|      Sales| 65000|
+---+--------------+---+-----------+------+



In [46]:
# Stop the SparkSession (last step of the notebook).
spark.stop()