In [1]:
# List the top-level directories of the cluster's Hadoop filesystem.
!hadoop fs -ls /

Found 4 items
drwxr-xr-x   - root hadoop          0 2025-09-01 04:29 /data
drwxrwxrwt   - hdfs hadoop          0 2025-08-31 05:40 /tmp
drwxrwxrwt   - hdfs hadoop          0 2025-08-31 04:29 /user
drwxrwxrwt   - hdfs hadoop          0 2025-08-31 04:21 /var


In [2]:
# List the contents of /data.
!hadoop fs -ls /data

Found 1 items
-rw-r--r--   2 root hadoop    1048576 2025-09-01 04:29 /data/customers.csv


In [3]:
# List the contents of /tmp, where the customers CSV used below is stored.
!hadoop fs -ls /tmp

Found 3 items
-rw-r--r--   2 nileshnandan_ts hadoop    1060750 2025-08-31 05:38 /tmp/customers1mb.csv
drwxrwxrwt   - hdfs            hadoop          0 2025-08-31 04:21 /tmp/hadoop-yarn
drwx-wx-wx   - hive            hadoop          0 2025-08-31 04:22 /tmp/hive


In [6]:
# Peek at the start of the CSV to check the header row and column layout
# before defining a schema (the output is cut off mid-row).
!hadoop fs -head /tmp/customers1mb.csv

customer_id,name,city,state,country,registration_date,is_active
0,Customer_0,Pune,Maharashtra,India,2023-06-29,False
1,Customer_1,Bangalore,Tamil Nadu,India,2023-12-07,True
2,Customer_2,Hyderabad,Gujarat,India,2023-10-27,True
3,Customer_3,Bangalore,Karnataka,India,2023-10-17,False
4,Customer_4,Ahmedabad,Karnataka,India,2023-03-14,False
5,Customer_5,Hyderabad,Karnataka,India,2023-07-28,False
6,Customer_6,Pune,Delhi,India,2023-08-29,False
7,Customer_7,Ahmedabad,West Bengal,India,2023-12-28,True
8,Customer_8,Pune,Karnataka,India,2023-06-22,True
9,Customer_9,Mumbai,Telangana,India,2023-01-05,True
10,Customer_10,Pune,Gujarat,India,2023-08-05,True
11,Customer_11,Delhi,West Bengal,India,2023-08-02,False
12,Customer_12,Chennai,Gujarat,India,2023-11-21,False
13,Customer_13,Chennai,Karnataka,India,2023-11-06,True
14,Customer_14,Hyderabad,Tamil Nadu,India,2023-02-07,False
15,Customer_15,Mumbai,Gujarat,India,2023-03-02,True
16,Customer_16,Chennai,Karnataka,India,2023-04-05,False
17,Customer_17,Hyd

In [4]:
# Display the SparkSession object already bound to `spark`.
# NOTE(review): presumably injected by the notebook kernel, since no earlier
# cell defines it — confirm for the target environment.
spark

In [5]:
from pyspark.sql import SparkSession

# Obtain a session named "DF"; getOrCreate() hands back the already-running
# session when one exists instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("DF")
    .getOrCreate()
)

25/09/01 12:20:29 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [7]:
# Alternative reader that infers the schema (kept for reference, not used):
#
#   df = (
#       spark.read
#       .format('csv')
#       .option('header', 'true')
#       .option('inferSchema', 'true')
#       .option('mode', '<MODE>')  # PERMISSIVE (default), DROPMALFORMED or FAILFAST
#       .load('/tmp/customers1mb.csv')
#   )
#
# inferSchema needs an extra pass over the data and can guess a type wrongly,
# so an explicit StructType is used instead.

from pyspark.sql.types import StructType, StructField, IntegerType, FloatType, BooleanType, StringType

# Explicit schema for the customers CSV. List every column of the file; do not
# leave any out.
#
# All fields are declared nullable=True on purpose: the CSV reader does not
# enforce nullable=False from a user-supplied schema (printSchema() on the
# loaded DataFrame reports `nullable = true` for every column), so declaring
# False would only suggest a guarantee that Spark does not provide.
schema = StructType([
    StructField('customer_id', IntegerType(), True),
    StructField('name', StringType(), True),
    StructField('city', StringType(), True),
    StructField('state', StringType(), True),
    StructField('country', StringType(), True),
    StructField('registration_date', StringType(), True),
    StructField('is_active', BooleanType(), True),
])

# The same schema can also be passed to .schema() as a DDL string, e.g.
# 'customer_id INT, name STRING, city STRING, state STRING, country STRING,
#  registration_date STRING, is_active BOOLEAN'.

In [8]:
# Load the customers CSV with the explicit schema; the first line of the file
# is treated as a header rather than as data.
df = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .schema(schema)
    .load('/tmp/customers1mb.csv')
)

In [9]:
# Print the column names, types and nullability of the loaded DataFrame.
df.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- country: string (nullable = true)
 |-- registration_date: string (nullable = true)
 |-- is_active: boolean (nullable = true)



In [11]:
# Preview the first rows of the DataFrame as a text table.
df.show()

                                                                                

+-----------+-----------+---------+-----------+-------+-----------------+---------+
|customer_id|       name|     city|      state|country|registration_date|is_active|
+-----------+-----------+---------+-----------+-------+-----------------+---------+
|          0| Customer_0|     Pune|Maharashtra|  India|       2023-06-29|    false|
|          1| Customer_1|Bangalore| Tamil Nadu|  India|       2023-12-07|     true|
|          2| Customer_2|Hyderabad|    Gujarat|  India|       2023-10-27|     true|
|          3| Customer_3|Bangalore|  Karnataka|  India|       2023-10-17|    false|
|          4| Customer_4|Ahmedabad|  Karnataka|  India|       2023-03-14|    false|
|          5| Customer_5|Hyderabad|  Karnataka|  India|       2023-07-28|    false|
|          6| Customer_6|     Pune|      Delhi|  India|       2023-08-29|    false|
|          7| Customer_7|Ahmedabad|West Bengal|  India|       2023-12-28|     true|
|          8| Customer_8|     Pune|  Karnataka|  India|       2023-06-22|   

In [12]:
# Column names as a plain Python list.
df.columns

['customer_id',
 'name',
 'city',
 'state',
 'country',
 'registration_date',
 'is_active']

In [13]:
# Summary statistics (count, mean, stddev, min, max) per column. The output
# covers the integer and string columns; the boolean is_active column is absent.
df.describe().show()

25/09/01 13:18:26 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 1:>                                                          (0 + 1) / 1]

+-------+-----------------+-------------+---------+-----------+-------+-----------------+
|summary|      customer_id|         name|     city|      state|country|registration_date|
+-------+-----------------+-------------+---------+-----------+-------+-----------------+
|  count|            17653|        17653|    17653|      17653|  17653|            17653|
|   mean|           8826.0|         NULL|     NULL|       NULL|   NULL|             NULL|
| stddev|5096.126486525493|         NULL|     NULL|       NULL|   NULL|             NULL|
|    min|                0|   Customer_0|Ahmedabad|      Delhi|  India|       2023-01-01|
|    max|            17652|Customer_9999|     Pune|West Bengal|  India|       2023-12-31|
+-------+-----------------+-------------+---------+-----------+-------+-----------------+



                                                                                

In [14]:
# Project only the name and city columns and show the first 3 rows.
df.select('name','city').show(3)

+----------+---------+
|      name|     city|
+----------+---------+
|Customer_0|     Pune|
|Customer_1|Bangalore|
|Customer_2|Hyderabad|
+----------+---------+
only showing top 3 rows



In [16]:
# Keep only customers whose city is Pune and show the first 3 matches.
in_pune = df['city'] == 'Pune'
df.filter(in_pune).show(3)

+-----------+----------+----+-----------+-------+-----------------+---------+
|customer_id|      name|city|      state|country|registration_date|is_active|
+-----------+----------+----+-----------+-------+-----------------+---------+
|          0|Customer_0|Pune|Maharashtra|  India|       2023-06-29|    false|
|          6|Customer_6|Pune|      Delhi|  India|       2023-08-29|    false|
|          8|Customer_8|Pune|  Karnataka|  India|       2023-06-22|     true|
+-----------+----------+----+-----------+-------+-----------------+---------+
only showing top 3 rows



In [17]:
# where() is used here the same way filter() is above: keep rows whose state
# is Delhi and show the first 3 matches.
in_delhi = df['state'] == 'Delhi'
df.where(in_delhi).show(3)

+-----------+-----------+-----+-----+-------+-----------------+---------+
|customer_id|       name| city|state|country|registration_date|is_active|
+-----------+-----------+-----+-----+-------+-----------------+---------+
|          6| Customer_6| Pune|Delhi|  India|       2023-08-29|    false|
|         18|Customer_18| Pune|Delhi|  India|       2023-10-04|     true|
|         26|Customer_26|Delhi|Delhi|  India|       2023-03-22|     true|
+-----------+-----------+-----+-----+-------+-----------------+---------+
only showing top 3 rows



In [18]:
# List the distinct values of the state column.
df.select('state').distinct().show()

+-----------+
|      state|
+-----------+
|    Gujarat|
|      Delhi|
|  Karnataka|
|  Telangana|
|Maharashtra|
| Tamil Nadu|
|West Bengal|
+-----------+



In [19]:
# Count the customers in each state.
(
    df.groupBy('state')
    .count()
    .show()
)

+-----------+-----+
|      state|count|
+-----------+-----+
|    Gujarat| 2543|
|      Delhi| 2578|
|  Karnataka| 2483|
|  Telangana| 2520|
|Maharashtra| 2490|
| Tamil Nadu| 2536|
|West Bengal| 2503|
+-----------+-----+



In [20]:
# TODO: add a df.join example (placeholder — not implemented yet)

In [21]:
# Stop the Spark session now that the walkthrough is finished.
spark.stop()