In [2]:
from pyspark.sql import SparkSession

# Create (or reuse) the notebook's Spark session, running locally on all available cores.
# NOTE(review): with a local master the executors presumably live inside the driver
# process, so "spark.executor.memory" may have no effect here — confirm whether
# driver memory is what was intended.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("test")
    .config("spark.executor.memory", "1g")
    .getOrCreate()
)

In [3]:
# Column names for the hand-built sample frame; order matches the tuple fields in `data`.
columns = [
    "customer_id",
    "name",
    "city",
    "state",
    "country",
    "registration_date",
    "is_active",
]

# Two sample customer rows. registration_date is kept as ISO-formatted text here.
data = [
    (0, "Customer_0", "Bangalore", "Karnataka", "India", "2023-11-11", True),
    (1, "Customer_1", "Delhi", "Delhi", "India", "2023-08-26", True),
]

In [4]:
# Build a DataFrame from the in-memory rows; the list of names supplies the column
# labels and Spark derives each column's type from the Python values.
df = spark.createDataFrame(data, schema=columns)

In [5]:
# Print the sample frame as a text table (show's defaults: up to 20 rows, long cells truncated).
df.show(n=20, truncate=True)

+-----------+----------+---------+---------+-------+-----------------+---------+
|customer_id|      name|     city|    state|country|registration_date|is_active|
+-----------+----------+---------+---------+-------+-----------------+---------+
|          0|Customer_0|Bangalore|Karnataka|  India|       2023-11-11|     true|
|          1|Customer_1|    Delhi|    Delhi|  India|       2023-08-26|     true|
+-----------+----------+---------+---------+-------+-----------------+---------+



In [7]:
# Project the sample frame down to the customer name column and print it.
df.select(df["name"]).show(n=20, truncate=True)

+----------+
|      name|
+----------+
|Customer_0|
|Customer_1|
+----------+



In [9]:
# Load the customers CSV with an explicit schema rather than inferSchema.
# Inference makes Spark scan the file an extra time and lets column types change
# silently when the data changes; declaring the schema keeps the load single-pass
# and the types stable. The DDL matches what df_2.printSchema() reports in this
# notebook (all columns nullable). The header row is still skipped via "header".
df_2 = (
    spark.read.format("csv")
    .option("header", "true")
    .schema(
        "customer_id INT, name STRING, city STRING, state STRING, "
        "country STRING, registration_date DATE, is_active BOOLEAN"
    )
    .load("../Data/customers.csv")
)

In [10]:
# Preview the loaded customers frame (show's defaults: up to 20 rows, long cells truncated).
df_2.show(n=20, truncate=True)

+-----------+-----------+---------+-----------+-------+-----------------+---------+
|customer_id|       name|     city|      state|country|registration_date|is_active|
+-----------+-----------+---------+-----------+-------+-----------------+---------+
|          0| Customer_0|     Pune|West Bengal|  India|       2023-10-10|     true|
|          1| Customer_1|Bangalore|    Gujarat|  India|       2023-10-19|    false|
|          2| Customer_2|Bangalore|  Karnataka|  India|       2023-02-10|     true|
|          3| Customer_3|Bangalore|  Telangana|  India|       2023-03-24|     true|
|          4| Customer_4|Hyderabad|  Telangana|  India|       2023-06-04|    false|
|          5| Customer_5|Hyderabad|West Bengal|  India|       2023-07-26|     true|
|          6| Customer_6|Hyderabad|  Karnataka|  India|       2023-08-07|    false|
|          7| Customer_7|Bangalore|  Telangana|  India|       2023-08-25|     true|
|          8| Customer_8|Bangalore|Maharashtra|  India|       2023-07-13|   

In [11]:
# Print the name, type and nullability Spark assigned to each column of the loaded frame.
df_2.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- country: string (nullable = true)
 |-- registration_date: date (nullable = true)
 |-- is_active: boolean (nullable = true)



In [12]:
# Keep only the customers whose is_active flag is true; the predicate is a SQL
# expression string evaluated by Spark.
active_customers = df_2.where("is_active == True")

In [13]:
# Bare expression: the notebook echoes the DataFrame's repr (column names and types, no rows).
active_customers

DataFrame[customer_id: int, name: string, city: string, state: string, country: string, registration_date: date, is_active: boolean]

In [14]:
# Print the active customers (show's defaults: up to 20 rows, long cells truncated).
active_customers.show(n=20, truncate=True)

+-----------+-----------+---------+-----------+-------+-----------------+---------+
|customer_id|       name|     city|      state|country|registration_date|is_active|
+-----------+-----------+---------+-----------+-------+-----------------+---------+
|          0| Customer_0|     Pune|West Bengal|  India|       2023-10-10|     true|
|          2| Customer_2|Bangalore|  Karnataka|  India|       2023-02-10|     true|
|          3| Customer_3|Bangalore|  Telangana|  India|       2023-03-24|     true|
|          5| Customer_5|Hyderabad|West Bengal|  India|       2023-07-26|     true|
|          7| Customer_7|Bangalore|  Telangana|  India|       2023-08-25|     true|
|         11|Customer_11|     Pune|West Bengal|  India|       2023-04-10|     true|
|         12|Customer_12|    Delhi| Tamil Nadu|  India|       2023-06-20|     true|
|         13|Customer_13|Ahmedabad|      Delhi|  India|       2023-11-09|     true|
|         14|Customer_14|Ahmedabad|West Bengal|  India|       2023-02-18|   

In [15]:
# Narrow the customers frame to the identifying columns only.
selected_columns = df_2.select(
    ["customer_id", "name", "city"]
)

In [16]:
# Bare expression: the notebook echoes the DataFrame's repr (column names and types, no rows).
selected_columns

DataFrame[customer_id: int, name: string, city: string]

In [17]:
# Print the projected frame; show() stops after the first 20 rows by default.
selected_columns.show(n=20, truncate=True)

+-----------+-----------+---------+
|customer_id|       name|     city|
+-----------+-----------+---------+
|          0| Customer_0|     Pune|
|          1| Customer_1|Bangalore|
|          2| Customer_2|Bangalore|
|          3| Customer_3|Bangalore|
|          4| Customer_4|Hyderabad|
|          5| Customer_5|Hyderabad|
|          6| Customer_6|Hyderabad|
|          7| Customer_7|Bangalore|
|          8| Customer_8|Bangalore|
|          9| Customer_9|Ahmedabad|
|         10|Customer_10|  Chennai|
|         11|Customer_11|     Pune|
|         12|Customer_12|    Delhi|
|         13|Customer_13|Ahmedabad|
|         14|Customer_14|Ahmedabad|
|         15|Customer_15|   Mumbai|
|         16|Customer_16|    Delhi|
|         17|Customer_17|  Chennai|
|         18|Customer_18|Ahmedabad|
|         19|Customer_19|    Delhi|
+-----------+-----------+---------+
only showing top 20 rows

