Importing dependencies

In [None]:
# Third-party dependencies for this notebook: findspark plus the PySpark
# session entry point and the column helper.
import findspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col


In [2]:
# Initialise findspark, then obtain the notebook's SparkSession
# (getOrCreate() hands back an already-running session when there is one).
findspark.init()
session_builder = SparkSession.builder.appName("small_project_pyspark")
spark = session_builder.getOrCreate()

# Bare last expression so the notebook displays the session object.
spark

In [11]:
# Load the customer dimension table; the first CSV row is the header and
# column types are inferred from the data.
# The path is a raw string: sequences such as "\s" and "\d" are invalid
# escapes in a normal literal and trigger a SyntaxWarning on recent Python
# versions, while the raw form yields exactly the same path.
ct_df = spark.read.csv(
    r'C:\spark_training\Small_Project\components\data\dim_customers.csv',
    inferSchema=True,
    header=True,
)

In [12]:
# Print the inferred column names/types, then preview the first rows of the
# customer table.
ct_df.printSchema()
ct_df.show()

root
 |-- customer_id: string (nullable = true)
 |-- age_group: string (nullable = true)
 |-- city: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- marital status: string (nullable = true)
 |-- avg_income: integer (nullable = true)

+-----------+---------+---------+--------------------+------+--------------+----------+
|customer_id|age_group|     city|          occupation|gender|marital status|avg_income|
+-----------+---------+---------+--------------------+------+--------------+----------+
| ATQCUS1825|      45+|Bengaluru|Salaried IT Emplo...|  Male|       Married|     73523|
| ATQCUS0809|    25-34|Hyderabad|Salaried Other Em...|  Male|       Married|     39922|
| ATQCUS0663|    25-34|  Chennai|Salaried Other Em...|  Male|       Married|     37702|
| ATQCUS0452|    25-34|Delhi NCR|Government Employees|  Male|       Married|     54090|
| ATQCUS3350|    21-24|Bengaluru|         Freelancers|  Male|        Single|     28376|
|

In [None]:
# Data-quality check: list every customer row whose avg_income is missing.
missing_income_rows = ct_df.filter(col("avg_income").isNull())
missing_income_rows.show()
# The captured output for this run is empty, i.e. no nulls in avg_income.

+-----------+---------+----+----------+------+--------------+----------+
|customer_id|age_group|city|occupation|gender|marital status|avg_income|
+-----------+---------+----+----------+------+--------------+----------+
+-----------+---------+----+----------+------+--------------+----------+



In [18]:
# Rename 'marital status' to a snake_case name without a space.
# withColumnRenamed returns a NEW DataFrame, so the result is bound to a name
# instead of being discarded after display. ct_df itself is left untouched,
# which means the cells below still see the original 'marital status' column.
ct_df_renamed = ct_df.withColumnRenamed('marital status', 'marital_status')
ct_df_renamed.show()

+-----------+---------+---------+--------------------+------+--------------+----------+
|customer_id|age_group|     city|          occupation|gender|marital_status|avg_income|
+-----------+---------+---------+--------------------+------+--------------+----------+
| ATQCUS1825|      45+|Bengaluru|Salaried IT Emplo...|  Male|       Married|     73523|
| ATQCUS0809|    25-34|Hyderabad|Salaried Other Em...|  Male|       Married|     39922|
| ATQCUS0663|    25-34|  Chennai|Salaried Other Em...|  Male|       Married|     37702|
| ATQCUS0452|    25-34|Delhi NCR|Government Employees|  Male|       Married|     54090|
| ATQCUS3350|    21-24|Bengaluru|         Freelancers|  Male|        Single|     28376|
| ATQCUS3256|    21-24|Delhi NCR|Salaried IT Emplo...|  Male|        Single|     46586|
| ATQCUS3608|    25-34|  Chennai|         Freelancers|  Male|        Single|     34966|
| ATQCUS0611|    25-34|  Chennai|Salaried IT Emplo...|  Male|       Married|     59078|
| ATQCUS3856|    21-24|Bengaluru

In [21]:
from datetime import datetime

from pyspark.sql.functions import lit

# The driver clock is read once here, so every row receives the same literal
# timestamp value.
load_timestamp = lit(datetime.now())

# net_salary = avg_income plus 10% of avg_income.
income = col("avg_income")
net_salary_expr = income + (0.1 * income)

new_ct_df = (
    ct_df
    .withColumn("timestamp", load_timestamp)
    .withColumn("net_salary", net_salary_expr)
)
new_ct_df.show()

+-----------+---------+---------+--------------------+------+--------------+----------+--------------------+----------+
|customer_id|age_group|     city|          occupation|gender|marital status|avg_income|           timestamp|net_salary|
+-----------+---------+---------+--------------------+------+--------------+----------+--------------------+----------+
| ATQCUS1825|      45+|Bengaluru|Salaried IT Emplo...|  Male|       Married|     73523|2025-03-06 18:50:...|   80875.3|
| ATQCUS0809|    25-34|Hyderabad|Salaried Other Em...|  Male|       Married|     39922|2025-03-06 18:50:...|   43914.2|
| ATQCUS0663|    25-34|  Chennai|Salaried Other Em...|  Male|       Married|     37702|2025-03-06 18:50:...|   41472.2|
| ATQCUS0452|    25-34|Delhi NCR|Government Employees|  Male|       Married|     54090|2025-03-06 18:50:...|   59499.0|
| ATQCUS3350|    21-24|Bengaluru|         Freelancers|  Male|        Single|     28376|2025-03-06 18:50:...|   31213.6|
| ATQCUS3256|    21-24|Delhi NCR|Salarie

In [33]:
from pyspark.sql.functions import window, rank

In [32]:
# Window (capital W) is the window-specification class; it is distinct from
# the lowercase functions.window helper, so it is imported here to keep this
# cell runnable on a fresh kernel.
from pyspark.sql import Window

# Window specs for ranking customers within each city / occupation.
# Ordering is ascending on net_salary, so rank 1 is the lowest net_salary
# in its partition.
CityPartition = Window.partitionBy('city').orderBy('net_salary')
OccupationPartition = Window.partitionBy('occupation').orderBy('net_salary')

In [35]:
# Rank customers inside each city using the CityPartition window spec.
city_rank = rank().over(CityPartition)
new_ct_df_cityrank = new_ct_df.withColumn('rank', city_rank)
new_ct_df_cityrank.show()

+-----------+---------+---------+-----------+------+--------------+----------+--------------------+----------+----+
|customer_id|age_group|     city| occupation|gender|marital status|avg_income|           timestamp|net_salary|rank|
+-----------+---------+---------+-----------+------+--------------+----------+--------------------+----------+----+
| ATQCUS3345|    21-24|Bengaluru|Freelancers|  Male|        Single|     24888|2025-03-06 18:50:...|   27376.8|   1|
| ATQCUS2053|    21-24|Bengaluru|Freelancers|Female|       Married|     24995|2025-03-06 18:50:...|   27494.5|   2|
| ATQCUS0082|    21-24|Bengaluru|Freelancers|  Male|       Married|     25301|2025-03-06 18:50:...|   27831.1|   3|
| ATQCUS3864|    21-24|Bengaluru|Freelancers|Female|        Single|     25537|2025-03-06 18:50:...|   28090.7|   4|
| ATQCUS3347|    21-24|Bengaluru|Freelancers|  Male|        Single|     25653|2025-03-06 18:50:...|   28218.3|   5|
| ATQCUS3859|    21-24|Bengaluru|Freelancers|Female|        Single|     

In [36]:
# Rank customers inside each occupation using the OccupationPartition window spec.
occupation_rank = rank().over(OccupationPartition)
new_ct_df_occupationrank = new_ct_df.withColumn('rank', occupation_rank)
new_ct_df_occupationrank.show()

+-----------+---------+---------+---------------+------+--------------+----------+--------------------+----------+----+
|customer_id|age_group|     city|     occupation|gender|marital status|avg_income|           timestamp|net_salary|rank|
+-----------+---------+---------+---------------+------+--------------+----------+--------------------+----------+----+
| ATQCUS0075|    21-24|Bengaluru|Business Owners|  Male|       Married|     49549|2025-03-06 18:50:...|   54503.9|   1|
| ATQCUS3207|    21-24|   Mumbai|Business Owners|  Male|        Single|     50101|2025-03-06 18:50:...|   55111.1|   2|
| ATQCUS3790|    21-24|   Mumbai|Business Owners|Female|        Single|     50353|2025-03-06 18:50:...|   55388.3|   3|
| ATQCUS3216|    21-24|   Mumbai|Business Owners|  Male|        Single|     50647|2025-03-06 18:50:...|   55711.7|   4|
| ATQCUS3336|    21-24|Bengaluru|Business Owners|  Male|        Single|     50673|2025-03-06 18:50:...|   55740.3|   5|
| ATQCUS3285|    21-24|Delhi NCR|Busines

In [37]:
# Persist the enriched customer frame as Parquet, replacing any previous output.
# NOTE(review): DataFrameWriter.parquet() returns None, so challenge_1 is bound
# to None after this cell; the name is kept only for backward compatibility.
parquet_target = "C:\\spark_training\\Small_Project\\components\\challenge_file.parquet"
overwrite_writer = new_ct_df.write.mode("overwrite")
challenge_1 = overwrite_writer.parquet(parquet_target)

7. Matplotlib

In [None]:
import matplotlib.pyplot as plt