In [5]:
sc

In [6]:
spark

In [7]:
bank_customer_data = spark.read.json("file:///home/hadoop/Downloads/bank_edited.json",multiLine=True)
bank_customer_data.show()

+---+-------+--------+-------+---+-------+--------+---------+-------+------------+----+--------+-----+-----+--------+--------+---+
|age|balance|campaign|contact|day|default|duration|education|housing|         job|loan| marital|month|pdays|poutcome|previous|  y|
+---+-------+--------+-------+---+-------+--------+---------+-------+------------+----+--------+-----+-----+--------+--------+---+
| 58|   2143|       1|unknown|  5|     no|     261| tertiary|    yes|  management|  no| married|  may|   -1| unknown|       0| no|
| 44|     29|       1|unknown|  5|     no|     151|secondary|    yes|  technician|  no|  single|  may|   -1| unknown|       0| no|
| 33|      2|       1|unknown|  5|     no|      76|secondary|    yes|entrepreneur| yes| married|  may|   -1| unknown|       0| no|
| 47|   1506|       1|unknown|  5|     no|      92|  unknown|    yes| blue-collar|  no| married|  may|   -1| unknown|       0| no|
| 33|      1|       1|unknown|  5|     no|     198|  unknown|     no|     unknown| 

In [8]:
bank_customer_data.printSchema()

root
 |-- age: long (nullable = true)
 |-- balance: long (nullable = true)
 |-- campaign: long (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: long (nullable = true)
 |-- default: string (nullable = true)
 |-- duration: long (nullable = true)
 |-- education: string (nullable = true)
 |-- housing: string (nullable = true)
 |-- job: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- month: string (nullable = true)
 |-- pdays: long (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- previous: long (nullable = true)
 |-- y: string (nullable = true)



### 1.Display Max, MIn and Mean age of the targetted customer 

In [9]:
bank_customer_data.createOrReplaceTempView("banktable")

In [10]:
from pyspark.sql.types import *
from pyspark.sql.functions import *


In [11]:
bank_customer_data.select(max("age")).show()

+--------+
|max(age)|
+--------+
|      95|
+--------+



In [12]:
bank_customer_data.select(min("age")).show()

+--------+
|min(age)|
+--------+
|      18|
+--------+



In [13]:
bank_customer_data.select(round(mean("age"))).show()

+------------------+
|round(avg(age), 0)|
+------------------+
|              41.0|
+------------------+



In [14]:
# using spark sql
spark.sql("select max(age),min(age),mean(age) from banktable").show()

+--------+--------+-----------------+
|max(age)|min(age)|         avg(age)|
+--------+--------+-----------------+
|      95|      18|40.93621021432837|
+--------+--------+-----------------+



### 2. Check the quality of customer by analysing targeted customer with their min,max,mean,median balance

In [15]:
spark.sql("select max(balance),min(abs(balance)) as minimum_balance,mean(balance),percentile_approx(balance,0.5) as median_balance from banktable").show()

+------------+---------------+------------------+--------------+
|max(balance)|minimum_balance|      avg(balance)|median_balance|
+------------+---------------+------------------+--------------+
|      102127|              0|1362.2720576850766|           448|
+------------+---------------+------------------+--------------+



### 3 Check if age matters in marketing subscription for Term Deposit scheme

In [16]:
spark.sql("select age,count(*) as customer_count from banktable where y = 'yes' group by age order by customer_count desc").show()

+---+--------------+
|age|customer_count|
+---+--------------+
| 32|           221|
| 30|           217|
| 33|           210|
| 35|           209|
| 31|           206|
| 34|           198|
| 36|           195|
| 29|           171|
| 37|           170|
| 28|           162|
| 38|           144|
| 39|           143|
| 27|           141|
| 26|           134|
| 41|           120|
| 46|           118|
| 40|           116|
| 25|           113|
| 47|           113|
| 42|           111|
+---+--------------+
only showing top 20 rows



### 4. Calculate Marketing success rate and Failure rate

In [17]:
spark.sql("select round((select count(*) from banktable where y = 'yes')*100/(count(*)),3) as success_rate from banktable").show()

+------------+
|success_rate|
+------------+
|      11.698|
+------------+



In [18]:
spark.sql("select round((select count(*) from banktable where y = 'no')*100/(count(*)),3) as failure_rate from banktable").show()

+------------+
|failure_rate|
+------------+
|      88.302|
+------------+



### 5. if marital status matters in marketing subscription for term deposit scheme 

In [19]:
#5

spark.sql("""
select marital,count(*) as maritalsuccess 
from banktable 
where y='yes' 
group by marital 
order by maritalsuccess desc""").show()


+--------+--------------+
| marital|maritalsuccess|
+--------+--------------+
| married|          2755|
|  single|          1912|
|divorced|           622|
+--------+--------------+



In [20]:
#6 check if age and marital status matter together for subscription to term deposit scheme
spark.sql("""
select age,marital,count(*) as success from banktable 
where y='yes' 
group by age, marital 
order by success desc
""").show()



+---+-------+-------+
|age|marital|success|
+---+-------+-------+
| 30| single|    151|
| 28| single|    138|
| 29| single|    133|
| 32| single|    124|
| 26| single|    121|
| 34|married|    118|
| 31| single|    111|
| 27| single|    110|
| 35|married|    101|
| 36|married|    100|
| 25| single|     99|
| 37|married|     98|
| 33|married|     97|
| 33| single|     97|
| 32|married|     87|
| 39|married|     87|
| 38|married|     86|
| 35| single|     84|
| 47|married|     83|
| 31|married|     80|
+---+-------+-------+
only showing top 20 rows



### 7.compute success rate of each age and marital status category group by age marital status

In [21]:
#7. 
spark.sql("""
    select age, marital, 100.0 * sum(case when y = 'yes' then 1 else 0 end) / COUNT(*) AS success_rate
    FROM banktable
    GROUP BY age, marital
    ORDER BY success_rate DESC
""").show()


+---+--------+------------------+
|age| marital|      success_rate|
+---+--------+------------------+
| 93| married|100.00000000000000|
| 95|divorced|100.00000000000000|
| 68|divorced|100.00000000000000|
| 92| married|100.00000000000000|
| 85|divorced|100.00000000000000|
| 90|divorced|100.00000000000000|
| 87|divorced|100.00000000000000|
| 86|  single|100.00000000000000|
| 67|divorced| 87.50000000000000|
| 62|divorced| 83.33333333333333|
| 85| married| 75.00000000000000|
| 76|divorced| 75.00000000000000|
| 71|divorced| 72.72727272727273|
| 87| married| 66.66666666666667|
| 73|divorced| 66.66666666666667|
| 84| married| 66.66666666666667|
| 77|divorced| 60.00000000000000|
| 18|  single| 58.33333333333333|
| 63|divorced| 57.14285714285714|
| 73| married| 52.77777777777778|
+---+--------+------------------+
only showing top 20 rows



### 8 Do feature engineering for Bank Investment scheme and find effect  of age on Campaign

In [22]:
spark.sql("""
select age_category,count(*) as success_count from(select case when age < 25 then 'Teenager'
            when age >= 25 and age <=35 then 'Adult'
            when age >=35 and age<=55 then 'MiddleAged'
            else 'Old' 
        end as age_category
from banktable where y = 'yes') group by age_category 
order by success_count desc
""").show()

+------------+-------------+
|age_category|success_count|
+------------+-------------+
|  MiddleAged|         2194|
|       Adult|         1982|
|         Old|          906|
|    Teenager|          207|
+------------+-------------+



### alternative method: by creating udf

In [23]:
from pyspark.sql.functions import udf

In [24]:
age_range = udf(lambda age : 'Teenager' if age <25 else
                             'Adult' if (age >=25 and age < 35) else
                             'Middle Aged' if (age >=35 and age <55) else
                             'Old'
               )

In [25]:
bank_customer_df = bank_customer_data.withColumn('age_category',age_range(bank_customer_data.age))

In [26]:
bank_customer_df.show()

+---+-------+--------+-------+---+-------+--------+---------+-------+------------+----+--------+-----+-----+--------+--------+---+------------+
|age|balance|campaign|contact|day|default|duration|education|housing|         job|loan| marital|month|pdays|poutcome|previous|  y|age_category|
+---+-------+--------+-------+---+-------+--------+---------+-------+------------+----+--------+-----+-----+--------+--------+---+------------+
| 58|   2143|       1|unknown|  5|     no|     261| tertiary|    yes|  management|  no| married|  may|   -1| unknown|       0| no|         Old|
| 44|     29|       1|unknown|  5|     no|     151|secondary|    yes|  technician|  no|  single|  may|   -1| unknown|       0| no| Middle Aged|
| 33|      2|       1|unknown|  5|     no|      76|secondary|    yes|entrepreneur| yes| married|  may|   -1| unknown|       0| no|       Adult|
| 47|   1506|       1|unknown|  5|     no|      92|  unknown|    yes| blue-collar|  no| married|  may|   -1| unknown|       0| no| Middl

In [29]:
bank_customer_df.createOrReplaceTempView('newBankTable')

In [31]:
spark.sql("""
select age_category,count(*) as success_count from newBankTable group by age_category
order by success_count DESC
""").show()

+------------+-------------+
|age_category|success_count|
+------------+-------------+
| Middle Aged|        24492|
|       Adult|        14204|
|         Old|         5706|
|    Teenager|          809|
+------------+-------------+



### Write a query to show distributed probability rate for each age category

In [None]:
spark.sql("""
  WITH AgeCategoryCounts AS (
    SELECT 
      CASE 
        WHEN age < 25 THEN 'Teenager'
        WHEN age >= 25 AND age <= 35 THEN 'Adult'
        WHEN age >= 35 AND age <= 55 THEN 'MiddleAged'
        ELSE 'Old' 
      END AS age_category,
      COUNT(*) AS count
    FROM banktable 
    WHERE y = 'yes'
    GROUP BY age_category
  ),
  TotalCounts AS (
    SELECT 
      SUM(count) AS total_count
    FROM AgeCategoryCounts
  )
  SELECT 
    age_category,
    count,
    count / total_count AS probability
  FROM AgeCategoryCounts, TotalCounts
  ORDER BY probability DESC
""").show()