In [1]:
spark

In [9]:
bank_customer_data = spark.read.json("file:///home/hadoop/Downloads/bank_edited.json", multiLine=True)

In [10]:
bank_customer_data.show()

+---+-------+--------+-------+---+-------+--------+---------+-------+------------+----+--------+-----+-----+--------+--------+---+
|age|balance|campaign|contact|day|default|duration|education|housing|         job|loan| marital|month|pdays|poutcome|previous|  y|
+---+-------+--------+-------+---+-------+--------+---------+-------+------------+----+--------+-----+-----+--------+--------+---+
| 58|   2143|       1|unknown|  5|     no|     261| tertiary|    yes|  management|  no| married|  may|   -1| unknown|       0| no|
| 44|     29|       1|unknown|  5|     no|     151|secondary|    yes|  technician|  no|  single|  may|   -1| unknown|       0| no|
| 33|      2|       1|unknown|  5|     no|      76|secondary|    yes|entrepreneur| yes| married|  may|   -1| unknown|       0| no|
| 47|   1506|       1|unknown|  5|     no|      92|  unknown|    yes| blue-collar|  no| married|  may|   -1| unknown|       0| no|
| 33|      1|       1|unknown|  5|     no|     198|  unknown|     no|     unknown| 

In [11]:
bank_customer_data.printSchema()

root
 |-- age: long (nullable = true)
 |-- balance: long (nullable = true)
 |-- campaign: long (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: long (nullable = true)
 |-- default: string (nullable = true)
 |-- duration: long (nullable = true)
 |-- education: string (nullable = true)
 |-- housing: string (nullable = true)
 |-- job: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- month: string (nullable = true)
 |-- pdays: long (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- previous: long (nullable = true)
 |-- y: string (nullable = true)



#### 1.Display Max, Min and Mean Age of targeted customers

In [12]:
bank_customer_data.createOrReplaceTempView("banktable")

In [13]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [18]:
bank_customer_data.select(max("age")).show()
bank_customer_data.select(min("age")).show()
bank_customer_data.select(round(mean("age"))).show()

+--------+
|max(age)|
+--------+
|      95|
+--------+

+--------+
|min(age)|
+--------+
|      18|
+--------+

+------------------+
|round(avg(age), 0)|
+------------------+
|              41.0|
+------------------+



In [22]:
#spark.sql - alternative method
spark.sql("select max(age), min(age), round(mean(age)) mean from banktable").show()

+--------+--------+----+
|max(age)|min(age)|mean|
+--------+--------+----+
|      95|      18|41.0|
+--------+--------+----+



#### 2. Check tge quality of customers by analyzing targeted customers min, max, mean, median, balance.

In [32]:
bank_customer_data.select(max("balance")).show()
bank_customer_data.select(min("balance")).show()
bank_customer_data.select(round(mean("balance"))).show()

+------------+
|max(balance)|
+------------+
|      102127|
+------------+

+------------+
|min(balance)|
+------------+
|       -8019|
+------------+

+----------------------+
|round(avg(balance), 0)|
+----------------------+
|                1362.0|
+----------------------+



In [31]:
spark.sql("select max(balance), min(abs(balance)), round(mean(balance)) mean, percentile_approx(balance,0.5) median from banktable").show()

+------------+-----------------+------+------+
|max(balance)|min(abs(balance))|  mean|median|
+------------+-----------------+------+------+
|      102127|                0|1362.0|   448|
+------------+-----------------+------+------+



#### 3. Check if age matters in marketing subscription for term Deposit scheme.

In [34]:
spark.sql("select age, count(*) as customer_count from banktable where y = 'yes' group by age order by \
          customer_count desc").show()

+---+--------------+
|age|customer_count|
+---+--------------+
| 32|           221|
| 30|           217|
| 33|           210|
| 35|           209|
| 31|           206|
| 34|           198|
| 36|           195|
| 29|           171|
| 37|           170|
| 28|           162|
| 38|           144|
| 39|           143|
| 27|           141|
| 26|           134|
| 41|           120|
| 46|           118|
| 40|           116|
| 25|           113|
| 47|           113|
| 42|           111|
+---+--------------+
only showing top 20 rows



#### 4. Calculate the marketing success rate

In [44]:
spark.sql("select (select count(*) from banktable where y = 'yes') / count(*) * 100 success_rate from banktable ").show()

+------------------+
|      success_rate|
+------------------+
|11.698480458295547|
+------------------+



#### 5. Calculate marketing failure rate

In [46]:
spark.sql("select round((select count(*) from banktable where y = 'no') / count(*) * 100,3) failure_rate from banktable ").show()

+------------+
|failure_rate|
+------------+
|      88.302|
+------------+



#### 6. check if marital status matters in marketing subscription for term deposit scheme

In [54]:
spark.sql("""
select marital, count(*) as customer_count from banktable 
where y = 'yes' 
group by marital 
order by customer_count desc
          """).show()

+--------+--------------+
| marital|customer_count|
+--------+--------------+
| married|          2755|
|  single|          1912|
|divorced|           622|
+--------+--------------+



#### 8. check if age and marital status together mattered for subscription to term deposit scheme

In [55]:
spark.sql("""select age,marital, count(*) as customer_count from banktable 
where y = 'yes' 
group by age, marital 
order by customer_count desc
          """).show()

+---+-------+--------------+
|age|marital|customer_count|
+---+-------+--------------+
| 30| single|           151|
| 28| single|           138|
| 29| single|           133|
| 32| single|           124|
| 26| single|           121|
| 34|married|           118|
| 31| single|           111|
| 27| single|           110|
| 35|married|           101|
| 36|married|           100|
| 25| single|            99|
| 37|married|            98|
| 33|married|            97|
| 33| single|            97|
| 39|married|            87|
| 32|married|            87|
| 38|married|            86|
| 35| single|            84|
| 47|married|            83|
| 46|married|            80|
+---+-------+--------------+
only showing top 20 rows



#### 9. Compute success rate of each age and status category

In [61]:
spark.sql("""
    select age,marital, count(*) as total,
        SUM(case when y = 'yes' then 1 else 0 end) as count_success,
        SUM(case when y = 'yes' then 1 else 0 end)*100 / count(*) as success_percentage
    from banktable
group by age, marital order by success_percentage desc
""").show()

+---+--------+-----+-------------+------------------+
|age| marital|total|count_success|success_percentage|
+---+--------+-----+-------------+------------------+
| 68|divorced|    6|            6|             100.0|
| 95|divorced|    1|            1|             100.0|
| 93| married|    2|            2|             100.0|
| 92| married|    2|            2|             100.0|
| 90|divorced|    2|            2|             100.0|
| 85|divorced|    1|            1|             100.0|
| 87|divorced|    1|            1|             100.0|
| 86|  single|    1|            1|             100.0|
| 67|divorced|    8|            7|              87.5|
| 62|divorced|    6|            5| 83.33333333333333|
| 85| married|    4|            3|              75.0|
| 76|divorced|    8|            6|              75.0|
| 71|divorced|   11|            8| 72.72727272727273|
| 87| married|    3|            2| 66.66666666666667|
| 84| married|    6|            4| 66.66666666666667|
| 73|divorced|    6|        

In [82]:
spark.sql("""
    select age,marital, count(*) as total,
        (select count(*) from banktable where y = 'yes')/count(*) *100)

    from banktable
group by age, marital
""").show()

ParseException: "\nmismatched input ')' expecting <EOF>(line 3, pos 70)\n\n== SQL ==\n\n    select age,marital, count(*) as total,\n        (select count(*) from banktable where y = 'yes')/count(*) *100)\n----------------------------------------------------------------------^^^\n\n    from banktable\ngroup by age, marital\n"

#### 10. Do features engineering for Bank Investment scheme and find Effect of Age on the champaign

In [78]:
spark.sql("""
    select age_category,count(*) as subscriber_count from (select case when age < 25 then 'Teenager'
                when age >= 25 and age <= 35 then 'Adult'
                when age > 35 and age <= 55 then 'Middle Aged'
                else 'old' 
            end as age_category 
    from banktable where y = 'yes') group by age_category order by subscriber_count desc
""").show()

+------------+----------------+
|age_category|subscriber_count|
+------------+----------------+
| Middle Aged|            2194|
|       Adult|            1982|
|         old|             906|
|    Teenager|             207|
+------------+----------------+



#### Alternative Method : Create UDF

In [83]:
from pyspark.sql.functions import udf

In [87]:
age_range = udf(lambda age : 'Teenager' if age < 20 else 
                             'Adult' if (age >= 25 and age <= 35) else
                            'Middle Aged' if (age >= 35 and age < 55) else
                'Old'
               )

In [88]:
bank_customer_DF = bank_customer_data.withColumn('age_category', age_range(bank_customer_data.age))

In [91]:
bank_customer_DF.createOrReplaceTempView('newbanktable')

In [96]:
spark.sql("""
    select age_category, count(*) as success_count from newbanktable 
    group by age_category
    order by success_count desc
""").show()

+------------+-------------+
|age_category|success_count|
+------------+-------------+
| Middle Aged|        22598|
|       Adult|        16098|
|         Old|         6468|
|    Teenager|           47|
+------------+-------------+



In [89]:
bank_customer_DF.show()

+---+-------+--------+-------+---+-------+--------+---------+-------+------------+----+--------+-----+-----+--------+--------+---+------------+
|age|balance|campaign|contact|day|default|duration|education|housing|         job|loan| marital|month|pdays|poutcome|previous|  y|age_category|
+---+-------+--------+-------+---+-------+--------+---------+-------+------------+----+--------+-----+-----+--------+--------+---+------------+
| 58|   2143|       1|unknown|  5|     no|     261| tertiary|    yes|  management|  no| married|  may|   -1| unknown|       0| no|         Old|
| 44|     29|       1|unknown|  5|     no|     151|secondary|    yes|  technician|  no|  single|  may|   -1| unknown|       0| no| Middle Aged|
| 33|      2|       1|unknown|  5|     no|      76|secondary|    yes|entrepreneur| yes| married|  may|   -1| unknown|       0| no|       Adult|
| 47|   1506|       1|unknown|  5|     no|      92|  unknown|    yes| blue-collar|  no| married|  may|   -1| unknown|       0| no| Middl

In [90]:
spark.sql('show databases').show()

+------------+
|databaseName|
+------------+
|     default|
+------------+



#### write a query to display distributed probablity for each category