In [1]:
!pip install pyspark



In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, count

# Build the Spark session used by every later cell. `getOrCreate()` returns
# the already-active session if one exists instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("CODTECH Big Data Analysis - Bank Churn")
    .getOrCreate()
)


In [3]:
# Load the churn CSV. The first line is treated as the header row, and column
# types are inferred from the data instead of being read as plain strings.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("Bank_Churn.csv")
)

# Preview the first five rows.
df.show(n=5)


+----------+--------+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+
|CustomerId| Surname|CreditScore|Geography|Gender|Age|Tenure|  Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|
+----------+--------+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+
|  15634602|Hargrave|        619|   France|Female| 42|     2|      0.0|            1|        1|             1|      101348.88|     1|
|  15647311|    Hill|        608|    Spain|Female| 41|     1| 83807.86|            1|        0|             1|      112542.58|     0|
|  15619304|    Onio|        502|   France|Female| 42|     8| 159660.8|            3|        1|             0|      113931.57|     1|
|  15701354|    Boni|        699|   France|Female| 39|     1|      0.0|            2|        0|             0|       93826.63|     0|
|  15737888|Mitchell|        850|    Spain|Female| 43|     2|1

In [4]:
# Print the column names and the types Spark inferred for them while reading the CSV.
df.printSchema()
# Number of rows in the raw DataFrame; as the last expression of the cell it
# becomes the cell's output.
df.count()


root
 |-- CustomerId: integer (nullable = true)
 |-- Surname: string (nullable = true)
 |-- CreditScore: integer (nullable = true)
 |-- Geography: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tenure: integer (nullable = true)
 |-- Balance: double (nullable = true)
 |-- NumOfProducts: integer (nullable = true)
 |-- HasCrCard: integer (nullable = true)
 |-- IsActiveMember: integer (nullable = true)
 |-- EstimatedSalary: double (nullable = true)
 |-- Exited: integer (nullable = true)



10000

In [6]:
# Keep only complete rows: with the default settings, a row is dropped as soon
# as any of its columns is null. The raw `df` is left untouched.
df_clean = df.na.drop()


In [7]:
# Row count after dropping rows with nulls; comparing it with the raw row count
# shows how many rows were removed.
df_clean.count()

10000

In [8]:
# Mean of the Age column over the whole cleaned dataset (no grouping).
df_clean.agg(avg("Age")).show()


+--------+
|avg(Age)|
+--------+
| 38.9218|
+--------+



In [9]:
# Number of customers per value of the Exited flag.
(
    df_clean
    .groupBy("Exited")
    .agg(count("*").alias("count"))
    .show()
)


+------+-----+
|Exited|count|
+------+-----+
|     1| 2037|
|     0| 7963|
+------+-----+



In [10]:
# Mean account balance for each value of the Exited flag.
(
    df_clean
    .groupBy("Exited")
    .avg("Balance")
    .show()
)


+------+-----------------+
|Exited|     avg(Balance)|
+------+-----------------+
|     1|91108.53933726063|
|     0|72745.29677885193|
+------+-----------------+



In [11]:
# Number of customers in each Geography value.
(
    df_clean
    .groupBy("Geography")
    .count()
    .show()
)


+---------+-----+
|Geography|count|
+---------+-----+
|  Germany| 2509|
|   France| 5014|
|    Spain| 2477|
+---------+-----+



Insights Derived from Big Data Analysis:

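The dataset contains 10,000 bank customer records, and the row count is unchanged after `dropna()`, so no row has a missing value. Although this dataset is small, the same PySpark operations scale to much larger datasets.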

The average customer age is about 38.9 years, which suggests that the bank’s customer base is centered on adults in their late thirties.

Of the 10,000 customers, 2,037 (about 20.4%) exited and 7,963 (about 79.6%) were retained, so churned customers are a clear minority.

Customers who exited held a higher average balance (about 91,109) than customers who stayed (about 72,745).

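Geographic analysis shows that France accounts for about half of the customers (5,014), while Germany (2,509) and Spain (2,477) each account for roughly a quarter.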

PySpark enables scalable and efficient analysis of large datasets.

In [12]:
# Shut down the Spark session now that the analysis is finished. Cells that use
# `spark`, `df` or `df_clean` cannot be re-run after this without first
# re-running the session-creation cell.
spark.stop()
