In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.appName('cau1').getOrCreate()

In [4]:
df = spark.read.csv('Churn_Modelling.csv',inferSchema=True,header=True)

### Explore data

In [5]:
df.count()

10000

In [6]:
df.printSchema()

root
 |-- RowNumber: integer (nullable = true)
 |-- CustomerId: integer (nullable = true)
 |-- Surname: string (nullable = true)
 |-- CreditScore: integer (nullable = true)
 |-- Geography: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tenure: integer (nullable = true)
 |-- Balance: double (nullable = true)
 |-- NumOfProducts: integer (nullable = true)
 |-- HasCrCard: integer (nullable = true)
 |-- IsActiveMember: integer (nullable = true)
 |-- EstimatedSalary: double (nullable = true)
 |-- Exited: integer (nullable = true)



In [7]:
df.show(5)

+---------+----------+--------+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+
|RowNumber|CustomerId| Surname|CreditScore|Geography|Gender|Age|Tenure|  Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|
+---------+----------+--------+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+
|        1|  15634602|Hargrave|        619|   France|Female| 42|     2|      0.0|            1|        1|             1|      101348.88|     1|
|        2|  15647311|    Hill|        608|    Spain|Female| 41|     1| 83807.86|            1|        0|             1|      112542.58|     0|
|        3|  15619304|    Onio|        502|   France|Female| 42|     8| 159660.8|            3|        1|             0|      113931.57|     1|
|        4|  15701354|    Boni|        699|   France|Female| 39|     1|      0.0|            2|        0|             0|       93826.63|

In [8]:
df.describe().show()

+-------+------------------+-----------------+-------+-----------------+---------+------+------------------+------------------+-----------------+------------------+-------------------+-------------------+-----------------+-------------------+
|summary|         RowNumber|       CustomerId|Surname|      CreditScore|Geography|Gender|               Age|            Tenure|          Balance|     NumOfProducts|          HasCrCard|     IsActiveMember|  EstimatedSalary|             Exited|
+-------+------------------+-----------------+-------+-----------------+---------+------+------------------+------------------+-----------------+------------------+-------------------+-------------------+-----------------+-------------------+
|  count|             10000|            10000|  10000|            10000|    10000| 10000|             10000|             10000|            10000|             10000|              10000|              10000|            10000|              10000|
|   mean|            5000.5|

### Preprocessing

In [9]:
# bỏ bớt các cột không cần thiết
df= df.select('CreditScore','Geography','Gender','Age','Tenure','Balance','NumOfProducts','HasCrCard','IsActiveMember','EstimatedSalary','Exited')

In [10]:
num_dist_rows = df.distinct().count()

In [11]:
# dự liệu không bị trùng
num_dist_rows

10000

In [12]:
# check null, nan
from pyspark.sql.functions import isnan, when, count, col
df.select([count(when(isnan(c), c)).alias(c) for c in df.columns]).toPandas().T

Unnamed: 0,0
CreditScore,0
Geography,0
Gender,0
Age,0
Tenure,0
Balance,0
NumOfProducts,0
HasCrCard,0
IsActiveMember,0
EstimatedSalary,0


In [13]:
df.select([count(when(col(c).isNull(), c)).alias(c) for c in df.columns]).toPandas().T

Unnamed: 0,0
CreditScore,0
Geography,0
Gender,0
Age,0
Tenure,0
Balance,0
NumOfProducts,0
HasCrCard,0
IsActiveMember,0
EstimatedSalary,0


In [14]:
def min_max_scaler(df, cols_to_scale, prefix= 'mm_'):
    for col in cols_to_scale:
        max_values = df.agg({col: 'max'}).collect()[0][0]
        min_values = df.agg({col: 'min'}).collect()[0][0]
        new_column_name = prefix + col
        df = df.withColumn(new_column_name, 
                      (df[col] - min_values) / (max_values - min_values))
    return df
# Normalize các cột CreditScore, Age, Balance, EstimatedSalary
df = min_max_scaler(df, cols_to_scale=['CreditScore','Age','Balance','EstimatedSalary'])

In [15]:
# bỏ các cột ko liên quan và cột Gender, Geography, các cột chưa scale
df= df.select('mm_CreditScore','Geography','Gender','mm_Age','Tenure','mm_Balance','NumOfProducts','HasCrCard','IsActiveMember','mm_EstimatedSalary','Exited')

In [16]:
# train test split
train_df, test_df = df.randomSplit([0.7, 0.3])

### tạo pipeline

In [17]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import VectorAssembler

In [18]:
indexer1 = StringIndexer(inputCol='Gender', outputCol='Gender_idx')

In [19]:
indexer2 = StringIndexer(inputCol='Geography', outputCol='Geography_idx')

In [20]:
encoder = OneHotEncoder(inputCol='Geography_idx',outputCol='Geography_dummy')

In [21]:
assembler = VectorAssembler(inputCols=[
    'mm_CreditScore','Geography_dummy','Gender_idx','mm_Age','Tenure','mm_Balance','NumOfProducts','HasCrCard','IsActiveMember','mm_EstimatedSalary'
    ],outputCol='features')

In [22]:
# đầu tiên thử chọn thuật toán Logistic Regression
log_reg_customer = LogisticRegression(featuresCol='features', labelCol='Exited',predictionCol='prediction')

In [23]:
pipeline = Pipeline(stages=[indexer1, indexer2, encoder, assembler,log_reg_customer])
fit_model = pipeline.fit(train_df)
results = fit_model.transform(test_df)

In [24]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [25]:
my_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',
 labelCol='Exited')

In [26]:
results.select('Exited','prediction').show()

+------+----------+
|Exited|prediction|
+------+----------+
|     1|       0.0|
|     1|       1.0|
|     1|       0.0|
|     1|       0.0|
|     1|       1.0|
|     1|       0.0|
|     0|       0.0|
|     1|       0.0|
|     0|       1.0|
|     0|       0.0|
|     1|       0.0|
|     0|       0.0|
|     1|       0.0|
|     0|       0.0|
|     0|       0.0|
|     0|       0.0|
|     0|       0.0|
|     1|       0.0|
|     0|       0.0|
|     0|       0.0|
+------+----------+
only showing top 20 rows



In [27]:
AUC = my_eval.evaluate(results)
AUC

0.5829906705200069

##### Tỷ lệ chính xác của model đạt 0.59 , tương đối thấp

In [29]:
fit_model.save('Pipeline_customer')

In [30]:
from pyspark.ml import PipelineModel

In [31]:
pipeline2 = PipelineModel.load('Pipeline_customer')

In [32]:
# thử dự đoán khách hàng đề bài cho có exit hay ko

df_given_customer = spark.createDataFrame([
    (600,'France','Male',40,3,60000,2,1,1,50000)
], schema='mm_CreditScore long,Geography string,Gender string,mm_Age long,Tenure long,mm_Balance long,NumOfProducts long,HasCrCard long,IsActiveMember long,mm_EstimatedSalary long')

In [33]:
df.show(5)

+--------------+---------+------+-------------------+------+-------------------+-------------+---------+--------------+------------------+------+
|mm_CreditScore|Geography|Gender|             mm_Age|Tenure|         mm_Balance|NumOfProducts|HasCrCard|IsActiveMember|mm_EstimatedSalary|Exited|
+--------------+---------+------+-------------------+------+-------------------+-------------+---------+--------------+------------------+------+
|         0.538|   France|Female|0.32432432432432434|     2|                0.0|            1|        1|             1|0.5067348931822989|     1|
|         0.516|    Spain|Female| 0.3108108108108108|     1|0.33403147867725896|            1|        0|             1|0.5627087386845443|     0|
|         0.304|   France|Female|0.32432432432432434|     8| 0.6363571759354565|            3|        1|             0|0.5696543519906151|     1|
|         0.698|   France|Female|0.28378378378378377|     1|                0.0|            2|        0|             0|0.469

In [34]:
predict = fit_model.transform(df_given_customer)

In [35]:
predict.select('prediction').show()

+----------+
|prediction|
+----------+
|       1.0|
+----------+



##### logistic regression dự đoán là khách này sẽ exit

In [36]:
# Sử dụng decisionTree
from pyspark.ml.classification import DecisionTreeClassifier
decision_tree_classifier = DecisionTreeClassifier(labelCol='Exited',featuresCol='features')

In [37]:
pipeline = Pipeline(stages=[indexer1, indexer2, encoder, assembler,decision_tree_classifier])
fit_model = pipeline.fit(train_df)
results = fit_model.transform(test_df)

In [38]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
acc_evaluator = MulticlassClassificationEvaluator(labelCol="Exited",predictionCol="prediction",metricName="accuracy")

In [39]:
dtc_acc = acc_evaluator.evaluate(results)

In [40]:
print('A single decision tree - accuracy: {0:2.2f}%'.format(dtc_acc*100))

A single decision tree - accuracy: 85.80%


In [41]:
AUC = my_eval.evaluate(results)
AUC

0.6642190751429071

##### decision tree cho độ chính xác cao hơn nên better chọn decision tree, tuy nhiên không model nào đạt AUC 0.8+

In [42]:
predict = fit_model.transform(df_given_customer)

In [43]:
predict.select('prediction').show()

+----------+
|prediction|
+----------+
|       0.0|
+----------+



##### decision tree dự đoán là khách này sẽ không exit