<a href="https://colab.research.google.com/github/moatazkrimchi/Spark/blob/main/Spark_credit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install pyspark



In [None]:
from pyspark.sql import SparkSession

In [None]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("credit")
    .getOrCreate()
)

In [None]:
# Bare expression as the last line of the cell: the notebook renders the session object.
spark

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): the CSV is later read through the relative path "datasets.csv",
# not a /content/drive/... path — confirm whether this mount is actually required.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the credit dataset: the first row is the header, and column types are
# inferred from the data rather than all being read as strings.
df = spark.read.csv(
    "datasets.csv",
    header=True,
    inferSchema=True,
)

In [None]:
# Display the DataFrame repr (column names with their inferred types).
df

DataFrame[Credit_History: double, Gender_Male: int, Married_Yes: int, Dependents_1: int, Dependents_2: int, Dependents_3+: int, Education_Not Graduate: int, Self_Employed_Yes: int, Property_Area_Semiurban: int, Property_Area_Urban: int, Loan_Status_Y: int, ApplicantIncome: int, CoapplicantIncome: double, LoanAmount: double, Loan_Amount_Term: double]

In [None]:
# Print the inferred schema as a tree, one line per column with its type and nullability.
df.printSchema()

root
 |-- Credit_History: double (nullable = true)
 |-- Gender_Male: integer (nullable = true)
 |-- Married_Yes: integer (nullable = true)
 |-- Dependents_1: integer (nullable = true)
 |-- Dependents_2: integer (nullable = true)
 |-- Dependents_3+: integer (nullable = true)
 |-- Education_Not Graduate: integer (nullable = true)
 |-- Self_Employed_Yes: integer (nullable = true)
 |-- Property_Area_Semiurban: integer (nullable = true)
 |-- Property_Area_Urban: integer (nullable = true)
 |-- Loan_Status_Y: integer (nullable = true)
 |-- ApplicantIncome: integer (nullable = true)
 |-- CoapplicantIncome: double (nullable = true)
 |-- LoanAmount: double (nullable = true)
 |-- Loan_Amount_Term: double (nullable = true)



In [None]:
# Per-column summary statistics, printed as a table.
df.describe().show()

+-------+------------------+------------------+------------------+-------------------+-------------------+-------------------+----------------------+-------------------+-----------------------+-------------------+-------------------+-----------------+------------------+------------------+-----------------+
|summary|    Credit_History|       Gender_Male|       Married_Yes|       Dependents_1|       Dependents_2|      Dependents_3+|Education_Not Graduate|  Self_Employed_Yes|Property_Area_Semiurban|Property_Area_Urban|      Loan_Status_Y|  ApplicantIncome| CoapplicantIncome|        LoanAmount| Loan_Amount_Term|
+-------+------------------+------------------+------------------+-------------------+-------------------+-------------------+----------------------+-------------------+-----------------------+-------------------+-------------------+-----------------+------------------+------------------+-----------------+
|  count|               614|               614|               614|          

In [None]:
# List the column names (used below to choose the model's input features).
df.columns

['Credit_History',
 'Gender_Male',
 'Married_Yes',
 'Dependents_1',
 'Dependents_2',
 'Dependents_3+',
 'Education_Not Graduate',
 'Self_Employed_Yes',
 'Property_Area_Semiurban',
 'Property_Area_Urban',
 'Loan_Status_Y',
 'ApplicantIncome',
 'CoapplicantIncome',
 'LoanAmount',
 'Loan_Amount_Term']

In [None]:
from pyspark.ml.feature import VectorAssembler

In [None]:
# Pack the selected predictor columns into a single vector column named
# 'features'. Only these six columns are used; the remaining columns of df
# are not part of the model input.
assembler = VectorAssembler(
    inputCols=[
        'Credit_History',
        'Education_Not Graduate',
        'Self_Employed_Yes',
        'ApplicantIncome',
        'LoanAmount',
        'Loan_Amount_Term',
    ],
    outputCol='features',
)

In [None]:
# Apply the assembler: returns df with the additional 'features' vector column.
output = assembler.transform(df)

In [None]:
# Keep only what the classifier needs: the feature vector and the label column.
df_final = output.select(['features', 'Loan_Status_Y'])

In [None]:
# Preview the first 10 rows of the modelling table (long vectors are truncated in the display).
df_final.show(10)

+--------------------+-------------+
|            features|Loan_Status_Y|
+--------------------+-------------+
|[1.0,0.0,0.0,5849...|            1|
|[1.0,0.0,0.0,4583...|            0|
|[1.0,0.0,1.0,3000...|            1|
|[1.0,1.0,0.0,2583...|            1|
|[1.0,0.0,0.0,6000...|            1|
|[1.0,0.0,1.0,5417...|            1|
|[1.0,1.0,0.0,2333...|            1|
|[0.0,0.0,0.0,3036...|            0|
|[1.0,0.0,0.0,4006...|            1|
|[1.0,0.0,0.0,1284...|            0|
+--------------------+-------------+
only showing top 10 rows



In [None]:
# Random 70/30 train/test split with a fixed seed for repeatability.
train, test = df_final.randomSplit(weights=[0.7, 0.3], seed=42)

In [None]:
from pyspark.ml.classification import LogisticRegression

In [None]:
# Logistic-regression estimator: reads the assembled 'features' vector
# (the estimator's default features column) and predicts 'Loan_Status_Y'.
LR = LogisticRegression(
    featuresCol="features",
    labelCol="Loan_Status_Y",
)

In [None]:
# Fit the estimator on the training split; LRm is the fitted model.
LRm = LR.fit(train)

In [None]:
# Display the training summary object attached to the fitted model.
LRm.summary

<pyspark.ml.classification.BinaryLogisticRegressionTrainingSummary at 0x7f548d31c150>

In [None]:
# Keep a handle on the training summary for the inspection cells that follow.
LRm_summary = LRm.summary

In [None]:
# Show the model's output on the training data: label, rawPrediction,
# probability and the hard prediction for each row.
LRm_summary.predictions.show()



+--------------------+-------------+--------------------+--------------------+----------+
|            features|Loan_Status_Y|       rawPrediction|         probability|prediction|
+--------------------+-------------+--------------------+--------------------+----------+
|[0.0,0.0,0.0,1500...|          0.0|[4.10213047304043...|[0.98373163126029...|       0.0|
|[0.0,0.0,0.0,1828...|          0.0|[4.07973337713199...|[0.98336928406759...|       0.0|
|[0.0,0.0,0.0,2138...|          0.0|[4.06456183230965...|[0.98311933840009...|       0.0|
|[0.0,0.0,0.0,2221...|          0.0|[3.93395511492413...|[0.98080935372697...|       0.0|
|[0.0,0.0,0.0,2237...|          0.0|[4.54915969974463...|[0.98953459529814...|       0.0|
|[0.0,0.0,0.0,2483...|          1.0|[3.11288266211774...|[0.95742102547792...|       0.0|
|[0.0,0.0,0.0,2873...|          0.0|[4.14415335912995...|[0.98439065971904...|       0.0|
|[0.0,0.0,0.0,2987...|          0.0|[3.99602038120496...|[0.98194336420020...|       0.0|
|[0.0,0.0,

In [None]:
# Summary statistics of the training labels versus the predicted classes;
# comparing the two means shows how often class 1 is predicted relative to how often it occurs.
LRm_summary.predictions.describe().show()



+-------+-------------------+-------------------+
|summary|      Loan_Status_Y|         prediction|
+-------+-------------------+-------------------+
|  count|                456|                456|
|   mean| 0.6951754385964912| 0.8771929824561403|
| stddev|0.46083866683033564|0.32857603872676244|
|    min|                0.0|                0.0|
|    max|                1.0|                1.0|
+-------+-------------------+-------------------+



In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [None]:
# Evaluate the fitted model on the held-out test split. The result is a summary
# object; its `.predictions` DataFrame holds the per-row model output.
pred_labels = LRm.evaluate(test)

In [None]:
# Show the per-row predictions on the test split.
pred_labels.predictions.show()



+--------------------+-------------+--------------------+--------------------+----------+
|            features|Loan_Status_Y|       rawPrediction|         probability|prediction|
+--------------------+-------------+--------------------+--------------------+----------+
|[0.0,0.0,0.0,2137...|            1|[4.18875248459268...|[0.98506135552231...|       0.0|
|[0.0,0.0,0.0,2400...|            0|[4.07083653338457...|[0.98322315642958...|       0.0|
|[0.0,0.0,0.0,2645...|            0|[4.11370286483207...|[0.98391579943379...|       0.0|
|[0.0,0.0,0.0,2787...|            0|[4.19319616765902...|[0.98512660570626...|       0.0|
|[0.0,0.0,0.0,3062...|            0|[3.15925880630757...|[0.95927199841610...|       0.0|
|[0.0,0.0,0.0,3087...|            0|[4.14900415459179...|[0.98446502066219...|       0.0|
|[0.0,0.0,0.0,3089...|            0|[4.09991988461592...|[0.98369621579015...|       0.0|
|[0.0,0.0,0.0,3340...|            0|[4.18502894722159...|[0.98500646279671...|       0.0|
|[0.0,0.0,

In [None]:
# Evaluator for the binary classifier, reporting the area under the ROC curve.
# It must be fed the model's continuous scores (the 'rawPrediction' column):
# passing the hard 0/1 'prediction' column reduces the ROC curve to a single
# operating point and misstates the AUC.
# NOTE: the name `eval` shadows Python's builtin eval(); it is kept because the
# following cell refers to it.
eval = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="Loan_Status_Y",
    metricName="areaUnderROC",
)

In [None]:
# Score the test-set predictions. BinaryClassificationEvaluator's default metric
# is areaUnderROC, so despite the name `ac` this value is an AUC, not an accuracy.
ac = eval.evaluate(pred_labels.predictions)



In [None]:
# Display the evaluation metric computed in the previous cell.
ac

0.7260557053009884

In [None]:
# List the column names again as a reference for the SQL queries below.
df.columns

['Credit_History',
 'Gender_Male',
 'Married_Yes',
 'Dependents_1',
 'Dependents_2',
 'Dependents_3+',
 'Education_Not Graduate',
 'Self_Employed_Yes',
 'Property_Area_Semiurban',
 'Property_Area_Urban',
 'Loan_Status_Y',
 'ApplicantIncome',
 'CoapplicantIncome',
 'LoanAmount',
 'Loan_Amount_Term']

In [None]:
# Register the DataFrame as a temporary SQL view so it can be queried with spark.sql.
df.createOrReplaceTempView("Credit")

# All applicants with a positive credit history. Credit_History is a double
# column, so compare it with a numeric literal rather than the string '1'
# (which relied on an implicit string-to-number cast).
sqlDF = spark.sql("SELECT * FROM Credit WHERE Credit_History = 1.0")
sqlDF.show()


+--------------+-----------+-----------+------------+------------+-------------+----------------------+-----------------+-----------------------+-------------------+-------------+---------------+-----------------+----------+----------------+
|Credit_History|Gender_Male|Married_Yes|Dependents_1|Dependents_2|Dependents_3+|Education_Not Graduate|Self_Employed_Yes|Property_Area_Semiurban|Property_Area_Urban|Loan_Status_Y|ApplicantIncome|CoapplicantIncome|LoanAmount|Loan_Amount_Term|
+--------------+-----------+-----------+------------+------------+-------------+----------------------+-----------------+-----------------------+-------------------+-------------+---------------+-----------------+----------+----------------+
|           1.0|          1|          0|           0|           0|            0|                     0|                0|                      0|                  1|            1|           5849|              0.0|     128.0|           360.0|
|           1.0|          1|    

In [None]:
# Income and credit-history flag of applicants earning more than 4000.
# ApplicantIncome is an integer column, so compare it with a numeric literal
# rather than the string '4000' (which relied on an implicit cast).
sqlDF1 = spark.sql("SELECT ApplicantIncome, Credit_History FROM Credit WHERE ApplicantIncome > 4000")
sqlDF1.show()


+---------------+--------------+
|ApplicantIncome|Credit_History|
+---------------+--------------+
|           5849|           1.0|
|           4583|           1.0|
|           6000|           1.0|
|           5417|           1.0|
|           4006|           1.0|
|          12841|           1.0|
|           4950|           1.0|
|           4887|           1.0|
|           7660|           0.0|
|           5955|           1.0|
|           9560|           1.0|
|           4226|           1.0|
|           4166|           1.0|
|           4692|           1.0|
|          12500|           1.0|
|           4166|           1.0|
|           4695|           1.0|
|           5649|           1.0|
|           5821|           1.0|
|           4230|           1.0|
+---------------+--------------+
only showing top 20 rows



In [None]:
# Same income filter as above, with the loan outcome added to the projection.
# ApplicantIncome is an integer column, so compare it with a numeric literal
# rather than the string '4000' (which relied on an implicit cast).
sqlDF2 = spark.sql("SELECT ApplicantIncome, Credit_History, Loan_Status_Y FROM Credit WHERE ApplicantIncome > 4000")
sqlDF2.show()


+---------------+--------------+-------------+
|ApplicantIncome|Credit_History|Loan_Status_Y|
+---------------+--------------+-------------+
|           5849|           1.0|            1|
|           4583|           1.0|            0|
|           6000|           1.0|            1|
|           5417|           1.0|            1|
|           4006|           1.0|            1|
|          12841|           1.0|            0|
|           4950|           1.0|            1|
|           4887|           1.0|            0|
|           7660|           0.0|            0|
|           5955|           1.0|            1|
|           9560|           1.0|            1|
|           4226|           1.0|            1|
|           4166|           1.0|            0|
|           4692|           1.0|            0|
|          12500|           1.0|            0|
|           4166|           1.0|            1|
|           4695|           1.0|            1|
|           5649|           1.0|            1|
|           5

In [None]:
# Income, credit history and loan outcome for applicants earning more than 4000.
# NOTE(review): this query is identical to the one that builds sqlDF2 — confirm
# whether a different filter was intended here.
sqlDF3 = spark.sql("SELECT ApplicantIncome, Credit_History, Loan_Status_Y FROM Credit WHERE ApplicantIncome > 4000")
# Show the DataFrame built in this cell (the original displayed the unrelated sqlDF).
sqlDF3.show()

+--------------+-----------+-----------+------------+------------+-------------+----------------------+-----------------+-----------------------+-------------------+-------------+---------------+-----------------+----------+----------------+
|Credit_History|Gender_Male|Married_Yes|Dependents_1|Dependents_2|Dependents_3+|Education_Not Graduate|Self_Employed_Yes|Property_Area_Semiurban|Property_Area_Urban|Loan_Status_Y|ApplicantIncome|CoapplicantIncome|LoanAmount|Loan_Amount_Term|
+--------------+-----------+-----------+------------+------------+-------------+----------------------+-----------------+-----------------------+-------------------+-------------+---------------+-----------------+----------+----------------+
|           1.0|          1|          0|           0|           0|            0|                     0|                0|                      0|                  1|            1|           5849|              0.0|     128.0|           360.0|
|           1.0|          1|    