<a href="https://colab.research.google.com/github/luasampaio/data-engineering/blob/main/38_pipeline_ingestion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [32]:
# Import functions
from pyspark import *
from pyspark.sql import functions as F
from pyspark.sql.functions import *
import datetime

In [33]:
from pyspark.sql import SparkSession
from pyspark import SparkContext

spark = SparkSession \
          .builder \
          .appName("Analysis_with_Spark") \
          .getOrCreate()

import pandas as pd
from google.colab import drive
drive.mount('/content/drive')

df = spark.read.csv('/content/drive/MyDrive/.Dataset/Credit2.csv', sep=';', encoding='UTF-8', header=True, inferSchema=True)
df

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


DataFrame[ID: int, checking_status: string, credit_history: string, duration: int, credit_amount: int, installment_commitment: int, residence_since: int, age: int, existing_credits: int, num_dependents: int, class: string]

In [34]:
display(df)

DataFrame[ID: int, checking_status: string, credit_history: string, duration: int, credit_amount: int, installment_commitment: int, residence_since: int, age: int, existing_credits: int, num_dependents: int, class: string]

In [35]:
df.show()

+---+---------------+--------------------+--------+-------------+----------------------+---------------+---+----------------+--------------+-----+
| ID|checking_status|      credit_history|duration|credit_amount|installment_commitment|residence_since|age|existing_credits|num_dependents|class|
+---+---------------+--------------------+--------+-------------+----------------------+---------------+---+----------------+--------------+-----+
|  1|             <0|critical/other ex...|       6|         1169|                     4|              4| 67|               2|             1| good|
|  2|       0<=X<200|       existing paid|      48|         5951|                     2|              2| 22|               1|             1|  bad|
|  3|    no checking|critical/other ex...|      12|         2096|                     2|              3| 49|               1|             2| good|
|  4|             <0|       existing paid|      42|         7882|                     2|              4| 45|          

In [36]:
df.columns

['ID',
 'checking_status',
 'credit_history',
 'duration',
 'credit_amount',
 'installment_commitment',
 'residence_since',
 'age',
 'existing_credits',
 'num_dependents',
 'class']

In [37]:
df_existente = df.filter(df["credit_history"] != 'no checking') # Apply filter to the DataFrame
df_existente.show() # Now show the filtered DataFrame

+---+---------------+--------------------+--------+-------------+----------------------+---------------+---+----------------+--------------+-----+
| ID|checking_status|      credit_history|duration|credit_amount|installment_commitment|residence_since|age|existing_credits|num_dependents|class|
+---+---------------+--------------------+--------+-------------+----------------------+---------------+---+----------------+--------------+-----+
|  1|             <0|critical/other ex...|       6|         1169|                     4|              4| 67|               2|             1| good|
|  2|       0<=X<200|       existing paid|      48|         5951|                     2|              2| 22|               1|             1|  bad|
|  3|    no checking|critical/other ex...|      12|         2096|                     2|              3| 49|               1|             2| good|
|  4|             <0|       existing paid|      42|         7882|                     2|              4| 45|          

In [38]:
df_existente.count()

1000

In [39]:
df_existente.groupBy('credit_history').count().show()

+--------------------+-----+
|      credit_history|count|
+--------------------+-----+
|            all paid|   49|
| no credits/all paid|   40|
|critical/other ex...|  293|
|  delayed previously|   88|
|       existing paid|  530|
+--------------------+-----+



In [40]:
df_existente.show()

+---+---------------+--------------------+--------+-------------+----------------------+---------------+---+----------------+--------------+-----+
| ID|checking_status|      credit_history|duration|credit_amount|installment_commitment|residence_since|age|existing_credits|num_dependents|class|
+---+---------------+--------------------+--------+-------------+----------------------+---------------+---+----------------+--------------+-----+
|  1|             <0|critical/other ex...|       6|         1169|                     4|              4| 67|               2|             1| good|
|  2|       0<=X<200|       existing paid|      48|         5951|                     2|              2| 22|               1|             1|  bad|
|  3|    no checking|critical/other ex...|      12|         2096|                     2|              3| 49|               1|             2| good|
|  4|             <0|       existing paid|      42|         7882|                     2|              4| 45|          

In [41]:

# Exibir o número de linhas e colunas do DataFrame
rows, columns = df.count(), len(df.columns)
print(f"Linhas: {rows}\nColunas: {columns}")

Linhas: 1000
Colunas: 11


In [42]:
df_existente.dtypes

[('ID', 'int'),
 ('checking_status', 'string'),
 ('credit_history', 'string'),
 ('duration', 'int'),
 ('credit_amount', 'int'),
 ('installment_commitment', 'int'),
 ('residence_since', 'int'),
 ('age', 'int'),
 ('existing_credits', 'int'),
 ('num_dependents', 'int'),
 ('class', 'string')]

In [43]:
df_existente.describe().show()

+-------+-----------------+---------------+-------------------+------------------+-----------------+----------------------+------------------+-----------------+----------------+------------------+-----+
|summary|               ID|checking_status|     credit_history|          duration|    credit_amount|installment_commitment|   residence_since|              age|existing_credits|    num_dependents|class|
+-------+-----------------+---------------+-------------------+------------------+-----------------+----------------------+------------------+-----------------+----------------+------------------+-----+
|  count|             1000|           1000|               1000|              1000|             1000|                  1000|              1000|             1000|            1000|              1000| 1000|
|   mean|            500.5|           NULL|               NULL|            20.903|         3271.258|                 2.973|             2.845|           35.546|           1.407|           

In [44]:
df_existente.groupBy('credit_history').count().show()

+--------------------+-----+
|      credit_history|count|
+--------------------+-----+
|            all paid|   49|
| no credits/all paid|   40|
|critical/other ex...|  293|
|  delayed previously|   88|
|       existing paid|  530|
+--------------------+-----+



In [45]:
dfdelay = df.filter(df["credit_history"]  == 'delayed previously')
dfdelay.show()

+---+---------------+------------------+--------+-------------+----------------------+---------------+---+----------------+--------------+-----+
| ID|checking_status|    credit_history|duration|credit_amount|installment_commitment|residence_since|age|existing_credits|num_dependents|class|
+---+---------------+------------------+--------+-------------+----------------------+---------------+---+----------------+--------------+-----+
|  5|             <0|delayed previously|      24|         4870|                     3|              4| 53|               2|             2|  bad|
| 30|             <0|delayed previously|      60|         6836|                     3|              4| 63|               2|             1|  bad|
| 43|       0<=X<200|delayed previously|      18|         6204|                     2|              4| 44|               1|             2| good|
| 51|       0<=X<200|delayed previously|      24|         2333|                     4|              2| 29|               1|       

In [46]:
dfdelay.show()

+---+---------------+------------------+--------+-------------+----------------------+---------------+---+----------------+--------------+-----+
| ID|checking_status|    credit_history|duration|credit_amount|installment_commitment|residence_since|age|existing_credits|num_dependents|class|
+---+---------------+------------------+--------+-------------+----------------------+---------------+---+----------------+--------------+-----+
|  5|             <0|delayed previously|      24|         4870|                     3|              4| 53|               2|             2|  bad|
| 30|             <0|delayed previously|      60|         6836|                     3|              4| 63|               2|             1|  bad|
| 43|       0<=X<200|delayed previously|      18|         6204|                     2|              4| 44|               1|             2| good|
| 51|       0<=X<200|delayed previously|      24|         2333|                     4|              2| 29|               1|       