<a href="https://colab.research.google.com/github/luasampaio/data-engineering/blob/main/38_pipeline_ingestion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import functions
from pyspark import *
from pyspark.sql import functions as F
from pyspark.sql.functions import *
import datetime

In [3]:
from pyspark.sql import SparkSession
from pyspark import SparkContext

spark = SparkSession \
          .builder \
          .appName("Analysis_with_Spark") \
          .getOrCreate()

import pandas as pd
from google.colab import drive
drive.mount('/content/drive')

df = spark.read.csv('/content/drive/MyDrive/.Dataset/Credit2.csv', sep=';', encoding='UTF-8', header=True, inferSchema=True)
df

Mounted at /content/drive


DataFrame[ID: int, checking_status: string, credit_history: string, duration: int, credit_amount: int, installment_commitment: int, residence_since: int, age: int, existing_credits: int, num_dependents: int, class: string]

In [4]:
display(df)

DataFrame[ID: int, checking_status: string, credit_history: string, duration: int, credit_amount: int, installment_commitment: int, residence_since: int, age: int, existing_credits: int, num_dependents: int, class: string]

In [5]:
df.show()

+---+---------------+--------------------+--------+-------------+----------------------+---------------+---+----------------+--------------+-----+
| ID|checking_status|      credit_history|duration|credit_amount|installment_commitment|residence_since|age|existing_credits|num_dependents|class|
+---+---------------+--------------------+--------+-------------+----------------------+---------------+---+----------------+--------------+-----+
|  1|             <0|critical/other ex...|       6|         1169|                     4|              4| 67|               2|             1| good|
|  2|       0<=X<200|       existing paid|      48|         5951|                     2|              2| 22|               1|             1|  bad|
|  3|    no checking|critical/other ex...|      12|         2096|                     2|              3| 49|               1|             2| good|
|  4|             <0|       existing paid|      42|         7882|                     2|              4| 45|          

In [6]:
df.columns

['ID',
 'checking_status',
 'credit_history',
 'duration',
 'credit_amount',
 'installment_commitment',
 'residence_since',
 'age',
 'existing_credits',
 'num_dependents',
 'class']

In [7]:
df_existente = df.filter(df["credit_history"] != 'no checking') # Apply filter to the DataFrame
df_existente.show() # Now show the filtered DataFrame

+---+---------------+--------------------+--------+-------------+----------------------+---------------+---+----------------+--------------+-----+
| ID|checking_status|      credit_history|duration|credit_amount|installment_commitment|residence_since|age|existing_credits|num_dependents|class|
+---+---------------+--------------------+--------+-------------+----------------------+---------------+---+----------------+--------------+-----+
|  1|             <0|critical/other ex...|       6|         1169|                     4|              4| 67|               2|             1| good|
|  2|       0<=X<200|       existing paid|      48|         5951|                     2|              2| 22|               1|             1|  bad|
|  3|    no checking|critical/other ex...|      12|         2096|                     2|              3| 49|               1|             2| good|
|  4|             <0|       existing paid|      42|         7882|                     2|              4| 45|          

In [8]:
df_existente.count()

1000

In [9]:
df_existente.groupBy('credit_history').count().show()

+--------------------+-----+
|      credit_history|count|
+--------------------+-----+
|            all paid|   49|
| no credits/all paid|   40|
|critical/other ex...|  293|
|  delayed previously|   88|
|       existing paid|  530|
+--------------------+-----+



In [10]:
df_existente.show()

+---+---------------+--------------------+--------+-------------+----------------------+---------------+---+----------------+--------------+-----+
| ID|checking_status|      credit_history|duration|credit_amount|installment_commitment|residence_since|age|existing_credits|num_dependents|class|
+---+---------------+--------------------+--------+-------------+----------------------+---------------+---+----------------+--------------+-----+
|  1|             <0|critical/other ex...|       6|         1169|                     4|              4| 67|               2|             1| good|
|  2|       0<=X<200|       existing paid|      48|         5951|                     2|              2| 22|               1|             1|  bad|
|  3|    no checking|critical/other ex...|      12|         2096|                     2|              3| 49|               1|             2| good|
|  4|             <0|       existing paid|      42|         7882|                     2|              4| 45|          

In [11]:

# Exibir o número de linhas e colunas do DataFrame
rows, columns = df.count(), len(df.columns)
print(f"Linhas: {rows}\nColunas: {columns}")

Linhas: 1000
Colunas: 11


In [12]:
df_existente.dtypes

[('ID', 'int'),
 ('checking_status', 'string'),
 ('credit_history', 'string'),
 ('duration', 'int'),
 ('credit_amount', 'int'),
 ('installment_commitment', 'int'),
 ('residence_since', 'int'),
 ('age', 'int'),
 ('existing_credits', 'int'),
 ('num_dependents', 'int'),
 ('class', 'string')]

In [13]:
df_existente.describe().show()

+-------+-----------------+---------------+-------------------+------------------+-----------------+----------------------+------------------+-----------------+----------------+------------------+-----+
|summary|               ID|checking_status|     credit_history|          duration|    credit_amount|installment_commitment|   residence_since|              age|existing_credits|    num_dependents|class|
+-------+-----------------+---------------+-------------------+------------------+-----------------+----------------------+------------------+-----------------+----------------+------------------+-----+
|  count|             1000|           1000|               1000|              1000|             1000|                  1000|              1000|             1000|            1000|              1000| 1000|
|   mean|            500.5|           NULL|               NULL|            20.903|         3271.258|                 2.973|             2.845|           35.546|           1.407|           

In [14]:
df_existente.groupBy('credit_history').count().show()

+--------------------+-----+
|      credit_history|count|
+--------------------+-----+
|            all paid|   49|
| no credits/all paid|   40|
|critical/other ex...|  293|
|  delayed previously|   88|
|       existing paid|  530|
+--------------------+-----+



In [15]:
dfdelay = df.filter(df["credit_history"]  == 'delayed previously')
dfdelay.show()

+---+---------------+------------------+--------+-------------+----------------------+---------------+---+----------------+--------------+-----+
| ID|checking_status|    credit_history|duration|credit_amount|installment_commitment|residence_since|age|existing_credits|num_dependents|class|
+---+---------------+------------------+--------+-------------+----------------------+---------------+---+----------------+--------------+-----+
|  5|             <0|delayed previously|      24|         4870|                     3|              4| 53|               2|             2|  bad|
| 30|             <0|delayed previously|      60|         6836|                     3|              4| 63|               2|             1|  bad|
| 43|       0<=X<200|delayed previously|      18|         6204|                     2|              4| 44|               1|             2| good|
| 51|       0<=X<200|delayed previously|      24|         2333|                     4|              2| 29|               1|       

In [16]:
dfdelay.show()

+---+---------------+------------------+--------+-------------+----------------------+---------------+---+----------------+--------------+-----+
| ID|checking_status|    credit_history|duration|credit_amount|installment_commitment|residence_since|age|existing_credits|num_dependents|class|
+---+---------------+------------------+--------+-------------+----------------------+---------------+---+----------------+--------------+-----+
|  5|             <0|delayed previously|      24|         4870|                     3|              4| 53|               2|             2|  bad|
| 30|             <0|delayed previously|      60|         6836|                     3|              4| 63|               2|             1|  bad|
| 43|       0<=X<200|delayed previously|      18|         6204|                     2|              4| 44|               1|             2| good|
| 51|       0<=X<200|delayed previously|      24|         2333|                     4|              2| 29|               1|       

In [17]:
df.orderBy('credit_amount',  ascending=False).show()

+---+---------------+--------------------+--------+-------------+----------------------+---------------+---+----------------+--------------+-----+
| ID|checking_status|      credit_history|duration|credit_amount|installment_commitment|residence_since|age|existing_credits|num_dependents|class|
+---+---------------+--------------------+--------+-------------+----------------------+---------------+---+----------------+--------------+-----+
|916|       0<=X<200| no credits/all paid|      48|        18424|                     1|              2| 32|               1|             1|  bad|
| 96|       0<=X<200| no credits/all paid|      54|        15945|                     3|              4| 58|               1|             1|  bad|
|819|             <0|       existing paid|      36|        15857|                     2|              3| 43|               1|             1| good|
|888|       0<=X<200|       existing paid|      48|        15672|                     2|              2| 23|          

In [18]:
df.groupBy('class').count().orderBy('class', ascending=False).show(10)

+-----+-----+
|class|count|
+-----+-----+
| good|  700|
|  bad|  300|
+-----+-----+



In [19]:
df.groupBy('credit_history', 'class').count().orderBy('credit_history', ascending=False).show(10)

+--------------------+-----+-----+
|      credit_history|class|count|
+--------------------+-----+-----+
| no credits/all paid|  bad|   25|
| no credits/all paid| good|   15|
|       existing paid|  bad|  169|
|       existing paid| good|  361|
|  delayed previously|  bad|   28|
|  delayed previously| good|   60|
|critical/other ex...|  bad|   50|
|critical/other ex...| good|  243|
|            all paid|  bad|   28|
|            all paid| good|   21|
+--------------------+-----+-----+



In [None]:
df_filtrado = df.filter(df["credit_history"] == 'delayed previously')
df_filtrado.show()

Filtrando dados com Pyspark

In [20]:
from pyspark.sql.functions import col

total_count = df.count()
print("Total de registros:", total_count)

Total de registros: 1000
