In [1]:
# librerie

from pyspark import SparkContext
from pyspark.sql import SparkSession

from pyspark.sql.types import StringType
from pyspark.sql import Row
from pyspark.mllib.regression import LabeledPoint

In [2]:
# sessione

sc = SparkContext(appName="DDAM_Project", master="local[*]")
spark = SparkSession.builder \
    .master("local[*]") \
    .appName("DDAM_Project") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

In [3]:
sdf = spark.read.csv("hdfs://kddrtserver11.isti.cnr.it:9000/user/hpsa04/credit_train.csv", sep=",",
                     inferSchema=True, header=True)

columns = sdf.schema.names

# rinominare le colonne sotituendo lo spazio con l'underscore
for col in columns:
    sdf = sdf.withColumnRenamed(col, col.replace(' ', '_'))

columns = sdf.schema.names

sdf.printSchema()

root
 |-- Loan_ID: string (nullable = true)
 |-- Customer_ID: string (nullable = true)
 |-- Loan_Status: string (nullable = true)
 |-- Current_Loan_Amount: integer (nullable = true)
 |-- Term: string (nullable = true)
 |-- Credit_Score: integer (nullable = true)
 |-- Annual_Income: integer (nullable = true)
 |-- Years_in_current_job: string (nullable = true)
 |-- Home_Ownership: string (nullable = true)
 |-- Purpose: string (nullable = true)
 |-- Monthly_Debt: double (nullable = true)
 |-- Years_of_Credit_History: double (nullable = true)
 |-- Months_since_last_delinquent: string (nullable = true)
 |-- Number_of_Open_Accounts: integer (nullable = true)
 |-- Number_of_Credit_Problems: integer (nullable = true)
 |-- Current_Credit_Balance: integer (nullable = true)
 |-- Maximum_Open_Credit: integer (nullable = true)
 |-- Bankruptcies: string (nullable = true)
 |-- Tax_Liens: string (nullable = true)



In [40]:
sdf.createOrReplaceTempView('Bank_Loan_Dataset')

In [41]:
columns_problematic = ['Years_in_current_job', 'Months_since_last_delinquent', 'Bankruptcies', 'Tax_Liens']

for col in columns_problematic:
    sql = """
    SELECT DISTINCT {0}
    FROM Bank_Loan_Dataset
    ORDER BY {0}
    """.format(col)

    spark.sql(sql).show(150)

+--------------------+
|Years_in_current_job|
+--------------------+
|                null|
|                   0|
|                   1|
|              1 year|
|           10+ years|
|                   2|
|             2 years|
|                   3|
|             3 years|
|                   4|
|             4 years|
|                   5|
|             5 years|
|                   6|
|             6 years|
|                   7|
|             7 years|
|             8 years|
|             9 years|
|            < 1 year|
|                 n/a|
+--------------------+

+----------------------------+
|Months_since_last_delinquent|
+----------------------------+
|                        null|
|                          -1|
+----------------------------+

+--------------------+
|        Bankruptcies|
+--------------------+
|                null|
|                   0|
|0002005f-3575-462...|
|00062646-806a-4e8...|
|000685bc-92e8-44c...|
|000ae2e0-788e-4b4...|
|000b0ae9-bde2-427...|
|000d63

In [6]:
rdd = sdf.rdd
rdd.take(2)

[Row(Loan_ID='14dd8831-6af5-400b-83ec-68e61888a048', Customer_ID='981165ec-3274-42f5-a3b4-d104041a9ca9', Loan_Status='Fully Paid', Current_Loan_Amount=445412, Term='Short Term', Credit_Score=709, Annual_Income=1167493, Years_in_current_job='8 years', Home_Ownership='Home Mortgage', Purpose='Home Improvements', Monthly_Debt=5214.74, Years_of_Credit_History=17.2, Months_since_last_delinquent='NA', Number_of_Open_Accounts=6, Number_of_Credit_Problems=1, Current_Credit_Balance=228190, Maximum_Open_Credit=416746, Bankruptcies='1', Tax_Liens='0'),
 Row(Loan_ID='4771cc26-131a-45db-b5aa-537ea4ba5342', Customer_ID='2de017a3-2e01-49cb-a581-08169e83be29', Loan_Status='Fully Paid', Current_Loan_Amount=262328, Term='Short Term', Credit_Score=None, Annual_Income=None, Years_in_current_job='10+ years', Home_Ownership='Home Mortgage', Purpose='Debt Consolidation', Monthly_Debt=33295.98, Years_of_Credit_History=21.1, Months_since_last_delinquent='8', Number_of_Open_Accounts=35, Number_of_Credit_Problem

##### 'Years_in_current_job'

sono presenti 4222 valori dell'attributo 'Years_in_current_job' uguali a 'n/a'. Pensiamo si possa trattare di soggetti senza lavoro. Il valore è quindi coerente e viene mantenuto così.

In [7]:
sql = """
    SELECT COUNT(*)
    FROM Bank_Loan_Dataset
    WHERE Years_in_current_job = 'n/a'
    """
spark.sql(sql).show()

+--------+
|count(1)|
+--------+
|    4222|
+--------+



##### 'Months_since_last_delinquent'

sono presenti 53141 valori dell'attributo 'Months_since_last_delinquent' uguali a NA. la nostra interpretazione di questo valore è che il soggetto in questione non ha mai commesso nessun reato. quest'interpretazione dervia anche dal fatto che i valori 'NA' siano la maggior parte.

trasformiamo i valori 'NA' in -1, in modo da rendere numerico l'attributo, i cui valori quindi potranno essere confrontati.

In [8]:
rdd.filter(lambda row: row['Months_since_last_delinquent'] == 'NA').count()

53141

In [9]:
#sdf.withColumn("Months_since_last_delinquent", when(df["Months_since_last_delinquent"] == 'NA', -1))

In [10]:
# le Row sono tipi di Tuple, quindi oggetti immutabili.
# quindi per sostituire il valore 'NA' dobbiamo sovrascrivere l'intera Row

def change_value(row):
    if row['Months_since_last_delinquent'] == 'NA':
        return Row(Loan_ID = row['Loan_ID'],
                Customer_ID = row['Customer_ID'],
                Loan_Status = row['Loan_Status'],
                Current_Loan_Amount = row['Current_Loan_Amount'],
                Term = row['Term'],
                Credit_Score = row['Credit_Score'],
                Annual_Income = row['Annual_Income'],
                Years_in_current_job = row['Years_in_current_job'],
                Home_Ownership = row['Home_Ownership'],
                Purpose = row['Purpose'],
                Monthly_Debt = row['Monthly_Debt'],
                Years_of_Credit_History = row['Years_of_Credit_History'],
                Months_since_last_delinquent = -1,
                Number_of_Open_Accounts = row['Number_of_Open_Accounts'],
                Number_of_Credit_Problems = row['Number_of_Credit_Problems'],
                Current_Credit_Balance = row['Current_Credit_Balance'],
                Maximum_Open_Credit = row['Maximum_Open_Credit'],
                Bankruptcies = row['Bankruptcies'],
                Tax_Liens = row['Tax_Liens'])
    else:
        return row

In [11]:
rdd = rdd.map(change_value)

In [42]:
rdd.take(3)

[Row(Annual_Income=1167493, Bankruptcies='1', Credit_Score=709, Current_Credit_Balance=228190, Current_Loan_Amount=445412, Customer_ID='981165ec-3274-42f5-a3b4-d104041a9ca9', Home_Ownership='Home Mortgage', Loan_ID='14dd8831-6af5-400b-83ec-68e61888a048', Loan_Status='Fully Paid', Maximum_Open_Credit=416746, Monthly_Debt=5214.74, Months_since_last_delinquent=-1, Number_of_Credit_Problems=1, Number_of_Open_Accounts=6, Purpose='Home Improvements', Tax_Liens='0', Term='Short Term', Years_in_current_job='8 years', Years_of_Credit_History=17.2),
 Row(Loan_ID='4771cc26-131a-45db-b5aa-537ea4ba5342', Customer_ID='2de017a3-2e01-49cb-a581-08169e83be29', Loan_Status='Fully Paid', Current_Loan_Amount=262328, Term='Short Term', Credit_Score=None, Annual_Income=None, Years_in_current_job='10+ years', Home_Ownership='Home Mortgage', Purpose='Debt Consolidation', Monthly_Debt=33295.98, Years_of_Credit_History=21.1, Months_since_last_delinquent='8', Number_of_Open_Accounts=35, Number_of_Credit_Problems=

In [12]:
rdd.filter(lambda row: row['Months_since_last_delinquent'] == -1).count()

53141

##### 'Bankruptcies'

sono presenti 204 valori dell'attributo 'Bankruptcies' uguali a NA. Poichè il valore 0 è presente si pensa possa trattarsi di missing values, quindi li trasformiamo in None.

In [13]:
rdd.filter(lambda row: row['Bankruptcies'] == 'NA').count()

204

In [14]:
rdd.filter(lambda row: row['Bankruptcies'] is None).count()

514

In [15]:
def change_value(row):
    if row['Bankruptcies'] == 'NA':
        return Row(Loan_ID = row['Loan_ID'],
                Customer_ID = row['Customer_ID'],
                Loan_Status = row['Loan_Status'],
                Current_Loan_Amount = row['Current_Loan_Amount'],
                Term = row['Term'],
                Credit_Score = row['Credit_Score'],
                Annual_Income = row['Annual_Income'],
                Years_in_current_job = row['Years_in_current_job'],
                Home_Ownership = row['Home_Ownership'],
                Purpose = row['Purpose'],
                Monthly_Debt = row['Monthly_Debt'],
                Years_of_Credit_History = row['Years_of_Credit_History'],
                Months_since_last_delinquent = row['Months_since_last_delinquent'],
                Number_of_Open_Accounts = row['Number_of_Open_Accounts'],
                Number_of_Credit_Problems = row['Number_of_Credit_Problems'],
                Current_Credit_Balance = row['Current_Credit_Balance'],
                Maximum_Open_Credit = row['Maximum_Open_Credit'],
                Bankruptcies = None,
                Tax_Liens = row['Tax_Liens'])
    else:
        return row

In [16]:
rdd = rdd.map(change_value)

In [17]:
rdd.filter(lambda row: row['Bankruptcies'] is None).count()

718

##### 'Tax_Liens'

sono presenti 10 valori dell'attributo 'Tax_Liens' uguali a NA. Poichè il valore 0 è presente si pensa possa trattarsi di missing values, quindi li trasformiamo in None.

In [32]:
rdd.filter(lambda row: row['Tax_Liens'] == 'NA').count()

0

In [19]:
rdd.filter(lambda row: row['Tax_Liens'] is None).count()

514

In [20]:
def change_value(row):
    if row['Tax_Liens'] == 'NA':
        return Row(Loan_ID = row['Loan_ID'],
                Customer_ID = row['Customer_ID'],
                Loan_Status = row['Loan_Status'],
                Current_Loan_Amount = row['Current_Loan_Amount'],
                Term = row['Term'],
                Credit_Score = row['Credit_Score'],
                Annual_Income = row['Annual_Income'],
                Years_in_current_job = row['Years_in_current_job'],
                Home_Ownership = row['Home_Ownership'],
                Purpose = row['Purpose'],
                Monthly_Debt = row['Monthly_Debt'],
                Years_of_Credit_History = row['Years_of_Credit_History'],
                Months_since_last_delinquent = row['Months_since_last_delinquent'],
                Number_of_Open_Accounts = row['Number_of_Open_Accounts'],
                Number_of_Credit_Problems = row['Number_of_Credit_Problems'],
                Current_Credit_Balance = row['Current_Credit_Balance'],
                Maximum_Open_Credit = row['Maximum_Open_Credit'],
                Bankruptcies = row['Bankruptcies'],
                Tax_Liens = None)
    else:
        return row

In [21]:
rdd = rdd.map(change_value)

In [22]:
rdd.filter(lambda row: row['Tax_Liens'] is None).count()

524

In [33]:
sdf = rdd.toDF()

In [34]:
sdf.printSchema()

root
 |-- Annual_Income: long (nullable = true)
 |-- Bankruptcies: string (nullable = true)
 |-- Credit_Score: long (nullable = true)
 |-- Current_Credit_Balance: long (nullable = true)
 |-- Current_Loan_Amount: long (nullable = true)
 |-- Customer_ID: string (nullable = true)
 |-- Home_Ownership: string (nullable = true)
 |-- Loan_ID: string (nullable = true)
 |-- Loan_Status: string (nullable = true)
 |-- Maximum_Open_Credit: long (nullable = true)
 |-- Monthly_Debt: double (nullable = true)
 |-- Months_since_last_delinquent: long (nullable = true)
 |-- Number_of_Credit_Problems: long (nullable = true)
 |-- Number_of_Open_Accounts: long (nullable = true)
 |-- Purpose: string (nullable = true)
 |-- Tax_Liens: string (nullable = true)
 |-- Term: string (nullable = true)
 |-- Years_in_current_job: string (nullable = true)
 |-- Years_of_Credit_History: double (nullable = true)



In [35]:
columns_categorical = [col.name for col in sdf.schema.fields if isinstance(col.dataType, StringType)]

columns_numerical = [col for col in columns if col not in columns_categorical]

In [36]:
columns_numerical

['Current_Loan_Amount',
 'Credit_Score',
 'Annual_Income',
 'Monthly_Debt',
 'Years_of_Credit_History',
 'Months_since_last_delinquent',
 'Number_of_Open_Accounts',
 'Number_of_Credit_Problems',
 'Current_Credit_Balance',
 'Maximum_Open_Credit']

In [37]:
columns_categorical

['Bankruptcies',
 'Customer_ID',
 'Home_Ownership',
 'Loan_ID',
 'Loan_Status',
 'Purpose',
 'Tax_Liens',
 'Term',
 'Years_in_current_job']

In [38]:
def get_nbr_nulls(view_name, columns):
    """funzione per ottenere il numero di valori nulli presenti in ogni colonna dello Spark DF"""

    Project = ''
    for col in columns:
        if col == columns[-1]:
            Project += 'SUM(CASE WHEN {0} IS NULL THEN 1 ELSE 0 END) AS {0}'.format(col)
            break
        Project += 'SUM(CASE WHEN {0} IS NULL THEN 1 ELSE 0 END) AS {0}, '.format(col)


    sql = """\
    SELECT {0}
    FROM {1}\
    """.format(Project, view_name)

    nbr_nulls = spark.sql(sql).collect()[0]

    print('Number of Nulls for each attribute: ')
    for col in columns:
        print(col + ':', '{:>10}'.format(nbr_nulls[col]) )
        
        
    return nbr_nulls

In [39]:
sdf.createOrReplaceTempView('Bank_Loan_Dataset')

nbr_nulls = get_nbr_nulls(view_name = 'Bank_Loan_Dataset', columns = columns)

Number of Nulls for each attribute: 
Loan_ID:        514
Customer_ID:       9159
Loan_Status:        514
Current_Loan_Amount:      47170
Term:        516
Credit_Score:      57679
Annual_Income:      57679
Years_in_current_job:        514
Home_Ownership:       9159
Purpose:        514
Monthly_Debt:        514
Years_of_Credit_History:      47170
Months_since_last_delinquent:      47373
Number_of_Open_Accounts:        514
Number_of_Credit_Problems:      47170
Current_Credit_Balance:        514
Maximum_Open_Credit:      47170
Bankruptcies:        718
Tax_Liens:        524


In [6]:
rdd.count()

100514

In [7]:
rdd = rdd.filter(lambda row: not ( (row['Loan_ID'] is None) and (row['Customer_ID'] is None) ))
rdd.count()

100000

In [8]:
rdd = rdd.distinct()
rdd.count()

89785

In [9]:
sdf = rdd.toDF()

In [10]:
sdf.createOrReplaceTempView('Bank_Loan_Dataset')

nbr_nulls = get_nbr_nulls(view_name = 'Bank_Loan_Dataset', columns = columns)

Number of Nulls for each attribute: 
Loan_ID:          0
Customer_ID:          0
Loan_Status:          0
Current_Loan_Amount:          0
Term:          0
Credit_Score:      19154
Annual_Income:      19154
Years_in_current_job:          0
Home_Ownership:          0
Purpose:          0
Monthly_Debt:          0
Years_of_Credit_History:          0
Months_since_last_delinquent:          0
Number_of_Open_Accounts:          0
Number_of_Credit_Problems:          0
Current_Credit_Balance:          0
Maximum_Open_Credit:          2
Bankruptcies:          0
Tax_Liens:          0


Problema: ci sono coppie di righe con tutti i valori duplicati eccetto per le due colonne 'Credit_Score' e 'Annual_Income', per le quali uno dei due valori è presente e l'altro è nullo.

In [12]:
columns_new = [col for col in columns if col != 'Credit_Score' and col != 'Annual_Income']
Project = ''
for col in columns_new:
    if col == columns[-1]:
        Project += (col)
        break
    Project += (col + ', ')

sql = """
SELECT COUNT(DISTINCT {0}) AS nbr_distinct
FROM Bank_Loan_Dataset
""".format(Project)

spark.sql(sql).show()

+------------+
|nbr_distinct|
+------------+
|       85576|
+------------+



Soluzione: si raggruppa per tutti gli attributi tranne quei due e poi si calcola la media di quei due. In questo modo se le uniche due righe uguali sono quelle con un valore nullo e uno non nullo per quegli attributi, lo media sarà uguale al valore non nullo; se invece ci fossero altre righe ugauli ma con altri valori diversi non nulli per quegli attributi, viene effettivamente calcolata la media, il che è auspicabile considerando che tutto il resto della riga è uguale e quindi si tratta molto probabilmente dello stesso oggetto, duplicato per errore, di cui dunque prendiamo un valore medio tra quelli presenti.

In [13]:
columns_new = [col for col in columns if col != 'Credit_Score' and col != 'Annual_Income']
Project = ''
for col in columns_new:
    if col == columns[-1]:
        Project += (col)
        break
    Project += (col + ', ')

sql = """
SELECT {0}, AVG(Credit_Score) AS Credit_Score, AVG(Annual_Income) AS Annual_Income
FROM Bank_Loan_Dataset
GROUP BY {0}
""".format(Project)

sdf = spark.sql(sql)

In [20]:
rdd = sdf.rdd
rdd.count()

85578

In [21]:
sdf.createOrReplaceTempView('Bank_Loan_Dataset')
nbr_nulls = get_nbr_nulls(view_name = 'Bank_Loan_Dataset', columns = columns)

Number of Nulls for each attribute: 
Loan_ID:          0
Customer_ID:          0
Loan_Status:          0
Current_Loan_Amount:          0
Term:          0
Credit_Score:      14947
Annual_Income:      14947
Years_in_current_job:          0
Home_Ownership:          0
Purpose:          0
Monthly_Debt:          0
Years_of_Credit_History:          0
Months_since_last_delinquent:          0
Number_of_Open_Accounts:          0
Number_of_Credit_Problems:          0
Current_Credit_Balance:          0
Maximum_Open_Credit:          2
Bankruptcies:          0
Tax_Liens:          0


# sei qui

In [26]:
rdd.filter(lambda row: row['Maximum_Open_Credit'] is None).take(2)

[Row(Loan_ID='e42ae1f7-74b5-46d7-95ce-46a88918ba12', Customer_ID='a679ed55-963a-4de6-8be0-4364eb601b6f', Loan_Status='Charged Off', Current_Loan_Amount=45144, Term='Short Term', Years_in_current_job='10+ years', Home_Ownership='Rent', Purpose='Debt Consolidation', Monthly_Debt=36290.38, Years_of_Credit_History=22.4, Months_since_last_delinquent='37', Number_of_Open_Accounts=9, Number_of_Credit_Problems=0, Current_Credit_Balance=0, Maximum_Open_Credit=None, Bankruptcies='0', Tax_Liens='0', Credit_Score=7030.0, Annual_Income=1806995.0),
 Row(Loan_ID='b798f46b-e27b-4823-af83-294b1ffc3490', Customer_ID='f61890c5-e264-4b0f-a047-9cfa03b9d182', Loan_Status='Charged Off', Current_Loan_Amount=32340, Term='Short Term', Years_in_current_job='2 years', Home_Ownership='Rent', Purpose='Debt Consolidation', Monthly_Debt=9620.46, Years_of_Credit_History=15.3, Months_since_last_delinquent='26', Number_of_Open_Accounts=3, Number_of_Credit_Problems=0, Current_Credit_Balance=0, Maximum_Open_Credit=None, B

In [27]:
sql = """
SELECT Loan_Status, AVG(Credit_Score) AS avg_Credit_Score
FROM Bank_Loan_Dataset
GROUP BY Loan_Status """

spark.sql(sql).show()

+-----------+-----------------+
|Loan_Status| avg_Credit_Score|
+-----------+-----------------+
| Fully Paid|721.2176297710638|
|Charged Off|2402.296862289367|
+-----------+-----------------+



In [None]:
pdf = sdf.toPandas()
pdf.head(2)

# Ricordati!

anche se non esistono valori distinti  COUNT(\*)  può differire da COUNT(DISTINCT \*).

perché il primo conta tutte le righe mentre il secondo conta tutte e sole le righe dove non è presente neanche un NULL value

In [35]:
sql = """
SELECT COUNT(*) AS nbr_rows
FROM Bank_Loan_Dataset
"""

spark.sql(sql).show()

+--------+
|nbr_rows|
+--------+
|   89785|
+--------+



In [31]:
sql = """
SELECT COUNT(Credit_Score) AS nbr_rows
FROM Bank_Loan_Dataset
"""

spark.sql(sql).show()

+--------+
|nbr_rows|
+--------+
|   70631|
+--------+



In [32]:
sql = """
SELECT COUNT(DISTINCT *) AS nbr_rows
FROM Bank_Loan_Dataset
"""

spark.sql(sql).show()

+--------+
|nbr_rows|
+--------+
|   70630|
+--------+

