In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import pyspark.pandas as ps
import pandas as pd



In [2]:
import os
import sys

# Make PySpark (workers and driver) use the same interpreter as this kernel.
os.environ.update({
    'PYSPARK_PYTHON': sys.executable,
    'PYSPARK_DRIVER_PYTHON': sys.executable,
})

In [3]:
# Reuse an already-running session if there is one, otherwise start a new one.
spark = (
    SparkSession.builder
    .appName("consistency_test")
    .getOrCreate()
)

In [4]:
# Load the cleaned dataset; the first line is the header and column types are inferred.
df_fintech = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("clean_fintech.csv")
)
df_fintech.show(2)

+-------+-----+----+------------+--------+----------+------------------+---------+--------+--------------+-----------+--------+--------------------+--------------+--------+------------+--------+------------+-----------------+------------+--------------+--------------+-------------+-------------+-----------+--------------+-----------+-----------+
|user_id|churn| age|credit_score|deposits|withdrawal|purchases_partners|purchases|cc_taken|cc_recommended|cc_disliked|cc_liked|cc_application_begin|app_downloaded|web_user|app_web_user|ios_user|android_user|registered_phones|payment_type|waiting_4_loan|cancelled_loan|received_loan|rejected_loan|zodiac_sign|rewards_earned|reward_rate|is_referred|
+-------+-----+----+------------+--------+----------+------------------+---------+--------+--------------+-----------+--------+--------------------+--------------+--------+------------+--------+------------+-----------------+------------+--------------+--------------+-------------+-------------+--------

## Transformations to perform:
- Multiply all numeric columns by 2.
- Delete the letter "e" from all string columns.
- Set all boolean columns to True.
- Create 3 extra numeric columns (computed on the already-doubled values):
    - Mean of purchases.
    - Median of age.
    - Mean of credit_score.

In [5]:
# Keep a small mix of numeric, string and boolean columns to exercise each transform.
df_fintech_2 = df_fintech.select(
    'age',
    'credit_score',
    'purchases',
    'zodiac_sign',
    'payment_type',
    'churn',
    'cancelled_loan',
    'received_loan',
)

In [6]:
# Store age as an integer column.
df_fintech_2 = df_fintech_2.withColumn("age", df_fintech_2["age"].cast(IntegerType()))

In [7]:
# Keep a handle on the untransformed frame: df_fintech_2 is reassigned to the
# transformed result later, while assert_transform() is fed df_fintech_3.
df_fintech_3 = df_fintech_2

In [8]:
# Preview the first three rows after the column selection and the age cast.
df_fintech_2.show(3)

+---+-----------------+---------+-----------+------------+-----+--------------+-------------+
|age|     credit_score|purchases|zodiac_sign|payment_type|churn|cancelled_loan|received_loan|
+---+-----------------+---------+-----------+------------+-----+--------------+-------------+
| 21|            577.0|       45|     Pisces|Semi-Monthly|false|         false|        false|
| 31|            519.0|        0|      Virgo|   Bi-Weekly| true|         false|        false|
| 26|542.5155998157956|        0|Sagittarius|      Weekly|false|         false|        false|
+---+-----------------+---------+-----------+------------+-----+--------------+-------------+
only showing top 3 rows



In [9]:
# Check the column types that the type-driven transforms below will dispatch on.
df_fintech_2.printSchema()

root
 |-- age: integer (nullable = true)
 |-- credit_score: double (nullable = true)
 |-- purchases: integer (nullable = true)
 |-- zodiac_sign: string (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- churn: boolean (nullable = true)
 |-- cancelled_loan: boolean (nullable = true)
 |-- received_loan: boolean (nullable = true)



In [10]:
# Superseded by transform_extracols() in the next cell; kept only for reference.
# df_fintech_2 = df_fintech_2.withColumn("purchases_mean", lit(df_fintech_2.select(mean('purchases')).collect()[0][0]))\
#              .withColumn("score_mean", lit(df_fintech_2.select(mean('credit_score')).collect()[0][0]))\
#             .withColumn("age_median", lit(df_fintech_2.select(median('age')).collect()[0][0]))

In [11]:
def transform_bool(df):
    """Overwrite every boolean column of ``df`` with the literal ``True``.

    Column names and order are kept; non-boolean columns are left untouched.
    """
    bool_cols = [
        field.name
        for field in df.schema.fields
        if isinstance(field.dataType, BooleanType)
    ]
    for name in bool_cols:
        df = df.withColumn(name, lit(True))
    return df

def transform_str(df):
    """Remove every lowercase "e" from all string columns of ``df``.

    Column names and order are kept; non-string columns are left untouched.
    """
    result = df
    for field in df.schema.fields:
        if not isinstance(field.dataType, StringType):
            continue
        result = result.withColumn(field.name, regexp_replace(field.name, 'e', ''))
    return result

def transform_numeric(df):
    """Double every numeric column of ``df``.

    All of Spark's numeric types are covered (byte, short, int, long, float,
    double, decimal), not only int and double, so that e.g. a ``LongType``
    column is not silently skipped.

    Args:
        df: Input Spark DataFrame.

    Returns:
        A DataFrame with the same columns in the same order, where each
        numeric column holds twice its original value.
    """
    numeric_types = (
        ByteType, ShortType, IntegerType, LongType,
        FloatType, DoubleType, DecimalType,
    )
    numeric_cols = [
        field.name
        for field in df.schema.fields
        if isinstance(field.dataType, numeric_types)
    ]
    for name in numeric_cols:
        df = df.withColumn(name, df[name] * 2)
    return df

def transform_extracols(df):
    """Append three constant summary columns computed over ``df``.

    Adds, in this order:
      * ``purchases_mean`` - mean of ``purchases``
      * ``score_mean``     - mean of ``credit_score``
      * ``age_median``     - median of ``age``

    Every row receives the same three values. The statistics are computed
    in a single aggregation, so the data is scanned once instead of once
    per statistic.

    Args:
        df: Spark DataFrame with ``purchases``, ``credit_score`` and ``age``.

    Returns:
        ``df`` with the three extra columns appended.
    """
    # A global aggregation always yields exactly one row (nulls on empty input).
    stats = df.agg(
        mean('purchases').alias('purchases_mean'),
        mean('credit_score').alias('score_mean'),
        median('age').alias('age_median'),
    ).first()
    return (
        df.withColumn("purchases_mean", lit(stats['purchases_mean']))
        .withColumn("score_mean", lit(stats['score_mean']))
        .withColumn("age_median", lit(stats['age_median']))
    )

In [12]:
# Start from df_fintech_3 (the untransformed snapshot taken earlier) instead of
# df_fintech_2 itself, so re-running this cell does not double the numeric
# columns a second time.
df_fintech_2 = (
    df_fintech_3
    .transform(transform_str)
    .transform(transform_numeric)
    .transform(transform_bool)
    .transform(transform_extracols)
)
# For comparison, the equivalent chaining in polars uses .pipe:
# result = (
#    df.lazy()
#    .pipe(add_position_column)
#    .pipe(add_squad_number_column)
#    .collect()
#)
#
#result
# https://typethepipe.com/vizs-and-tips/python-polars-pipe-function-to-one-more-columns/ 

In [13]:
# Schema after the transforms: three extra double columns are appended.
df_fintech_2.printSchema()

root
 |-- age: integer (nullable = true)
 |-- credit_score: double (nullable = true)
 |-- purchases: integer (nullable = true)
 |-- zodiac_sign: string (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- churn: boolean (nullable = false)
 |-- cancelled_loan: boolean (nullable = false)
 |-- received_loan: boolean (nullable = false)
 |-- purchases_mean: double (nullable = false)
 |-- score_mean: double (nullable = false)
 |-- age_median: double (nullable = false)



In [14]:
# Preview the transformed rows; the expected frame further down is based on these five.
df_fintech_2.show(5)

+---+-----------------+---------+-----------+------------+-----+--------------+-------------+-----------------+------------------+----------+
|age|     credit_score|purchases|zodiac_sign|payment_type|churn|cancelled_loan|received_loan|   purchases_mean|        score_mean|age_median|
+---+-----------------+---------+-----------+------------+-----+--------------+-------------+-----------------+------------------+----------+
| 42|           1154.0|       90|      Piscs| Smi-Monthly| true|          true|         true|6.318724749692605|1085.1526258518454|      60.0|
| 62|           1038.0|        0|      Virgo|     Bi-Wkly| true|          true|         true|6.318724749692605|1085.1526258518454|      60.0|
| 52|1085.031199631591|        0|Sagittarius|        Wkly| true|          true|         true|6.318724749692605|1085.1526258518454|      60.0|
| 66|           1116.0|        0|         Lo|     Bi-Wkly| true|          true|         true|6.318724749692605|1085.1526258518454|      60.0|
| 52| 

In [39]:
# Hand-written expected values for the first five transformed rows.
pandas_df = pd.DataFrame(
    {
        'age': [42, 62, 52, 66, 52],
        'credit_score': [1154.0, 1038.0, 1085.0312, 1116.0, 1118.0],
        'purchases': [90, 0, 0, 0, 0],
        'zodiac_sign': ['Piscs', 'Virgo', 'Sagittarius', 'Lo', 'Virgo'],
        'payment_type': ['Smi-Monthly', 'Bi-Wkly', 'Wkly', 'Bi-Wkly', 'Bi-Wkly'],
        'churn': [True] * 5,
        'cancelled_loan': [True] * 5,
        'received_loan': [True] * 5,
        'purchases_mean': [6.318724749692605] * 5,
        'score_mean': [1085.1526258518454] * 5,
        'age_median': [60.0] * 5,
    }
)

# Spark schema for the expected frame; the literal-valued columns are non-nullable.
pyspark_schema = StructType(
    [
        StructField('age', IntegerType()),
        StructField('credit_score', DoubleType()),
        StructField('purchases', IntegerType()),
        StructField('zodiac_sign', StringType()),
        StructField('payment_type', StringType()),
        StructField('churn', BooleanType(), nullable=False),
        StructField('cancelled_loan', BooleanType(), nullable=False),
        StructField('received_loan', BooleanType(), nullable=False),
        StructField('purchases_mean', DoubleType(), nullable=False),
        StructField('score_mean', DoubleType(), nullable=False),
        StructField('age_median', DoubleType(), nullable=False),
    ]
)

In [40]:
# Build the expected Spark frame from the hand-written pandas data and explicit schema.
df_expected = spark.createDataFrame(pandas_df, schema=pyspark_schema)

In [None]:
# Display the expected frame.
df_expected.show()

In [None]:
# Display the expected frame's schema.
df_expected.printSchema()

In [None]:
# Actual side of the comparison: the first five transformed rows as Row objects.
df_fintech_2.limit(5).collect()

In [None]:
# Expected side of the comparison, as Row objects.
df_expected.collect()

In [None]:
# Compare with a float tolerance: the expected credit_score values were typed in
# rounded (e.g. 1085.0312), so exact Row equality against Spark's doubles fails.
# Rows are sorted on all columns first so the check does not depend on row order.
sort_keys = df_expected.columns
pd.testing.assert_frame_equal(
    df_fintech_2.limit(5).toPandas().sort_values(sort_keys).reset_index(drop=True),
    df_expected.toPandas().sort_values(sort_keys).reset_index(drop=True),
    check_dtype=False,
    rtol=1e-6,
)

---

In [None]:
# Apply transform df_orig inside the assert function.
def assert_transform(df_orig):
    """Run the full transform chain on ``df_orig`` and check the result.

    The expected values are hard-coded for the first five rows of the
    selected fintech columns, so ``df_orig`` must be exactly those rows,
    untransformed.

    Args:
        df_orig: Untransformed Spark DataFrame holding the five reference rows.

    Raises:
        AssertionError: If column names/order or any value differ from the
            expected frame (floats are compared with a relative tolerance).
    """
    df_actual = (
        df_orig
        .transform(transform_str)
        .transform(transform_numeric)
        .transform(transform_bool)
        .transform(transform_extracols)
    )

    # Expected frame. The summary columns are listed in the order
    # transform_extracols appends them: purchases_mean, score_mean, age_median.
    pandas_df = pd.DataFrame({
        'age': [42, 62, 52, 66, 52],
        'credit_score': [1154.0000, 1038.0000, 1085.0312, 1116.0000, 1118.0000],
        'purchases': [90, 0, 0, 0, 0],
        'zodiac_sign': ['Piscs', 'Virgo', 'Sagittarius', 'Lo', 'Virgo'],
        'payment_type': ['Smi-Monthly', 'Bi-Wkly', 'Wkly', 'Bi-Wkly', 'Bi-Wkly'],
        'churn': [True, True, True, True, True],
        'cancelled_loan': [True, True, True, True, True],
        'received_loan': [True, True, True, True, True],
        'purchases_mean': [18.0, 18.0, 18.0, 18.0, 18.0],
        'score_mean': [1102.20624, 1102.20624, 1102.20624, 1102.20624, 1102.20624],
        'age_median': [52.0, 52.0, 52.0, 52.0, 52.0],
    })

    # Same types and nullability as the transformed frame's schema;
    # mean() yields a double, so purchases_mean is DoubleType.
    pyspark_schema = StructType([
        StructField('age', IntegerType()),
        StructField('credit_score', DoubleType()),
        StructField('purchases', IntegerType()),
        StructField('zodiac_sign', StringType()),
        StructField('payment_type', StringType()),
        StructField('churn', BooleanType(), False),
        StructField('cancelled_loan', BooleanType(), False),
        StructField('received_loan', BooleanType(), False),
        StructField('purchases_mean', DoubleType(), False),
        StructField('score_mean', DoubleType(), False),
        StructField('age_median', DoubleType(), False),
    ])

    df_expected = spark.createDataFrame(pandas_df, schema=pyspark_schema)

    assert df_actual.columns == df_expected.columns, (
        f"column mismatch: {df_actual.columns} != {df_expected.columns}"
    )

    # Sort both sides on every column so the check is independent of row order,
    # then compare with a tolerance because the expected floats are rounded.
    sort_keys = df_expected.columns
    actual_pd = df_actual.toPandas().sort_values(sort_keys).reset_index(drop=True)
    expected_pd = df_expected.toPandas().sort_values(sort_keys).reset_index(drop=True)
    pd.testing.assert_frame_equal(
        actual_pd,
        expected_pd,
        check_dtype=False,
        rtol=1e-6,
    )

In [None]:
# Feed the first five untransformed rows; assert_transform applies the transforms itself.
assert_transform(df_fintech_3.limit(5))