## Importing Neccessary Libraries

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import monotonically_increasing_id, col
from sqlalchemy import create_engine
import pandas as pd

In [2]:
# Initialize Spark Session
spark = SparkSession.builder.appName('kumbaConstructionETL').getOrCreate()
#spark = SparkSession.builder.master("local[*]").appName("MyApp").getOrCreate()

spark

In [3]:
# Read data to spark
kumba_construction_df = spark.read.option("multiLine", "true").option("delimiter", ",").csv(r'dataset\kumba_construction_data.csv', header=True, inferSchema=True)

In [4]:
kumba_construction_df.show(100)

+--------------------+--------------------+--------------------+--------------+-------------+----------+----------+---------------+----------+--------------------+------------------+--------+--------------------+----------------------+-------------+---------+-------------+----------+--------------+----------------+--------------------+----------------+----------------+-------------------+-----------------+------------------------+---------------------+---------------+--------------------+----+
|           ProjectID|         ProjectName|          ClientName|   ProjectType|ProjectStatus| StartDate|   EndDate|EstimatedBudget|ActualCost|            Location|       ManagerName|TeamSize|      ContractorName|NumberOfSubcontractors|MaterialsCost|LaborCost|EquipmentCost|PermitFees|InspectionFees|ChangeOrderCount|SafetyIncidentsCount|ProjectDelayDays|WeatherDelayDays|ClientFeedbackScore|QualityAuditScore|EnvironmentalImpactScore|EnergyEfficiencyScore|InnovationScore|CommunityImpactScore| ROI|
+-

In [5]:
kumba_construction_df.printSchema()

root
 |-- ProjectID: string (nullable = true)
 |-- ProjectName: string (nullable = true)
 |-- ClientName: string (nullable = true)
 |-- ProjectType: string (nullable = true)
 |-- ProjectStatus: string (nullable = true)
 |-- StartDate: date (nullable = true)
 |-- EndDate: date (nullable = true)
 |-- EstimatedBudget: integer (nullable = true)
 |-- ActualCost: integer (nullable = true)
 |-- Location: string (nullable = true)
 |-- ManagerName: string (nullable = true)
 |-- TeamSize: integer (nullable = true)
 |-- ContractorName: string (nullable = true)
 |-- NumberOfSubcontractors: integer (nullable = true)
 |-- MaterialsCost: double (nullable = true)
 |-- LaborCost: double (nullable = true)
 |-- EquipmentCost: double (nullable = true)
 |-- PermitFees: double (nullable = true)
 |-- InspectionFees: double (nullable = true)
 |-- ChangeOrderCount: double (nullable = true)
 |-- SafetyIncidentsCount: double (nullable = true)
 |-- ProjectDelayDays: double (nullable = true)
 |-- WeatherDelayDays:

In [15]:
kumba_construction_df.columns

['ProjectID',
 'ProjectName',
 'ClientName',
 'ProjectType',
 'ProjectStatus',
 'StartDate',
 'EndDate',
 'EstimatedBudget',
 'ActualCost',
 'Location',
 'ManagerName',
 'TeamSize',
 'ContractorName',
 'NumberOfSubcontractors',
 'MaterialsCost',
 'LaborCost',
 'EquipmentCost',
 'PermitFees',
 'InspectionFees',
 'ChangeOrderCount',
 'SafetyIncidentsCount',
 'ProjectDelayDays',
 'WeatherDelayDays',
 'ClientFeedbackScore',
 'QualityAuditScore',
 'EnvironmentalImpactScore',
 'EnergyEfficiencyScore',
 'InnovationScore',
 'CommunityImpactScore',
 'ROI']

In [6]:
#No of rows
num_rows = kumba_construction_df.count()

num_rows

1100000

In [7]:
# No of columns
num_columns = len(kumba_construction_df.columns)

num_columns

30

In [18]:
# Checking for Null values
for column in kumba_construction_df.columns:
    print(column, 'Nulls', kumba_construction_df.filter(kumba_construction_df[column].isNull()).count())

ProjectID Nulls 0
ProjectName Nulls 0
ClientName Nulls 0
ProjectType Nulls 0
ProjectStatus Nulls 0
StartDate Nulls 0
EndDate Nulls 0
EstimatedBudget Nulls 0
ActualCost Nulls 0
Location Nulls 0
ManagerName Nulls 0
TeamSize Nulls 0
ContractorName Nulls 0
NumberOfSubcontractors Nulls 0
MaterialsCost Nulls 109691
LaborCost Nulls 110048
EquipmentCost Nulls 110204
PermitFees Nulls 109835
InspectionFees Nulls 110074
ChangeOrderCount Nulls 110271
SafetyIncidentsCount Nulls 109748
ProjectDelayDays Nulls 109724
WeatherDelayDays Nulls 109995
ClientFeedbackScore Nulls 110241
QualityAuditScore Nulls 110067
EnvironmentalImpactScore Nulls 109995
EnergyEfficiencyScore Nulls 110693
InnovationScore Nulls 109726
CommunityImpactScore Nulls 110499
ROI Nulls 109712


In [4]:
# Fill up missing values
kumba_construction_df_clean = kumba_construction_df.fillna({
    'MaterialsCost' : 0.0,
    'LaborCost' : 0.0,
    'EquipmentCost' : 0.0,
    'PermitFees' : 0.0,
    'InspectionFees' : 0.0,
    'ChangeOrderCount' : 0.0,
    'SafetyIncidentsCount' : 0.0,
    'ProjectDelayDays' : 0.0,
    'WeatherDelayDays' : 0.0,
    'ClientFeedbackScore' : 0.0,
    'QualityAuditScore' : 0.0,
    'EnvironmentalImpactScore' : 0.0,
    'EnergyEfficiencyScore' : 0.0,
    'InnovationScore' : 0.0,
    'CommunityImpactScore' : 0.0,
    'ROI' : 0.0
})

In [9]:
# Checking for Null values
for column in kumba_construction_df_clean.columns:
    print(column, 'Nulls', kumba_construction_df_clean.filter(kumba_construction_df_clean[column].isNull()).count())

ProjectID Nulls 0
ProjectName Nulls 0
ClientName Nulls 0
ProjectType Nulls 0
ProjectStatus Nulls 0
StartDate Nulls 0
EndDate Nulls 0
EstimatedBudget Nulls 0
ActualCost Nulls 0
Location Nulls 0
ManagerName Nulls 0
TeamSize Nulls 0
ContractorName Nulls 0
NumberOfSubcontractors Nulls 0
MaterialsCost Nulls 0
LaborCost Nulls 0
EquipmentCost Nulls 0
PermitFees Nulls 0
InspectionFees Nulls 0
ChangeOrderCount Nulls 0
SafetyIncidentsCount Nulls 0
ProjectDelayDays Nulls 0
WeatherDelayDays Nulls 0
ClientFeedbackScore Nulls 0
QualityAuditScore Nulls 0
EnvironmentalImpactScore Nulls 0
EnergyEfficiencyScore Nulls 0
InnovationScore Nulls 0
CommunityImpactScore Nulls 0
ROI Nulls 0


In [24]:
kumba_construction_df_clean.describe().show()

+-------+--------------------+--------------------+------------+-----------+-------------+------------------+------------------+--------------------+------------+------------------+--------------+----------------------+-----------------+-----------------+-----------------+-----------------+------------------+-----------------+--------------------+-----------------+------------------+-------------------+------------------+------------------------+---------------------+------------------+--------------------+-------------------+
|summary|           ProjectID|         ProjectName|  ClientName|ProjectType|ProjectStatus|   EstimatedBudget|        ActualCost|            Location| ManagerName|          TeamSize|ContractorName|NumberOfSubcontractors|    MaterialsCost|        LaborCost|    EquipmentCost|       PermitFees|    InspectionFees| ChangeOrderCount|SafetyIncidentsCount| ProjectDelayDays|  WeatherDelayDays|ClientFeedbackScore| QualityAuditScore|EnvironmentalImpactScore|EnergyEfficiencyS

In [10]:
kumba_construction_df_clean.columns

['ProjectID',
 'ProjectName',
 'ClientName',
 'ProjectType',
 'ProjectStatus',
 'StartDate',
 'EndDate',
 'EstimatedBudget',
 'ActualCost',
 'Location',
 'ManagerName',
 'TeamSize',
 'ContractorName',
 'NumberOfSubcontractors',
 'MaterialsCost',
 'LaborCost',
 'EquipmentCost',
 'PermitFees',
 'InspectionFees',
 'ChangeOrderCount',
 'SafetyIncidentsCount',
 'ProjectDelayDays',
 'WeatherDelayDays',
 'ClientFeedbackScore',
 'QualityAuditScore',
 'EnvironmentalImpactScore',
 'EnergyEfficiencyScore',
 'InnovationScore',
 'CommunityImpactScore',
 'ROI']

In [5]:
#Client Table
client = kumba_construction_df_clean.select('ClientName')

client = client.withColumn('client_id', monotonically_increasing_id())

client = client.select('client_id', 'ClientName')

In [14]:
client.show()

+---------+--------------------+
|client_id|          ClientName|
+---------+--------------------+
|        0|Harrison, Evans a...|
|        1|           Rios-Wang|
|        2|Carrillo, Potts a...|
|        3|Grant, Becker and...|
|        4|Schwartz, Pitts a...|
|        5|   Macdonald-Griffin|
|        6|          Dawson LLC|
|        7|           Davis PLC|
|        8|            Vang Inc|
|        9|Pierce, Huynh and...|
|       10|Joyce, Donaldson ...|
|       11|Barron, Villegas ...|
|       12|    Perkins and Sons|
|       13|Young, Mccullough...|
|       14|       Patton-Garcia|
|       15|           Clark Ltd|
|       16|        Adams-Taylor|
|       17|Anderson, Wade an...|
|       18|         Russell PLC|
|       19|            Hall Inc|
+---------+--------------------+
only showing top 20 rows



In [6]:
# Contractor table
contractor = kumba_construction_df_clean.select('ContractorName', 'NumberOfSubcontractors').distinct()

contractor = contractor.withColumn('contractor_id', monotonically_increasing_id())

contractor = contractor.select('contractor_id', 'ContractorName', 'NumberOfSubcontractors')

In [16]:
contractor.show()

+-------------+--------------------+----------------------+
|contractor_id|      ContractorName|NumberOfSubcontractors|
+-------------+--------------------+----------------------+
|            0|Juarez, Holden an...|                     6|
|            1|   Burnett-Rodriguez|                     3|
|            2|     Chambers-Thomas|                     5|
|            3|          Chavez LLC|                     3|
|            4|Turner, Petty and...|                     8|
|            5|Oneal, Cruz and B...|                     1|
|            6|       Hernandez Ltd|                     6|
|            7|          King Group|                     1|
|            8|      Hernandez-Hill|                     3|
|            9|       Rodriguez Inc|                     2|
|           10|        Castro Group|                     1|
|           11|          Brooks LLC|                     8|
|           12|         Perkins LLC|                     9|
|           13|        Reynolds Ltd|    

In [7]:
manager = kumba_construction_df_clean.select('ManagerName').distinct()

manager = manager.withColumn('manager_id', monotonically_increasing_id())

manager = manager.select('manager_id','ManagerName')

In [32]:
manager.show()

+----------+-----------------+
|manager_id|      ManagerName|
+----------+-----------------+
|         0|     Dennis Ponce|
|         1| Darrell Mckinney|
|         2|    Brooke Herman|
|         3|   Elizabeth Wood|
|         4|  Allen Schneider|
|         5|    Sara Stephens|
|         6|        John Dean|
|         7|      Tracy Smith|
|         8| Robert Gillespie|
|         9|    Adam Williams|
|        10|Victoria Cardenas|
|        11|     Robin Vargas|
|        12|   Katrina Morris|
|        13|Lawrence Williams|
|        14|  Dennis Anderson|
|        15|     Heather Diaz|
|        16|    Tracy Jenkins|
|        17|Patricia Mitchell|
|        18|   Joshua Mercado|
|        19|Brandon Rodriguez|
+----------+-----------------+
only showing top 20 rows



In [8]:
# ProjectType table
projecttype = kumba_construction_df_clean.select('ProjectType').distinct()

projecttype = projecttype.withColumn('projecttype_id', monotonically_increasing_id())

projecttype = projecttype.select('projecttype_id', 'ProjectType')

In [36]:
projecttype.show()

+--------------+--------------+
|projecttype_id|   ProjectType|
+--------------+--------------+
|             0|   Residential|
|             1|Infrastructure|
|             2|    Commercial|
+--------------+--------------+



In [9]:
# Location Table
location = kumba_construction_df_clean.select('Location').distinct()

location = location.withColumn('location_id', monotonically_increasing_id())

location = location.select('location_id', 'Location')

In [38]:
location.show()

+-----------+--------------------+
|location_id|            Location|
+-----------+--------------------+
|          0|8979 Wiggins Fiel...|
|          1|56946 Fernandez T...|
|          2|5729 Angela Hills...|
|          3|82230 Joseph Rapi...|
|          4|6691 Jason Loaf S...|
|          5|491 Renee Radial\...|
|          6|6269 Sarah Loop S...|
|          7|443 Timothy Path ...|
|          8|307 Susan Mountai...|
|          9|287 James Junctio...|
|         10|430 Michael View ...|
|         11|8381 James Ranch\...|
|         12|373 Jeremy Lodge\...|
|         13|7174 Mason Turnpi...|
|         14|9069 Nelson Sprin...|
|         15|33542 Christopher...|
|         16|7536 Brandon Spur...|
|         17|41243 Kirk Valley...|
|         18|PSC 7162, Box 569...|
|         19|84188 David Port ...|
+-----------+--------------------+
only showing top 20 rows



In [10]:
# Fact Table
fact_table = kumba_construction_df_clean.join(client, ['ClientName'], 'left') \
                                        .join(projecttype, ['ProjectType'], 'left') \
                                        .join(contractor, ['ContractorName','NumberOfSubcontractors'], 'left') \
                                        .join(location, ['Location'], 'left') \
                                        .join(manager, ['ManagerName'], 'left') \
                                        .select('ProjectID','ProjectName','ProjectStatus','projecttype_id','manager_id','contractor_id','client_id','location_id','StartDate','EndDate','EstimatedBudget','ActualCost','TeamSize','MaterialsCost','LaborCost','EquipmentCost','PermitFees','InspectionFees','ChangeOrderCount','SafetyIncidentsCount','ProjectDelayDays','WeatherDelayDays','ClientFeedbackScore','QualityAuditScore','EnvironmentalImpactScore','EnergyEfficiencyScore','InnovationScore','CommunityImpactScore','ROI')

In [41]:
fact_table.show(100)

+--------------------+--------------------+-------------+--------------+----------+-------------+---------+-----------+----------+----------+---------------+----------+--------+-------------+---------+-------------+----------+--------------+----------------+--------------------+----------------+----------------+-------------------+-----------------+------------------------+---------------------+---------------+--------------------+----+
|           ProjectID|         ProjectName|ProjectStatus|projecttype_id|manager_id|contractor_id|client_id|location_id| StartDate|   EndDate|EstimatedBudget|ActualCost|TeamSize|MaterialsCost|LaborCost|EquipmentCost|PermitFees|InspectionFees|ChangeOrderCount|SafetyIncidentsCount|ProjectDelayDays|WeatherDelayDays|ClientFeedbackScore|QualityAuditScore|EnvironmentalImpactScore|EnergyEfficiencyScore|InnovationScore|CommunityImpactScore| ROI|
+--------------------+--------------------+-------------+--------------+----------+-------------+---------+-----------

In [17]:
kumba_construction_df_clean.columns

['ProjectID',
 'ProjectName',
 'ClientName',
 'ProjectType',
 'ProjectStatus',
 'StartDate',
 'EndDate',
 'EstimatedBudget',
 'ActualCost',
 'Location',
 'ManagerName',
 'TeamSize',
 'ContractorName',
 'NumberOfSubcontractors',
 'MaterialsCost',
 'LaborCost',
 'EquipmentCost',
 'PermitFees',
 'InspectionFees',
 'ChangeOrderCount',
 'SafetyIncidentsCount',
 'ProjectDelayDays',
 'WeatherDelayDays',
 'ClientFeedbackScore',
 'QualityAuditScore',
 'EnvironmentalImpactScore',
 'EnergyEfficiencyScore',
 'InnovationScore',
 'CommunityImpactScore',
 'ROI']

In [11]:
client.write.mode('overwrite').parquet(r'dataset/client')
contractor.write.mode('overwrite').parquet(r'dataset/contractor')
location.write.mode('overwrite').parquet(r'dataset/location')
manager.write.mode('overwrite').parquet(r'dataset/manager')
projecttype.write.mode('overwrite').parquet(r'dataset/projecttype')
fact_table.write.mode('overwrite').parquet(r'dataset/fact_table')

ConnectionRefusedError: [WinError 10061] No connection could be made because the target machine actively refused it

In [None]:
# Convert spark df to pandas df
client_pd_df = client.toPandas()
contractor_pd_df = contractor.toPandas()
manager_pd_df = manager.toPandas()
projecttype_pd_df = projecttype.toPandas()
location_pd_df = location.toPandas()
fact_table_pd_df = fact_table.toPandas()