## Importing Necessary Libraries

In [1]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from sqlalchemy import create_engine

In [3]:
# Start (or reuse) the Spark session used by this ETL notebook
spark = (
    SparkSession.builder
    .appName('kumbaConstructionETL')
    .getOrCreate()
)

In [11]:
# Load the raw construction CSV into a Spark DataFrame.
# The relative path uses a forward slash so it resolves on Windows, Linux and
# macOS alike (a backslash is only treated as a separator on Windows).
# multiLine allows quoted fields to span several physical lines; inferSchema
# makes Spark read the data an extra time to work out the column types.
kumba_construction_df = spark.read.csv(
    'dataset/kumba_construction_data.csv',
    sep=',',
    header=True,
    inferSchema=True,
    multiLine=True,
)

In [12]:
# Preview a handful of rows; printing 100 rows of a 30-column frame floods the
# notebook without adding information.
kumba_construction_df.show(5)

+--------------------+--------------------+--------------------+--------------+-------------+----------+----------+---------------+----------+--------------------+------------------+--------+--------------------+----------------------+-------------+---------+-------------+----------+--------------+----------------+--------------------+----------------+----------------+-------------------+-----------------+------------------------+---------------------+---------------+--------------------+----+
|           ProjectID|         ProjectName|          ClientName|   ProjectType|ProjectStatus| StartDate|   EndDate|EstimatedBudget|ActualCost|            Location|       ManagerName|TeamSize|      ContractorName|NumberOfSubcontractors|MaterialsCost|LaborCost|EquipmentCost|PermitFees|InspectionFees|ChangeOrderCount|SafetyIncidentsCount|ProjectDelayDays|WeatherDelayDays|ClientFeedbackScore|QualityAuditScore|EnvironmentalImpactScore|EnergyEfficiencyScore|InnovationScore|CommunityImpactScore| ROI|
+-

In [13]:
# Print the column names and the data types Spark inferred while reading the CSV
kumba_construction_df.printSchema()

root
 |-- ProjectID: string (nullable = true)
 |-- ProjectName: string (nullable = true)
 |-- ClientName: string (nullable = true)
 |-- ProjectType: string (nullable = true)
 |-- ProjectStatus: string (nullable = true)
 |-- StartDate: date (nullable = true)
 |-- EndDate: date (nullable = true)
 |-- EstimatedBudget: integer (nullable = true)
 |-- ActualCost: integer (nullable = true)
 |-- Location: string (nullable = true)
 |-- ManagerName: string (nullable = true)
 |-- TeamSize: integer (nullable = true)
 |-- ContractorName: string (nullable = true)
 |-- NumberOfSubcontractors: integer (nullable = true)
 |-- MaterialsCost: double (nullable = true)
 |-- LaborCost: double (nullable = true)
 |-- EquipmentCost: double (nullable = true)
 |-- PermitFees: double (nullable = true)
 |-- InspectionFees: double (nullable = true)
 |-- ChangeOrderCount: double (nullable = true)
 |-- SafetyIncidentsCount: double (nullable = true)
 |-- ProjectDelayDays: double (nullable = true)
 |-- WeatherDelayDays:

In [15]:
# List the column names of the loaded DataFrame
kumba_construction_df.columns

['ProjectID',
 'ProjectName',
 'ClientName',
 'ProjectType',
 'ProjectStatus',
 'StartDate',
 'EndDate',
 'EstimatedBudget',
 'ActualCost',
 'Location',
 'ManagerName',
 'TeamSize',
 'ContractorName',
 'NumberOfSubcontractors',
 'MaterialsCost',
 'LaborCost',
 'EquipmentCost',
 'PermitFees',
 'InspectionFees',
 'ChangeOrderCount',
 'SafetyIncidentsCount',
 'ProjectDelayDays',
 'WeatherDelayDays',
 'ClientFeedbackScore',
 'QualityAuditScore',
 'EnvironmentalImpactScore',
 'EnergyEfficiencyScore',
 'InnovationScore',
 'CommunityImpactScore',
 'ROI']

In [16]:
# Number of rows in the loaded DataFrame
num_rows = kumba_construction_df.count()

num_rows

1100000

In [17]:
# Number of columns in the loaded DataFrame
num_columns = len(kumba_construction_df.columns)

num_columns

30

In [18]:
# Count the null values in every column.
# All counts are computed in a single aggregation rather than running a
# separate filter-and-count job per column: F.when(...) yields 1 for a null
# cell and null otherwise, and F.count only counts the non-null results.
null_counts = kumba_construction_df.agg(*[
    F.count(F.when(kumba_construction_df[column].isNull(), 1)).alias(column)
    for column in kumba_construction_df.columns
]).first().asDict()

# Report in the DataFrame's own column order, one line per column.
for column in kumba_construction_df.columns:
    print(column, 'Nulls', null_counts[column])

ProjectID Nulls 0
ProjectName Nulls 0
ClientName Nulls 0
ProjectType Nulls 0
ProjectStatus Nulls 0
StartDate Nulls 0
EndDate Nulls 0
EstimatedBudget Nulls 0
ActualCost Nulls 0
Location Nulls 0
ManagerName Nulls 0
TeamSize Nulls 0
ContractorName Nulls 0
NumberOfSubcontractors Nulls 0
MaterialsCost Nulls 109691
LaborCost Nulls 110048
EquipmentCost Nulls 110204
PermitFees Nulls 109835
InspectionFees Nulls 110074
ChangeOrderCount Nulls 110271
SafetyIncidentsCount Nulls 109748
ProjectDelayDays Nulls 109724
WeatherDelayDays Nulls 109995
ClientFeedbackScore Nulls 110241
QualityAuditScore Nulls 110067
EnvironmentalImpactScore Nulls 109995
EnergyEfficiencyScore Nulls 110693
InnovationScore Nulls 109726
CommunityImpactScore Nulls 110499
ROI Nulls 109712


In [20]:
# Replace missing values with 0.0 in each column that reported nulls above.
# NOTE(review): zero-filled score columns will be included in later summary
# statistics as real zeros — confirm 0.0 is the intended placeholder.
kumba_construction_df_clean = kumba_construction_df.fillna(
    dict.fromkeys(
        [
            'MaterialsCost',
            'LaborCost',
            'EquipmentCost',
            'PermitFees',
            'InspectionFees',
            'ChangeOrderCount',
            'SafetyIncidentsCount',
            'ProjectDelayDays',
            'WeatherDelayDays',
            'ClientFeedbackScore',
            'QualityAuditScore',
            'EnvironmentalImpactScore',
            'EnergyEfficiencyScore',
            'InnovationScore',
            'CommunityImpactScore',
            'ROI',
        ],
        0.0,
    )
)

In [22]:
# Re-check the null counts on the cleaned DataFrame.
# All counts are computed in a single aggregation rather than running a
# separate filter-and-count job per column: F.when(...) yields 1 for a null
# cell and null otherwise, and F.count only counts the non-null results.
null_counts_clean = kumba_construction_df_clean.agg(*[
    F.count(F.when(kumba_construction_df_clean[column].isNull(), 1)).alias(column)
    for column in kumba_construction_df_clean.columns
]).first().asDict()

# Report in the DataFrame's own column order, one line per column.
for column in kumba_construction_df_clean.columns:
    print(column, 'Nulls', null_counts_clean[column])

ProjectID Nulls 0
ProjectName Nulls 0
ClientName Nulls 0
ProjectType Nulls 0
ProjectStatus Nulls 0
StartDate Nulls 0
EndDate Nulls 0
EstimatedBudget Nulls 0
ActualCost Nulls 0
Location Nulls 0
ManagerName Nulls 0
TeamSize Nulls 0
ContractorName Nulls 0
NumberOfSubcontractors Nulls 0
MaterialsCost Nulls 0
LaborCost Nulls 0
EquipmentCost Nulls 0
PermitFees Nulls 0
InspectionFees Nulls 0
ChangeOrderCount Nulls 0
SafetyIncidentsCount Nulls 0
ProjectDelayDays Nulls 0
WeatherDelayDays Nulls 0
ClientFeedbackScore Nulls 0
QualityAuditScore Nulls 0
EnvironmentalImpactScore Nulls 0
EnergyEfficiencyScore Nulls 0
InnovationScore Nulls 0
CommunityImpactScore Nulls 0
ROI Nulls 0


In [24]:
# Show summary statistics for the cleaned DataFrame
kumba_construction_df_clean.describe().show()

+-------+--------------------+--------------------+------------+-----------+-------------+------------------+------------------+--------------------+------------+------------------+--------------+----------------------+-----------------+-----------------+-----------------+-----------------+------------------+-----------------+--------------------+-----------------+------------------+-------------------+------------------+------------------------+---------------------+------------------+--------------------+-------------------+
|summary|           ProjectID|         ProjectName|  ClientName|ProjectType|ProjectStatus|   EstimatedBudget|        ActualCost|            Location| ManagerName|          TeamSize|ContractorName|NumberOfSubcontractors|    MaterialsCost|        LaborCost|    EquipmentCost|       PermitFees|    InspectionFees| ChangeOrderCount|SafetyIncidentsCount| ProjectDelayDays|  WeatherDelayDays|ClientFeedbackScore| QualityAuditScore|EnvironmentalImpactScore|EnergyEfficiencyS