# Create dataset for analysis

This notebook transforms the all-unique-contracts dataset so that each contract is reduced to a single row, with the cost and schedule data at the different percent-complete points listed as columns of that row.

In [62]:
import datetime
import math

import numpy as np
import pandas as pd

In [63]:
# Widen the display limits so the wide per-contract tables below are shown in full.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 500)
pd.set_option("display.max_colwidth", 500)

In [64]:
# Import dataset
# Reads the coded unique-contracts workbook into `df`. The cells below look up
# several rows per `unique_id`, so each row is presumably one status report of
# a contract - TODO confirm against the workbook.
df = pd.read_excel("../data/datasets/unique-contracts-coded.xlsx")

## Create percentage buckets

In this section, we will:

1. calculate cost overrun

2. calculate schedule overrun

3. round percentages to bucket values

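4. new df <- for each contract and percentage bucket, the cost overrun (the code below takes the most complete report in the bucket rather than a mean)

5. new df <- for each contract and percentage bucket, the schedule overrun (taken the same way)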

In [65]:
# Non-positive costs are not meaningful and would break the ratio-based
# overrun calculations below, so treat them as missing.
for cost_col in ("Actual Cost", "Estimated Cost", "Estimated Cost (exc. contingency)"):
    df.loc[df[cost_col] <= 0, cost_col] = np.nan

# For every contract, overwrite both estimate columns with the values from the
# report that has the lowest completion percentage (i.e. the original estimate),
# and record the largest reported actual cost as the contract's final cost.
df["final_cost"] = np.nan
for contract_id in df["unique_id"].unique():
    is_contract = df["unique_id"] == contract_id
    # Filter and sort once per contract; both estimates are read from the same row.
    earliest_report = df[is_contract].sort_values(by="Completion Percentage", ascending=True).iloc[0]
    df.loc[is_contract, "Estimated Cost"] = earliest_report["Estimated Cost"]
    df.loc[is_contract, "Estimated Cost (exc. contingency)"] = earliest_report["Estimated Cost (exc. contingency)"]
    df.loc[is_contract, "final_cost"] = df.loc[is_contract, "Actual Cost"].max()

# Cost overrun: actual cost relative to the contingency-free original estimate.
# Expenditure: actual cost as a fraction of the contract's final cost.
df["cost_overrun"] = df["Actual Cost"] / df["Estimated Cost (exc. contingency)"]
df["expenditure"] = df["Actual Cost"] / df["final_cost"]


In [66]:
# Parse every date column so that durations can be computed by subtraction.
for date_col in (
    "Actual Construction Completion Date",
    "Estimated Construction Completion Date",
    "Start Date",
    "Data as of Date",
):
    df[date_col] = pd.to_datetime(df[date_col])

df["estimated_duration"] = df["Estimated Construction Completion Date"] - df["Start Date"]
df["actual_duration"] = df["Actual Construction Completion Date"] - df["Start Date"]

# Zero or negative durations do not make sense and would give infinite or
# negative overrun ratios, so treat them as missing.
zero_duration = datetime.timedelta(0)
df.loc[df["actual_duration"] <= zero_duration, "actual_duration"] = np.nan
df.loc[df["estimated_duration"] <= zero_duration, "estimated_duration"] = np.nan

# Schedule overrun: actual duration relative to the estimated duration.
df["schedule_overrun"] = df["actual_duration"] / df["estimated_duration"]

In [67]:
def floor_to_bucket(completion):
    """Round a completion value down to the nearest 0.1; missing values pass through unchanged."""
    if pd.isnull(completion):
        return completion
    return math.floor(completion * 10) / 10


df["percentage_bucket"] = df["Completion Percentage"].apply(floor_to_bucket)

In [68]:
# Inspect the bucketed completion percentages (one value per row of df).
df["percentage_bucket"]

0        1.0
1        0.0
2        0.1
3        0.0
4        0.1
        ... 
18765    1.0
18766    1.0
18767    1.0
18768    1.0
18769    1.0
Name: percentage_bucket, Length: 18770, dtype: float64

In [69]:
# Sanity check: does the share of the actual schedule elapsed at each row's
# "Data as of Date" fall in the same bucket as the reported completion percentage?

df2 = df.copy(deep=True)
elapsed = df2["Data as of Date"] - df2["Start Date"]
total = df2["Actual Construction Completion Date"] - df2["Start Date"]
# Clamp the elapsed share to [0, 1] (missing values stay missing) ...
elapsed_share = (elapsed / total).clip(lower=0, upper=1)
# ... then round it down to the nearest 0.1, as is done for percentage_bucket.
df2["Data date pct"] = elapsed_share.apply(lambda share: share if pd.isnull(share) else math.floor(share * 10)/10)
df2["Percentage delta"] = (df2["Data date pct"] - df2["percentage_bucket"]).abs()

df2["Percentage delta"].describe()

# The two measures still land in different buckets for at least half of the rows (the median delta is 0.1)

count    15480.000000
mean         0.125711
std          0.166313
min          0.000000
25%          0.000000
50%          0.100000
75%          0.200000
max          1.000000
Name: Percentage delta, dtype: float64

In [70]:
# Output column prefix -> per-report column holding that metric.
BUCKET_METRICS = {
    "cost": "cost_overrun",
    "schedule": "schedule_overrun",
    "expenditure": "expenditure",
}


def _summarise_contract(contract_id, contract_rows):
    """Collapse all rows of one contract into a single flat record.

    Parameters
    ----------
    contract_id
        The contract's `unique_id` value.
    contract_rows : pd.DataFrame
        Every row of `df` belonging to that contract, in file order.

    Returns
    -------
    dict
        Static fields read from the contract's first row, the actual completion
        date and duration read from its most complete row, and for each bucket
        0.0 ... 0.9 the cost overrun, schedule overrun and expenditure of the
        most complete row in that bucket (NaN when the bucket has no rows).
        Rows in the 1.0 bucket do not get a column.
    """
    first_report = contract_rows.iloc[0]
    latest_report = contract_rows.sort_values(by="Completion Percentage", ascending=False).iloc[0]

    record = {
        "unique_id": contract_id,
        "project_subtype": first_report["Project Subtype"],
        "project_subtype_two": first_report["Project Subtype 2"],
        "archive_filename": first_report["archive_filename"],
        "start_date": first_report["Start Date"],
        "estimated_completion_date": first_report["Estimated Construction Completion Date"],
        "actual_completion_date": latest_report["Actual Construction Completion Date"],
        "estimated_duration": first_report["estimated_duration"],
        "actual_duration": latest_report["actual_duration"],
        "fiscal_year": first_report["Fiscal Year"],
        "estimated_cost": first_report["Estimated Cost"],
        "estimated_cost_nocontingency": first_report["Estimated Cost (exc. contingency)"],
        "final_cost": first_report["final_cost"],
    }

    for step in range(10):
        in_bucket = contract_rows[contract_rows["percentage_bucket"] == step / 10]
        # An empty bucket is the only expected "no data" case; checking for it
        # explicitly (instead of a bare except) lets real errors surface.
        if in_bucket.empty:
            most_complete = None
        else:
            most_complete = in_bucket.sort_values(by="Completion Percentage", ascending=False).iloc[0]
        for prefix, metric in BUCKET_METRICS.items():
            record[f"{prefix}_pct{step * 10}"] = np.nan if most_complete is None else most_complete[metric]

    return record


# sort=False keeps contracts in order of first appearance, matching
# df.unique_id.unique(). Rows with a missing unique_id are skipped by groupby.
cost_schedule_dict = {
    contract_id: _summarise_contract(contract_id, contract_rows)
    for contract_id, contract_rows in df.groupby("unique_id", sort=False)
}

In [71]:
# Build the per-contract frame: one row per key of cost_schedule_dict, one column per record field.
df3 = pd.DataFrame.from_dict(cost_schedule_dict, orient="index")
df3.count().to_frame().transpose()

# Number of non-missing values in each column, shown as a single wide row

Unnamed: 0,unique_id,project_subtype,project_subtype_two,archive_filename,start_date,estimated_completion_date,actual_completion_date,estimated_duration,actual_duration,fiscal_year,estimated_cost,estimated_cost_nocontingency,final_cost,cost_pct0,schedule_pct0,expenditure_pct0,cost_pct10,schedule_pct10,expenditure_pct10,cost_pct20,schedule_pct20,expenditure_pct20,cost_pct30,schedule_pct30,expenditure_pct30,cost_pct40,schedule_pct40,expenditure_pct40,cost_pct50,schedule_pct50,expenditure_pct50,cost_pct60,schedule_pct60,expenditure_pct60,cost_pct70,schedule_pct70,expenditure_pct70,cost_pct80,schedule_pct80,expenditure_pct80,cost_pct90,schedule_pct90,expenditure_pct90
0,2395,2367,1936,2395,2344,2343,2305,2288,2226,2394,2368,2368,2371,531,471,535,309,279,309,290,273,291,305,289,307,299,273,299,281,260,281,338,313,338,410,376,411,486,438,486,1190,1060,1191


In [72]:
# Only the contracts that have a cost overrun recorded in every one of the ten buckets.
cost_cols = [f"cost_pct{i}" for i in range(0, 100, 10)]
df3[cost_cols].dropna(how="any")

Unnamed: 0,cost_pct0,cost_pct10,cost_pct20,cost_pct30,cost_pct40,cost_pct50,cost_pct60,cost_pct70,cost_pct80,cost_pct90
030_squad battle course,0.90963,0.909759,0.914722,0.914915,0.915044,0.916462,0.9202,0.923294,0.923487,0.926838
"138_bachelor enlisted quarters , wallace creek",0.987606,0.989537,0.995008,0.996295,1.00603,1.008565,1.026547,1.02711,1.046138,1.047707
535_aircraft maintenance hangar (jsf),0.975463,0.984408,0.986046,0.986963,1.013403,1.015664,1.019596,1.034208,1.034667,1.036764
72420_medical/dental clinic addition,1.056145,1.065607,1.065607,1.065607,1.062904,1.062904,1.070154,1.074702,1.074702,1.112921


In [73]:
# Only the contracts that have a schedule overrun recorded in every bucket
# (in the output shown, these are the same contracts as for cost).
schedule_cols = [f"schedule_pct{i}" for i in range(0, 100, 10)]
df3[schedule_cols].dropna(how="any")

Unnamed: 0,schedule_pct0,schedule_pct10,schedule_pct20,schedule_pct30,schedule_pct40,schedule_pct50,schedule_pct60,schedule_pct70,schedule_pct80,schedule_pct90
030_squad battle course,1.0,1.0,1.0,1.0,1.0,1.0,1.088106,1.088106,1.114537,1.187959
"138_bachelor enlisted quarters , wallace creek",1.042895,1.042895,1.042895,1.042895,1.042895,1.042895,1.042895,1.042895,1.042895,1.042895
535_aircraft maintenance hangar (jsf),1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.045802,1.045802,1.181679
72420_medical/dental clinic addition,1.02381,1.039683,1.039683,1.039683,1.039683,1.039683,1.039683,1.050794,1.050794,1.050794


In [74]:
df3.describe()

# NOTE(review): the original note here read "We get infinite and negative values for overruns - that's not good!".
# The output shown for this cell has no such values (every overrun minimum is positive and every maximum is finite),
# presumably because non-positive costs and durations are set to NaN before the ratios are computed - confirm on a fresh run.

Unnamed: 0,start_date,estimated_completion_date,actual_completion_date,estimated_duration,actual_duration,fiscal_year,estimated_cost,estimated_cost_nocontingency,final_cost,cost_pct0,schedule_pct0,expenditure_pct0,cost_pct10,schedule_pct10,expenditure_pct10,cost_pct20,schedule_pct20,expenditure_pct20,cost_pct30,schedule_pct30,expenditure_pct30,cost_pct40,schedule_pct40,expenditure_pct40,cost_pct50,schedule_pct50,expenditure_pct50,cost_pct60,schedule_pct60,expenditure_pct60,cost_pct70,schedule_pct70,expenditure_pct70,cost_pct80,schedule_pct80,expenditure_pct80,cost_pct90,schedule_pct90,expenditure_pct90
count,2344,2343,2305,2288,2226,2394.0,2368.0,2368.0,2371.0,531.0,471.0,535.0,309.0,279.0,309.0,290.0,273.0,291.0,305.0,289.0,307.0,299.0,273.0,299.0,281.0,260.0,281.0,338.0,313.0,338.0,410.0,376.0,411.0,486.0,438.0,486.0,1190.0,1060.0,1191.0
mean,2011-12-24 19:51:48.532423424,2013-06-28 14:35:11.139564800,2013-12-27 08:36:39.045553152,569 days 05:33:33.986013984,768 days 11:28:56.927223720,2011.077277,270201900.0,257335100.0,295589800.0,3.589567,1.054451,0.9834,4.417044,1.124542,0.97324,1.021315,1.150461,0.976306,1.011657,1.192183,0.969324,1.014067,1.286756,0.977693,1.023973,1.267234,0.978309,1.120967,1.365104,0.975323,4.477133,1.374603,0.979488,1.13858,1.461779,0.986191,3.074159,1.642192,0.992542
min,2007-05-16 00:00:00,1996-10-08 00:00:00,2007-10-11 00:00:00,1 days 00:00:00,47 days 00:00:00,2006.0,1380.0,1314.286,1498.0,0.00104,0.475936,0.000997,0.52263,0.392727,0.001518,0.246029,0.637181,0.038962,0.11009,0.502479,0.111027,0.256917,0.704132,0.744454,0.548839,0.459534,0.78469,0.555511,0.704132,0.006496,0.40775,0.449807,0.410642,0.672915,0.444444,0.492828,0.559462,0.52518,0.006496
25%,2010-10-13 00:00:00,2012-03-28 00:00:00,2013-01-08 00:00:00,380 days 00:00:00,534 days 06:00:00,2010.0,4804065.0,4575300.0,5106896.0,0.978366,1.0,0.989922,0.984622,1.0,0.975576,0.984593,1.0,0.976886,0.986977,1.0,0.971608,0.98553,1.0,0.978011,0.995908,1.0,0.975551,0.995953,1.0,0.974006,0.998802,1.033047,0.979284,1.011676,1.071679,0.989716,1.019468,1.128658,0.999856
50%,2011-11-17 00:00:00,2013-07-09 00:00:00,2014-01-27 00:00:00,540 days 00:00:00,716 days 00:00:00,2011.0,12142940.0,11564700.0,12521000.0,1.05,1.0,1.0,1.05,1.0,0.995135,1.05,1.001566,0.996021,1.05,1.031111,0.994475,1.049747,1.0625,0.995732,1.05,1.088853,0.994519,1.05,1.1313,0.994872,1.05,1.158572,0.997535,1.06011,1.264145,0.999002,1.067164,1.369026,1.0
75%,2012-11-30 00:00:00,2014-08-13 00:00:00,2014-12-03 00:00:00,716 days 00:00:00,951 days 00:00:00,2012.0,26085250.0,24843100.0,26866000.0,1.05,1.0,1.0,1.052092,1.095372,1.0,1.053716,1.102434,1.0,1.058166,1.177326,1.0,1.062819,1.246101,1.0,1.065357,1.286305,1.0,1.074345,1.391123,1.0,1.082428,1.434915,1.0,1.108489,1.548481,1.0,1.12032,1.724898,1.0
max,2019-02-12 00:00:00,2021-12-17 00:00:00,2022-03-30 00:00:00,2542 days 00:00:00,2952 days 00:00:00,2017.0,50886150000.0,48463000000.0,50847410000.0,1149.921666,4.463235,1.0,1053.551536,4.463235,1.0,5.608296,5.5,1.0,1.545393,7.7,1.0,1.778245,10.725806,1.0,1.417877,10.725806,1.0,26.586742,10.725806,1.0,1381.608212,10.725806,1.0,27.02641,10.725806,1.0,1382.441888,14.44,1.0
std,,,,261 days 19:32:56.227515612,352 days 09:07:15.315570480,1.662585,2563575000.0,2441500000.0,2703078000.0,50.81476,0.243612,0.057353,59.877042,0.357091,0.074061,0.291709,0.415154,0.065989,0.112661,0.609037,0.083589,0.125538,0.883258,0.043271,0.100644,0.715716,0.037858,1.415957,0.864384,0.066984,68.190136,0.784601,0.048324,1.206733,0.831369,0.037523,47.638274,1.098709,0.055734


In [75]:
# Contracts with a negative schedule overrun in any bucket. Reducing the
# per-bucket check with any(axis=1) gives one boolean per contract, so each
# matching contract is listed once.
schedule_cols = [f"schedule_pct{i}" for i in range(0, 100, 10)]
df3[df3[schedule_cols].lt(0).any(axis=1)]

# Instances of negative overrun always seem to be caused by start dates that come AFTER the estimated completion date.
# It is unclear why the start date would have been updated but not the estimated completion date,
# so there is not much to do with this data besides removing it.

Unnamed: 0,unique_id,project_subtype,project_subtype_two,archive_filename,start_date,estimated_completion_date,actual_completion_date,estimated_duration,actual_duration,fiscal_year,estimated_cost,estimated_cost_nocontingency,final_cost,cost_pct0,schedule_pct0,expenditure_pct0,cost_pct10,schedule_pct10,expenditure_pct10,cost_pct20,schedule_pct20,expenditure_pct20,cost_pct30,schedule_pct30,expenditure_pct30,cost_pct40,schedule_pct40,expenditure_pct40,cost_pct50,schedule_pct50,expenditure_pct50,cost_pct60,schedule_pct60,expenditure_pct60,cost_pct70,schedule_pct70,expenditure_pct70,cost_pct80,schedule_pct80,expenditure_pct80,cost_pct90,schedule_pct90,expenditure_pct90


In [76]:
# Replace non-positive schedule overruns with NaN in one vectorised step.
# Values that are already NaN, or that are positive, are left untouched.
schedule_cols = [f"schedule_pct{i}" for i in range(0, 100, 10)]
df3[schedule_cols] = df3[schedule_cols].mask(df3[schedule_cols] <= 0)


In [77]:
# Contracts with an infinite schedule overrun in any bucket (each listed once).
schedule_cols = [f"schedule_pct{i}" for i in range(0, 100, 10)]
df3[df3[schedule_cols].eq(np.inf).any(axis=1)]

# For these projects the estimated completion date and the start date are equal.
# That makes no sense either, so these values can be removed as well.

Unnamed: 0,unique_id,project_subtype,project_subtype_two,archive_filename,start_date,estimated_completion_date,actual_completion_date,estimated_duration,actual_duration,fiscal_year,estimated_cost,estimated_cost_nocontingency,final_cost,cost_pct0,schedule_pct0,expenditure_pct0,cost_pct10,schedule_pct10,expenditure_pct10,cost_pct20,schedule_pct20,expenditure_pct20,cost_pct30,schedule_pct30,expenditure_pct30,cost_pct40,schedule_pct40,expenditure_pct40,cost_pct50,schedule_pct50,expenditure_pct50,cost_pct60,schedule_pct60,expenditure_pct60,cost_pct70,schedule_pct70,expenditure_pct70,cost_pct80,schedule_pct80,expenditure_pct80,cost_pct90,schedule_pct90,expenditure_pct90


In [78]:
# Replace infinite schedule overruns with NaN in one vectorised step.
schedule_cols = [f"schedule_pct{i}" for i in range(0, 100, 10)]
df3[schedule_cols] = df3[schedule_cols].mask(df3[schedule_cols] == np.inf)
        


In [79]:
# Contracts with a cost overrun of exactly 0 in any bucket (each listed once).
cost_cols = [f"cost_pct{i}" for i in range(0, 100, 10)]
df3[df3[cost_cols].eq(0).any(axis=1)]

# Original note: "One cost is 0 - remove".
# NOTE(review): the output shown for this cell is empty - confirm on a fresh run.

Unnamed: 0,unique_id,project_subtype,project_subtype_two,archive_filename,start_date,estimated_completion_date,actual_completion_date,estimated_duration,actual_duration,fiscal_year,estimated_cost,estimated_cost_nocontingency,final_cost,cost_pct0,schedule_pct0,expenditure_pct0,cost_pct10,schedule_pct10,expenditure_pct10,cost_pct20,schedule_pct20,expenditure_pct20,cost_pct30,schedule_pct30,expenditure_pct30,cost_pct40,schedule_pct40,expenditure_pct40,cost_pct50,schedule_pct50,expenditure_pct50,cost_pct60,schedule_pct60,expenditure_pct60,cost_pct70,schedule_pct70,expenditure_pct70,cost_pct80,schedule_pct80,expenditure_pct80,cost_pct90,schedule_pct90,expenditure_pct90


In [80]:
# Replace non-positive cost overruns with NaN in one vectorised step.
# Values that are already NaN, or that are positive, are left untouched.
cost_cols = [f"cost_pct{i}" for i in range(0, 100, 10)]
df3[cost_cols] = df3[cost_cols].mask(df3[cost_cols] <= 0)

In [81]:
# Contracts with an infinite cost overrun in any bucket (each listed once).
cost_cols = [f"cost_pct{i}" for i in range(0, 100, 10)]
df3[df3[cost_cols].eq(np.inf).any(axis=1)]

# There are some infinite costs. It looks like we did not have data for the estimated costs on these? REMOVE

Unnamed: 0,unique_id,project_subtype,project_subtype_two,archive_filename,start_date,estimated_completion_date,actual_completion_date,estimated_duration,actual_duration,fiscal_year,estimated_cost,estimated_cost_nocontingency,final_cost,cost_pct0,schedule_pct0,expenditure_pct0,cost_pct10,schedule_pct10,expenditure_pct10,cost_pct20,schedule_pct20,expenditure_pct20,cost_pct30,schedule_pct30,expenditure_pct30,cost_pct40,schedule_pct40,expenditure_pct40,cost_pct50,schedule_pct50,expenditure_pct50,cost_pct60,schedule_pct60,expenditure_pct60,cost_pct70,schedule_pct70,expenditure_pct70,cost_pct80,schedule_pct80,expenditure_pct80,cost_pct90,schedule_pct90,expenditure_pct90


In [82]:
# Replace infinite cost overruns with NaN in one vectorised step.
cost_cols = [f"cost_pct{i}" for i in range(0, 100, 10)]
df3[cost_cols] = df3[cost_cols].mask(df3[cost_cols] == np.inf)


In [83]:
# Summary statistics again, after the overrun clean-up cells above.
df3.describe()

Unnamed: 0,start_date,estimated_completion_date,actual_completion_date,estimated_duration,actual_duration,fiscal_year,estimated_cost,estimated_cost_nocontingency,final_cost,cost_pct0,schedule_pct0,expenditure_pct0,cost_pct10,schedule_pct10,expenditure_pct10,cost_pct20,schedule_pct20,expenditure_pct20,cost_pct30,schedule_pct30,expenditure_pct30,cost_pct40,schedule_pct40,expenditure_pct40,cost_pct50,schedule_pct50,expenditure_pct50,cost_pct60,schedule_pct60,expenditure_pct60,cost_pct70,schedule_pct70,expenditure_pct70,cost_pct80,schedule_pct80,expenditure_pct80,cost_pct90,schedule_pct90,expenditure_pct90
count,2344,2343,2305,2288,2226,2394.0,2368.0,2368.0,2371.0,531.0,471.0,535.0,309.0,279.0,309.0,290.0,273.0,291.0,305.0,289.0,307.0,299.0,273.0,299.0,281.0,260.0,281.0,338.0,313.0,338.0,410.0,376.0,411.0,486.0,438.0,486.0,1190.0,1060.0,1191.0
mean,2011-12-24 19:51:48.532423424,2013-06-28 14:35:11.139564800,2013-12-27 08:36:39.045553152,569 days 05:33:33.986013984,768 days 11:28:56.927223720,2011.077277,270201900.0,257335100.0,295589800.0,3.589567,1.054451,0.9834,4.417044,1.124542,0.97324,1.021315,1.150461,0.976306,1.011657,1.192183,0.969324,1.014067,1.286756,0.977693,1.023973,1.267234,0.978309,1.120967,1.365104,0.975323,4.477133,1.374603,0.979488,1.13858,1.461779,0.986191,3.074159,1.642192,0.992542
min,2007-05-16 00:00:00,1996-10-08 00:00:00,2007-10-11 00:00:00,1 days 00:00:00,47 days 00:00:00,2006.0,1380.0,1314.286,1498.0,0.00104,0.475936,0.000997,0.52263,0.392727,0.001518,0.246029,0.637181,0.038962,0.11009,0.502479,0.111027,0.256917,0.704132,0.744454,0.548839,0.459534,0.78469,0.555511,0.704132,0.006496,0.40775,0.449807,0.410642,0.672915,0.444444,0.492828,0.559462,0.52518,0.006496
25%,2010-10-13 00:00:00,2012-03-28 00:00:00,2013-01-08 00:00:00,380 days 00:00:00,534 days 06:00:00,2010.0,4804065.0,4575300.0,5106896.0,0.978366,1.0,0.989922,0.984622,1.0,0.975576,0.984593,1.0,0.976886,0.986977,1.0,0.971608,0.98553,1.0,0.978011,0.995908,1.0,0.975551,0.995953,1.0,0.974006,0.998802,1.033047,0.979284,1.011676,1.071679,0.989716,1.019468,1.128658,0.999856
50%,2011-11-17 00:00:00,2013-07-09 00:00:00,2014-01-27 00:00:00,540 days 00:00:00,716 days 00:00:00,2011.0,12142940.0,11564700.0,12521000.0,1.05,1.0,1.0,1.05,1.0,0.995135,1.05,1.001566,0.996021,1.05,1.031111,0.994475,1.049747,1.0625,0.995732,1.05,1.088853,0.994519,1.05,1.1313,0.994872,1.05,1.158572,0.997535,1.06011,1.264145,0.999002,1.067164,1.369026,1.0
75%,2012-11-30 00:00:00,2014-08-13 00:00:00,2014-12-03 00:00:00,716 days 00:00:00,951 days 00:00:00,2012.0,26085250.0,24843100.0,26866000.0,1.05,1.0,1.0,1.052092,1.095372,1.0,1.053716,1.102434,1.0,1.058166,1.177326,1.0,1.062819,1.246101,1.0,1.065357,1.286305,1.0,1.074345,1.391123,1.0,1.082428,1.434915,1.0,1.108489,1.548481,1.0,1.12032,1.724898,1.0
max,2019-02-12 00:00:00,2021-12-17 00:00:00,2022-03-30 00:00:00,2542 days 00:00:00,2952 days 00:00:00,2017.0,50886150000.0,48463000000.0,50847410000.0,1149.921666,4.463235,1.0,1053.551536,4.463235,1.0,5.608296,5.5,1.0,1.545393,7.7,1.0,1.778245,10.725806,1.0,1.417877,10.725806,1.0,26.586742,10.725806,1.0,1381.608212,10.725806,1.0,27.02641,10.725806,1.0,1382.441888,14.44,1.0
std,,,,261 days 19:32:56.227515612,352 days 09:07:15.315570480,1.662585,2563575000.0,2441500000.0,2703078000.0,50.81476,0.243612,0.057353,59.877042,0.357091,0.074061,0.291709,0.415154,0.065989,0.112661,0.609037,0.083589,0.125538,0.883258,0.043271,0.100644,0.715716,0.037858,1.415957,0.864384,0.066984,68.190136,0.784601,0.048324,1.206733,0.831369,0.037523,47.638274,1.098709,0.055734


In [84]:
# Remove zero or negative durations from the per-contract frame.
zero_duration = datetime.timedelta(0)
for duration_col in ("estimated_duration", "actual_duration"):
    df3.loc[df3[duration_col] <= zero_duration, duration_col] = np.nan

In [None]:
# Contracts whose cost overrun in the 60% bucket is more than 20 times the estimate.
cost_cols = [f"cost_pct{i}" for i in range(0, 100, 10)]
df3.loc[df3["cost_pct60"].gt(20), ["estimated_cost", "final_cost"] + cost_cols]

# Extremely large cost overrun - roughly 2600%.
# This is definitely how it is entered in the original data sheet - maybe an error on the DoD's end, but definitely not a bug.

Unnamed: 0,estimated_cost,final_cost,cost_pct0,cost_pct10,cost_pct20,cost_pct30,cost_pct40,cost_pct50,cost_pct60,cost_pct70,cost_pct80,cost_pct90
047338_ait barracks,1655000.0,43079104.0,,,1.064873,,,,26.586742,27.02641,27.02641,27.331154


In [None]:
# Contracts whose cost overrun in the 70% bucket is more than 1000 times the estimate.
cost_cols = [f"cost_pct{i}" for i in range(0, 100, 10)]
df3.loc[df3["cost_pct70"].gt(1000), ["estimated_cost", "final_cost"] + cost_cols]

# This is almost definitely an error in the data sheet: the estimated cost is about 15.6 million
# while the final cost is about 20.6 billion.

Unnamed: 0,estimated_cost,final_cost,cost_pct0,cost_pct10,cost_pct20,cost_pct30,cost_pct40,cost_pct50,cost_pct60,cost_pct70,cost_pct80,cost_pct90
061551_electrical sys upgrade & natural gas sys,15608271.0,20550030000.0,,,,,,,,1381.608212,,1382.441888


In [87]:
# Contracts whose schedule overrun in the 50% bucket is more than 10 times the estimated duration.
schedule_cols = [f"schedule_pct{i}" for i in range(0, 100, 10)]
df3.loc[df3["schedule_pct50"].gt(10), schedule_cols]

Unnamed: 0,schedule_pct0,schedule_pct10,schedule_pct20,schedule_pct30,schedule_pct40,schedule_pct50,schedule_pct60,schedule_pct70,schedule_pct80,schedule_pct90
750_rotary hangar,,,,,10.725806,10.725806,10.725806,10.725806,10.725806,12.193548


In [None]:
# Remove cost overruns of 10,000% or more: keep only the contracts whose
# cost overrun in the 90% bucket is below 100 times the estimate.
# NOTE(review): the comparison is False for missing values, so contracts with
# no cost_pct90 value are dropped here as well - confirm that this is intended.
MAX_COST_OVERRUN_RATIO = 100
df3 = df3[df3["cost_pct90"].lt(MAX_COST_OVERRUN_RATIO)]

In [89]:
# Summary statistics after filtering on cost_pct90.
df3.describe()

Unnamed: 0,start_date,estimated_completion_date,actual_completion_date,estimated_duration,actual_duration,fiscal_year,estimated_cost,estimated_cost_nocontingency,final_cost,cost_pct0,schedule_pct0,expenditure_pct0,cost_pct10,schedule_pct10,expenditure_pct10,cost_pct20,schedule_pct20,expenditure_pct20,cost_pct30,schedule_pct30,expenditure_pct30,cost_pct40,schedule_pct40,expenditure_pct40,cost_pct50,schedule_pct50,expenditure_pct50,cost_pct60,schedule_pct60,expenditure_pct60,cost_pct70,schedule_pct70,expenditure_pct70,cost_pct80,schedule_pct80,expenditure_pct80,cost_pct90,schedule_pct90,expenditure_pct90
count,1184,1180,1157,1165,1144,1187.0,1188.0,1188.0,1188.0,96.0,80.0,96.0,95.0,81.0,95.0,95.0,85.0,95.0,119.0,111.0,119.0,127.0,115.0,127.0,127.0,120.0,127.0,188.0,174.0,188.0,257.0,244.0,257.0,343.0,316.0,343.0,1188.0,1057.0,1188.0
mean,2011-06-04 17:05:16.216216320,2012-12-09 21:48:12.203389952,2013-08-18 08:26:33.085566208,566 days 14:02:59.227467808,825 days 18:37:45.734265736,2010.590564,198700000.0,189238100.0,188244500.0,1.008822,1.066289,0.960098,1.027809,1.13503,0.960882,1.014204,1.16934,0.960443,1.014647,1.27475,0.955986,1.016816,1.428168,0.965851,1.0198,1.296574,0.9664,1.166611,1.376156,0.967428,1.14149,1.378932,0.975567,1.153944,1.479025,0.983878,1.165018,1.643299,0.992523
min,2007-09-06 00:00:00,1996-10-08 00:00:00,2009-12-04 00:00:00,35 days 00:00:00,80 days 00:00:00,2006.0,1380.0,1314.286,1498.0,0.559462,0.502479,0.701912,0.670431,0.502479,0.65953,0.755909,0.648649,0.038962,0.758068,0.502479,0.142407,0.544046,0.704132,0.744454,0.548839,0.648649,0.78469,0.555511,0.704132,0.006496,0.635297,0.449807,0.729623,0.672915,0.806971,0.492828,0.559462,0.52518,0.006496
25%,2010-08-23 18:00:00,2012-01-26 00:00:00,2013-01-29 00:00:00,385 days 00:00:00,592 days 18:00:00,2010.0,5289750.0,5037857.0,5730750.0,0.986697,1.0,0.946443,0.991412,1.0,0.951887,0.981673,1.0,0.953127,0.985892,1.0,0.956946,0.985117,1.0,0.962514,0.988626,1.0,0.956396,0.989667,1.0,0.967515,0.989177,1.033047,0.97164,1.007316,1.081003,0.983029,1.019403,1.128571,0.999855
50%,2011-07-13 00:00:00,2013-01-05 12:00:00,2013-09-30 00:00:00,540 days 00:00:00,756 days 00:00:00,2011.0,12181000.0,11600950.0,12765170.0,1.05,1.0,0.973176,1.05,1.0,0.976793,1.042525,1.032787,0.981491,1.042603,1.039683,0.982678,1.038297,1.088853,0.988599,1.05,1.071724,0.984188,1.05,1.107383,0.988145,1.05,1.154795,0.992537,1.057234,1.252954,0.996218,1.067037,1.371053,1.0
75%,2012-05-18 00:00:00,2013-10-25 18:00:00,2014-06-06 00:00:00,716 days 00:00:00,1001 days 00:00:00,2011.0,25042750.0,23850240.0,26601250.0,1.05,1.009161,0.993107,1.055974,1.137856,0.994331,1.056186,1.18192,0.99589,1.055974,1.244242,0.996964,1.060831,1.345475,0.998058,1.057013,1.279008,0.995277,1.071805,1.36981,0.998177,1.074702,1.427876,0.999589,1.106934,1.528344,0.999901,1.119807,1.725131,1.0
max,2014-11-12 00:00:00,2016-09-08 00:00:00,2017-07-01 00:00:00,1825 days 00:00:00,2952 days 00:00:00,2014.0,50886150000.0,48463000000.0,50847410000.0,1.325969,2.340741,1.0,1.317179,2.734848,1.0,1.26378,2.90625,1.0,1.145252,7.7,1.0,1.363589,10.725806,1.0,1.372558,10.725806,1.0,26.586742,10.725806,1.0,27.02641,10.725806,1.0,27.02641,10.725806,1.0,27.331154,14.44,1.0
std,,,,253 days 14:47:32.680196952,362 days 01:11:01.671285140,1.228874,2454189000.0,2337322000.0,2297137000.0,0.099646,0.222765,0.052448,0.082792,0.338331,0.052369,0.074229,0.350374,0.101145,0.072578,0.90023,0.093057,0.098265,1.289644,0.05331,0.101476,0.968411,0.044988,1.867878,1.002419,0.081117,1.624927,0.904033,0.039836,1.413807,0.885817,0.037312,1.14782,1.100064,0.055803


In [None]:
# Persist dataset
# The export below is currently commented out, so running this cell writes nothing.
# NOTE(review): index=False would drop the frame's index, which appears to duplicate
# the unique_id column - confirm before enabling.
#df3.to_csv("../data/datasets/dataset-for-overrun-analysis.csv", index=False)