In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier

In [2]:
df = pd.read_csv("https://datasocibmproject.s3.ap-southeast-2.amazonaws.com/structured_data/capital_project_schedules_and_budgets_1.csv")
df = df.drop(columns=['Unnamed: 0'])

In [3]:
# EXAMPLE: CLEAN DATA HERE
df['final_estimate_of_actual_costs_through_end_of_phase_amount'].fillna(df['final_estimate_of_actual_costs_through_end_of_phase_amount'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['final_estimate_of_actual_costs_through_end_of_phase_amount'].fillna(df['final_estimate_of_actual_costs_through_end_of_phase_amount'].mean(), inplace=True)


In [4]:
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [5]:
df_train.shape, df_test.shape

((6548, 15), (1637, 15))

In [6]:
df.columns

Index(['project_geographic_district', 'project_building_identifier',
       'project_school_name', 'project_type', 'project_description',
       'project_phase_name', 'project_status_name',
       'project_phase_actual_start_date', 'project_phase_planned_end_date',
       'project_phase_actual_end_date', 'project_budget_amount',
       'final_estimate_of_actual_costs_through_end_of_phase_amount',
       'total_phase_actual_spending_amount', 'dsf_number_s', 'failure'],
      dtype='object')

In [7]:
# CHOOSE THE MODEL YOU WANT TO USE
# model = LogisticRegression()
model = RandomForestClassifier()

"""
Generates predicted values for the 'failure' column in the test dataset

Usage: predict(['col_a', 'col_b', 'col_c'])
"""
def predict(features):
    model.fit(df_train[features], df_train['failure'])
    y_pred = model.predict(df_test[features])
    return y_pred


"""
Evaluates predictions against the true values in the test dataset

Usage: eval(y_pred)
"""
def eval(y_pred):
    y_actual = df_test['failure']

    # Print number of predictions and percentage
    print("-----------------------------------------")
    print("Predictions:")
    print(f"failure=true: {int(np.sum(y_pred))}/{len(y_pred)} ({np.sum(y_pred) / len(y_pred):.2%})")
    print(f"failure=false: {int(len(y_pred) - np.sum(y_pred))}/{len(y_pred)} ({1 - np.sum(y_pred) / len(y_pred):.2%})")
    print("-----------------------------------------")

    # Print accuracy
    print(f"Accuracy: {np.mean(y_pred == y_actual):.5%}")
    print("-----------------------------------------")

    # Compute confusion matrix
    cm = confusion_matrix(y_actual, y_pred)

    # Extract values from confusion matrix
    tn, fp, fn, tp = cm.ravel()

    # Calculate totals
    total_positives = tp + fn
    total_negatives = tn + fp

    # Print confusion matrix
    print(f"True positives: {tp}/{total_positives} ({tp / total_positives:.2%})")
    print(f"True negatives: {tn}/{total_negatives} ({tn / total_negatives:.2%})")
    print(f"False positives: {fp}/{total_negatives} ({fp / total_negatives:.2%})")
    print(f"False negatives: {fn}/{total_positives} ({fn / total_positives:.2%})")
    print("-----------------------------------------")

In [8]:
# Baseline, all no failure
eval(np.zeros(df_test.shape[0]))

-----------------------------------------
Predictions:
failure=true: 0/1637 (0.00%)
failure=false: 1637/1637 (100.00%)
-----------------------------------------
Accuracy: 84.91142%
-----------------------------------------
True positives: 0/247 (0.00%)
True negatives: 1390/1390 (100.00%)
False positives: 0/1390 (0.00%)
False negatives: 247/247 (100.00%)
-----------------------------------------


In [9]:
# EXAMPLE 
y_pred = predict(['final_estimate_of_actual_costs_through_end_of_phase_amount'])
eval(y_pred)

-----------------------------------------
Predictions:
failure=true: 223/1637 (13.62%)
failure=false: 1414/1637 (86.38%)
-----------------------------------------
Accuracy: 86.07208%
-----------------------------------------
True positives: 121/247 (48.99%)
True negatives: 1288/1390 (92.66%)
False positives: 102/1390 (7.34%)
False negatives: 126/247 (51.01%)
-----------------------------------------
