In [None]:
# # fit model no training data
# model = XGBClassifier()
# model.fit(X_train, y_train)

# # make predictions for test data
# y_pred = model.predict(X_test)
# predictions = [round(value) for value in y_pred]

# # see accuracy of model
# accuracy = accuracy_score(y_test, predictions)
# print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [82]:
import pandas as pd
import datetime as datetime
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# for xgboost 
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, average_precision_score, recall_score


In [83]:
df = pd.read_csv("FinalFinal.csv")

df.columns
# select non join columns and ML needed columns
columns = ['MNL_FEAT_1', 'MNL_MATE_1', 'MNL_LENGTH', 'MNL_INSTAL',
       'Width', 'NEAR_DIST', 'MUSYM', 'ARTCLASS', 'BLOCKNBR', 
       'SPEEDLIMIT', 'SURFACEWID', 'SURFACETYP', 'SLOPE_PCT', 'Size', 'DATE']
non_joins = df[columns]
# non_joins.head()

In [84]:
# select only years
non_joins['MNL_INSTAL'] = pd.to_datetime(non_joins['MNL_INSTAL'], format='%m/%d/%Y %H:%M:%S')
non_joins['MNL_INSTAL'] = non_joins['MNL_INSTAL'].map(lambda x: x.year)

non_joins['DATE'] = pd.to_datetime(non_joins['DATE'], format='%m/%d/%Y %H:%M:%S')
non_joins['DATE'] = non_joins['DATE'].map(lambda x: x.year)

#non_joins.head()

In [85]:
# filter out NaN from DATE col
breaks_df = non_joins[non_joins['DATE'].notna()]
# set size column to width column and drop size 
breaks_df['Width'] = breaks_df['Size']
breaks_df = breaks_df.drop('Size', axis=1)
# breaks_df.head()

## ML Pseudo Steps
#### 1) assign binary variable --> can be raw data set
#### 2) create dummy variables for categorical columns
* make sure soil column ['MUSYM'] is categorical
#### 3) list of years (and cycle through those)
#### 4) will have six year groups :
* train on [2009, 2010, 2011], test on [2012, 2013, 2014]
* move onto [2010, 2011, 2012], test on [2013, 2014, 2015], etc ...
#### 5) create function to create subset of data
* want to include where there is NOT a break year (those will be our non-broken positive examples)
* want to include where break year is in time frame of what we want
* exclude installs AFTER time frame window
* based on time window, calculate appropriate age of pipes 
    * (select beginning year of time frame --> ex: [2009, 2010, 2011] subtract install year from 2009)


In [97]:
pseudo_df = non_joins
# pseudo_df.head()

# fill Nan with 0 (idk why, but it just made it work FOR NOW)
# if date is not 0 (NaN), make width = size 
# then for all df, drop size column

pseudo_df['DATE'] = pseudo_df['DATE'].fillna(0)
pseudo_df.loc[pseudo_df['DATE'] != 0, ['Width']] = pseudo_df['Size']
pseudo_df = pseudo_df.drop('Size', axis=1)
# pseudo_df.head()

# change 'Width' values to numbers instead of strings (for dummy prep)
pseudo_df['Width'] = pd.to_numeric(pseudo_df['Width'], errors='coerce')
# pseudo_df['Width'].unique()

# Assign binary variables:
# [Create new column] If pipe has a broken date --> broken pipes = 0, non-broken pipes = 1
pseudo_df['TARGET'] = pseudo_df['DATE'].apply(lambda x: 1 if x == 0 else 0)


In [87]:
# pseudo_df = pseudo_df.drop('DATE', axis=1)

In [98]:
# Create dummy variables
dummy_df = pd.get_dummies(pseudo_df)
# test_df.columns
dummy_df.head()

Unnamed: 0,MNL_LENGTH,MNL_INSTAL,Width,NEAR_DIST,MUSYM,ARTCLASS,BLOCKNBR,SPEEDLIMIT,SURFACEWID,SLOPE_PCT,DATE,TARGET,MNL_FEAT_1_Mainline,MNL_FEAT_1_PHANTOM CONNECTOR,MNL_FEAT_1_Stub,MNL_FEAT_1_Unknown,MNL_MATE_1_,MNL_MATE_1_Acrylonitrile Butadiene Styrene,MNL_MATE_1_Asbestos Cement,MNL_MATE_1_Brick,MNL_MATE_1_Cast Iron Pipe,MNL_MATE_1_Concrete,MNL_MATE_1_Corrugated Flexible Plastic,MNL_MATE_1_Corrugated Metal Pipe,MNL_MATE_1_Corrugated Rigid Plastic,MNL_MATE_1_Ductile Iron Pipe,MNL_MATE_1_High Density Polyethylene,MNL_MATE_1_Other,MNL_MATE_1_Polyvinyl Chloride,MNL_MATE_1_Reinforced Concrete Box,MNL_MATE_1_Reinforced Concrete Pipe,MNL_MATE_1_Steel,MNL_MATE_1_Unknown,MNL_MATE_1_Vitrified Clay,MNL_MATE_1_Wood Stave Pipe,SURFACETYP_,SURFACETYP_AC,SURFACETYP_AC/AC,SURFACETYP_AC/PCC,SURFACETYP_GRAVEL,SURFACETYP_PCC,SURFACETYP_ST
0,314.79,1972,8.0,3891.620149,3055,2,9400,25,40,0,0.0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,363.39,1972,8.0,3609.210867,3056,0,0,20,46,4,0.0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
2,323.51,1972,8.0,3451.254202,3056,0,0,20,46,4,0.0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,329.13,1928,12.0,771.313134,3056,0,5100,20,0,6,0.0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1
4,273.64,1928,18.0,833.92116,3056,2,10300,25,42,4,0.0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0


In [99]:
def get_data(df, start, end):
    """
    Takes in df and filters depending on timeframe (start and end years).
    Returns subset of data for training in timeframe.
    """
    # want to include where there is NOT a break year (those will be our non-broken positive examples --> 'DATE' == 0) - DATE col
    # want to include where break year is in time frame of what we want - DATE col
    train_df = df[(df['DATE'] == 0 )| ((df['DATE'] >= start) & (df['DATE'] <= end))]

    # exclude installs AFTER time frame window - MNL_INSTAL col
    train_df = train_df[(train_df['MNL_INSTAL'] <= end)]

    # based on time window, calculate appropriate age of pipes (select beginning year of time frame --> ex: 2009, 2010, 2011
    #       subtract install year from 2009) - MNL_INSTAL col
    # -- will create negative numbers
    train_df['AGE'] = start - train_df['MNL_INSTAL']

    return train_df


In [100]:
train_df = get_data(dummy_df, 2009, 2011)
# train_df.index

In [101]:
# trains
# tests
# move over
# rinse & repeat until 2019
def split_df(df):
    """
    """    
    df = df.dropna()
    feature_df = df.drop(['TARGET', 'DATE'], axis=1)
    target_df = df[['TARGET']]

    return feature_df, target_df

def train_v1(df, model):
    """
    """
    feature, target = split_df(df)
    etc = ExtraTreesClassifier()
    etc.fit(feature, target)

    return etc

In [102]:
# xgboost splitting
feature, target = split_df(train_df)

# x_train, x_test, y_train, y_test = train_test_split(feature, target)


In [103]:
# fit model on training data
model = XGBClassifier()
model.fit(feature, target)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [104]:
# make predictions for test data
y_pred = model.predict(x_test)
predictions = [round(value) for value in y_pred]

In [105]:
# evaluate predictions
balanced_accuracy = balanced_accuracy_score(y_test, predictions)
print("Balanced Accuracy Score: %.2f%%" % (balanced_accuracy * 100.0))

recall = recall_score(y_test, predictions)
print("Recall Score: %.2f%%" % (recall * 100.0))

Balanced Accuracy Score: 70.00%
Balanced Accuracy Score: 99.99%


In [81]:
# Training data with ExtraTreesClassifier and split data
extra_tree_model = train_v1(train_df)
feature, target = split_df(train_df)

# running a prediction model on training set
training_pred = extra_tree_model.predict(feature)
acc = accuracy_score(y_true=target, y_pred=training_pred)
acc

# confusion matrix for training
# confusion_matrix(y_true=target, y_pred=training_pred)

1.0

In [83]:
# testing on next years
test_2012 = get_data(dummy_df, 2012, 2014)
feature_2012, target_2012 = split_df(test_2012)

testing_pred_2012 = extra_tree_model.predict(feature_2012)
acc = accuracy_score(y_true=target_2012, y_pred=testing_pred_2012)

# confusion_matrix(y_true=target_2012, y_pred=testing_pred_2012)

0.9731478260869565

In [92]:
def main_function(dummy_df, start, end):
    """
    Takes in dummy dataframe and runs all functions based on given year ranges.
    Prints out year and accuracy per time range (3 years ranges)
    """
    for i in range(start, end - 4):
        start_train = i
        end_train = i + 2
        df = get_data(dummy_df, start_train, end_train)
        feature_train, target_train = split_df(df)
        extra_tree_model = train_v1(df)

        training_pred = extra_tree_model.predict(feature)
        print(start_train, "-", end_train, ": ", accuracy_score(y_true=target, y_pred=training_pred)) 

        # Test
        start_test = i + 3
        end_test = start_test + 2
        test = get_data(dummy_df, start_test, end_test)
        feature_test, target_test = split_df(test)
        testing_pred = extra_tree_model.predict(feature_test)
        print(start_test, "-", end_test, ": ", accuracy_score(y_true=target_test, y_pred=testing_pred))
        

In [93]:
main_function(dummy_df, 2009, 2019)

2009 - 2011 :  1.0
2012 - 2014 :  0.9732347826086957
2010 - 2012 :  0.9944101258611788
2013 - 2015 :  0.9717936271280468
2011 - 2013 :  0.993021558399943
2014 - 2016 :  0.974964457852214
2012 - 2014 :  0.9908140921795169
2015 - 2017 :  0.9755629345231919
2013 - 2015 :  0.9906716750040054
2016 - 2018 :  0.9785052394561358
2014 - 2016 :  0.9906538728570665
2017 - 2019 :  0.979710396254227
2015 - 2017 :  0.9904224449468606
2018 - 2020 :  0.9822378254371025
2016 - 2018 :  0.9905826642693109
2019 - 2021 :  0.9887457555921167
