In [1]:
import pandas as pd
import datetime as datetime
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, confusion_matrix


In [12]:
df = pd.read_csv("Pipes_Break20.csv")

df.columns
# select non join columns and ML needed columns
columns = ['DWW_Mainlines__Permitted_Use__MNL_FEAT_1', 'DWW_Mainlines__Permitted_Use__MNL_MATE_1', 
       'DWW_Mainlines__Permitted_Use__MNL_LENGTH', 'DWW_Mainlines__Permitted_Use__MNL_INSTAL',
       'Pipe_widths_Width', 'NEAR_DIST', 'MUSYM', 'ARTCLASS', 'BLOCKNBR', 
       'SPEEDLIMIT', 'SURFACEWID', 'SURFACETYP', 'SLOPE_PCT', 'Size', 'DATE']
non_joins = df[columns]
non_joins.head()

Unnamed: 0,DWW_Mainlines__Permitted_Use__MNL_FEAT_1,DWW_Mainlines__Permitted_Use__MNL_MATE_1,DWW_Mainlines__Permitted_Use__MNL_LENGTH,DWW_Mainlines__Permitted_Use__MNL_INSTAL,Pipe_widths_Width,NEAR_DIST,MUSYM,ARTCLASS,BLOCKNBR,SPEEDLIMIT,SURFACEWID,SURFACETYP,SLOPE_PCT,Size,DATE
0,Mainline,Concrete,314.79,1/1/1972 0:00:00,8.0,3964.60072,3055,2.0,9400.0,25.0,40.0,PCC,0.0,,
1,Mainline,Concrete,363.39,1/1/1972 0:00:00,8.0,3609.210948,3056,0.0,0.0,20.0,46.0,AC,4.0,,
2,Mainline,Concrete,323.51,1/1/1972 0:00:00,8.0,3451.254435,3056,0.0,0.0,20.0,46.0,AC,4.0,,
3,Mainline,Vitrified Clay,329.13,1/1/1928 0:00:00,12.0,833.915482,3056,0.0,5100.0,20.0,0.0,ST,6.0,,
4,Mainline,Reinforced Concrete Pipe,273.64,1/1/1928 0:00:00,18.0,852.426502,3056,2.0,10300.0,25.0,42.0,AC/PCC,4.0,,


In [14]:
# select only years
non_joins['DWW_Mainlines__Permitted_Use__MNL_INSTAL'] = pd.to_datetime(non_joins['DWW_Mainlines__Permitted_Use__MNL_INSTAL'], format='%m/%d/%Y %H:%M:%S')
non_joins['DWW_Mainlines__Permitted_Use__MNL_INSTAL'] = non_joins['DWW_Mainlines__Permitted_Use__MNL_INSTAL'].map(lambda x: x.year)

non_joins['DATE'] = pd.to_datetime(non_joins['DATE'], format='%m/%d/%Y %H:%M:%S')
non_joins['DATE'] = non_joins['DATE'].map(lambda x: x.year)

#non_joins.head()

In [15]:
# filter out NaN from DATE col
breaks_df = non_joins[non_joins['DATE'].notna()]
# set size column to width column and drop size 
breaks_df['Pipe_widths_Width'] = breaks_df['Size']
breaks_df = breaks_df.drop('Size', axis=1)
# breaks_df.head()

## ML Pseudo Steps
#### 1) assign binary variable --> can be raw data set
#### 2) create dummy variables for categorical columns
* make sure soil column ['MUSYM'] is categorical
#### 3) list of years (and cycle through those)
#### 4) will have six year groups :
* train on [2009, 2010, 2011], test on [2012, 2013, 2014]
* move onto [2010, 2011, 2012], test on [2013, 2014, 2015], etc ...
#### 5) create function to create subset of data
* want to include where there is NOT a break year (those will be our non-broken positive examples)
* want to include where break year is in time frame of what we want
* exclude installs AFTER time frame window
* based on time window, calculate appropriate age of pipes 
    * (select beginning year of time frame --> ex: [2009, 2010, 2011] subtract install year from 2009)


In [16]:
pseudo_df = non_joins
# pseudo_df.head()

# fill Nan with 0 (idk why, but it just made it work FOR NOW)
# if date is not 0 (NaN), make width = size 
# then for all df, drop size column

pseudo_df['DATE'] = pseudo_df['DATE'].fillna(0)
pseudo_df.loc[pseudo_df['DATE'] != 0, ['Pipe_widths_Width']] = pseudo_df['Size']
pseudo_df = pseudo_df.drop('Size', axis=1)
# pseudo_df.head()

# change 'Width' values to numbers instead of strings (for dummy prep)
pseudo_df['Width'] = pd.to_numeric(pseudo_df['Pipe_widths_Width'], errors='coerce')
# pseudo_df['Width'].unique()

# Assign binary variables:
# [Create new column] If pipe has a broken date --> broken pipes = 0, non-broken pipes = 1
pseudo_df['TARGET'] = pseudo_df['DATE'].apply(lambda x: 1 if x == 0 else 0)

In [17]:
# Create dummy variables
dummy_df = pd.get_dummies(pseudo_df)
# test_df.columns

In [20]:
def get_data(df, start, end):
    """
    Takes in df and filters depending on timeframe (start and end years).
    Returns subset of data for training in timeframe.
    """
    # want to include where there is NOT a break year (those will be our non-broken positive examples --> 'DATE' == 0) - DATE col
    # want to include where break year is in time frame of what we want - DATE col
    train_df = df[(df['DATE'] == 0 )| ((df['DATE'] >= start) & (df['DATE'] <= end))]

    # exclude installs AFTER time frame window - MNL_INSTAL col
    train_df = train_df[(train_df['DWW_Mainlines__Permitted_Use__MNL_INSTAL'] <= end)]

    # based on time window, calculate appropriate age of pipes (select beginning year of time frame --> ex: 2009, 2010, 2011
    #       subtract install year from 2009) - MNL_INSTAL col
    # -- will create negative numbers
    train_df['AGE'] = start - train_df['DWW_Mainlines__Permitted_Use__MNL_INSTAL']

    return train_df


In [21]:
train_df = get_data(dummy_df, 2009, 2011)
# train_df.index

In [22]:
# trains
# tests
# move over
# rinse & repeat until 2019
def split_df(df):
    """
    """    
    df = df.dropna()
    feature_df = df.drop(['TARGET', 'DATE'], axis=1)
    target_df = df[['TARGET']]

    return feature_df, target_df

def train_v1(df):
    """
    """
    feature, target = split_df(df)
    etc = ExtraTreesClassifier()
    etc.fit(feature, target)

    return etc

In [23]:
# Training data with ExtraTreesClassifier and split data
extra_tree_model = train_v1(train_df)
feature, target = split_df(train_df)

# running a prediction model on training set
training_pred = extra_tree_model.predict(feature)
acc = accuracy_score(y_true=target, y_pred=training_pred)
acc

# confusion matrix for training
# confusion_matrix(y_true=target, y_pred=training_pred)

1.0

In [28]:
# testing on next years
test_2012 = get_data(dummy_df, 2012, 2014)
feature_2012, target_2012 = split_df(test_2012)

testing_pred_2012 = extra_tree_model.predict(feature_2012)
acc = accuracy_score(y_true=target_2012, y_pred=testing_pred_2012)

cm = confusion_matrix(y_true=target_2012, y_pred=testing_pred_2012)


In [39]:
cm[0,0] / (cm[0,0] + cm[0, 1])

0.5986394557823129

In [42]:
def main_function(dummy_df, start, end):
    """
    Takes in dummy dataframe and runs all functions based on given year ranges.
    Prints out year and accuracy per time range (3 years ranges)
    """
    for i in range(start, end - 4):
        start_train = i
        end_train = i + 2
        df = get_data(dummy_df, start_train, end_train)
        feature_train, target_train = split_df(df)
        extra_tree_model = train_v1(df)

        training_pred = extra_tree_model.predict(feature)
        #print(start_train, "-", end_train, ": ", accuracy_score(y_true=target, y_pred=training_pred)) 

        # Test
        start_test = i + 3
        end_test = start_test + 2
        test = get_data(dummy_df, start_test, end_test)
        feature_test, target_test = split_df(test)
        testing_pred = extra_tree_model.predict(feature_test)
        #print(start_test, "-", end_test, ": ", accuracy_score(y_true=target_test, y_pred=testing_pred))
        
        # Print out break and non break accuracy's rather total full accuracy
        cm = confusion_matrix(y_true=target_test, y_pred=testing_pred)
        break_acc = cm[0,0] / (cm[0,0] + cm[0,1])
        print('Break accuracy: ', break_acc)
        nonbreak_acc = cm[1,1] / (cm[1, 0] + cm[1,1])
        print('Non-breal accuracy: ', nonbreak_acc)

In [43]:
main_function(dummy_df, 2009, 2019)

Break accuracy:  0.5986394557823129
Non-breal accuracy:  1.0
Break accuracy:  0.5738498789346247
Non-breal accuracy:  1.0
Break accuracy:  0.6211401425178147
Non-breal accuracy:  1.0
Break accuracy:  0.5989010989010989
Non-breal accuracy:  1.0
Break accuracy:  0.6025200458190149
Non-breal accuracy:  1.0
Break accuracy:  0.6052009456264775
Non-breal accuracy:  1.0
