In [64]:
import pandas as pd
import datetime as datetime
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, confusion_matrix


In [23]:
df = pd.read_csv("FinalFinal.csv")

df.columns
# select non join columns and ML needed columns
columns = ['MNL_FEAT_1', 'MNL_MATE_1', 'MNL_LENGTH', 'MNL_INSTAL',
       'Width', 'NEAR_DIST', 'MUSYM', 'ARTCLASS', 'BLOCKNBR', 
       'SPEEDLIMIT', 'SURFACEWID', 'SURFACETYP', 'SLOPE_PCT', 'Size', 'DATE']
non_joins = df[columns]
# non_joins.head()

In [24]:
# select only years
non_joins['MNL_INSTAL'] = pd.to_datetime(non_joins['MNL_INSTAL'], format='%m/%d/%Y %H:%M:%S')
non_joins['MNL_INSTAL'] = non_joins['MNL_INSTAL'].map(lambda x: x.year)

non_joins['DATE'] = pd.to_datetime(non_joins['DATE'], format='%m/%d/%Y %H:%M:%S')
non_joins['DATE'] = non_joins['DATE'].map(lambda x: x.year)

#non_joins.head()

In [26]:
# filter out NaN from DATE col
breaks_df = non_joins[non_joins['DATE'].notna()]
# set size column to width column and drop size 
breaks_df['Width'] = breaks_df['Size']
breaks_df = breaks_df.drop('Size', axis=1)
# breaks_df.head()

## ML Pseudo Steps
#### 1) assign binary variable --> can be raw data set
#### 2) create dummy variables for categorical columns
* make sure soil column ['MUSYM'] is categorical
#### 3) list of years (and cycle through those)
#### 4) will have six year groups :
* train on [2009, 2010, 2011], test on [2012, 2013, 2014]
* move onto [2010, 2011, 2012], test on [2013, 2014, 2015], etc ...
#### 5) create function to create subset of data
* want to include where there is NOT a break year (those will be our non-broken positive examples)
* want to include where break year is in time frame of what we want
* exclude installs AFTER time frame window
* based on time window, calculate appropriate age of pipes 
    * (select beginning year of time frame --> ex: [2009, 2010, 2011] subtract install year from 2009)


In [28]:
pseudo_df = non_joins
# pseudo_df.head()

# fill Nan with 0 (idk why, but it just made it work FOR NOW)
# if date is not 0 (NaN), make width = size 
# then for all df, drop size column

pseudo_df['DATE'] = pseudo_df['DATE'].fillna(0)
pseudo_df.loc[pseudo_df['DATE'] != 0, ['Width']] = pseudo_df['Size']
pseudo_df = pseudo_df.drop('Size', axis=1)
# pseudo_df.head()

# change 'Width' values to numbers instead of strings (for dummy prep)
pseudo_df['Width'] = pd.to_numeric(pseudo_df['Width'], errors='coerce')
# pseudo_df['Width'].unique()

# Assign binary variables:
# [Create new column] If pipe has a broken date --> broken pipes = 0, non-broken pipes = 1
pseudo_df['TARGET'] = pseudo_df['DATE'].apply(lambda x: 1 if x == 0 else 0)

In [31]:
# Create dummy variables
dummy_df = pd.get_dummies(pseudo_df)
# test_df.columns

In [32]:
def get_data(df, start, end):
    """
    Takes in df and filters depending on timeframe (start and end years).
    Returns subset of data for training in timeframe.
    """
    # want to include where there is NOT a break year (those will be our non-broken positive examples --> 'DATE' == 0) - DATE col
    # want to include where break year is in time frame of what we want - DATE col
    train_df = df[(df['DATE'] == 0 )| ((df['DATE'] >= start) & (df['DATE'] <= end))]

    # exclude installs AFTER time frame window - MNL_INSTAL col
    train_df = train_df[(train_df['MNL_INSTAL'] <= end)]

    # based on time window, calculate appropriate age of pipes (select beginning year of time frame --> ex: 2009, 2010, 2011
    #       subtract install year from 2009) - MNL_INSTAL col
    # -- will create negative numbers
    train_df['AGE'] = start - train_df['MNL_INSTAL']

    return train_df


In [45]:
train_df = get_data(dummy_df, 2009, 2011)
# train_df.index

In [74]:
# trains
# tests
# move over
# rinse & repeat until 2019
def split_df(df):
    """
    """    
    df = df.dropna()
    feature_df = df.drop(['TARGET', 'DATE'], axis=1)
    target_df = df[['TARGET']]

    return feature_df, target_df

def train_v1(df):
    """
    """
    feature, target = split_df(df)
    etc = ExtraTreesClassifier()
    etc.fit(feature, target)

    return etc

In [81]:
# Training data with ExtraTreesClassifier and split data
extra_tree_model = train_v1(train_df)
feature, target = split_df(train_df)

# running a prediction model on training set
training_pred = extra_tree_model.predict(feature)
acc = accuracy_score(y_true=target, y_pred=training_pred)
acc

# confusion matrix for training
# confusion_matrix(y_true=target, y_pred=training_pred)

1.0

In [83]:
# testing on next years
test_2012 = get_data(dummy_df, 2012, 2014)
feature_2012, target_2012 = split_df(test_2012)

testing_pred_2012 = extra_tree_model.predict(feature_2012)
acc = accuracy_score(y_true=target_2012, y_pred=testing_pred_2012)

# confusion_matrix(y_true=target_2012, y_pred=testing_pred_2012)

0.9731478260869565

In [92]:
def main_function(dummy_df, start, end):
    """
    Takes in dummy dataframe and runs all functions based on given year ranges.
    Prints out year and accuracy per time range (3 years ranges)
    """
    for i in range(start, end - 4):
        start_train = i
        end_train = i + 2
        df = get_data(dummy_df, start_train, end_train)
        feature_train, target_train = split_df(df)
        extra_tree_model = train_v1(df)

        training_pred = extra_tree_model.predict(feature)
        print(start_train, "-", end_train, ": ", accuracy_score(y_true=target, y_pred=training_pred)) 

        # Test
        start_test = i + 3
        end_test = start_test + 2
        test = get_data(dummy_df, start_test, end_test)
        feature_test, target_test = split_df(test)
        testing_pred = extra_tree_model.predict(feature_test)
        print(start_test, "-", end_test, ": ", accuracy_score(y_true=target_test, y_pred=testing_pred))
        

In [93]:
main_function(dummy_df, 2009, 2019)

2009 - 2011 :  1.0
2012 - 2014 :  0.9732347826086957
2010 - 2012 :  0.9944101258611788
2013 - 2015 :  0.9717936271280468
2011 - 2013 :  0.993021558399943
2014 - 2016 :  0.974964457852214
2012 - 2014 :  0.9908140921795169
2015 - 2017 :  0.9755629345231919
2013 - 2015 :  0.9906716750040054
2016 - 2018 :  0.9785052394561358
2014 - 2016 :  0.9906538728570665
2017 - 2019 :  0.979710396254227
2015 - 2017 :  0.9904224449468606
2018 - 2020 :  0.9822378254371025
2016 - 2018 :  0.9905826642693109
2019 - 2021 :  0.9887457555921167
