In [68]:
import pandas as pd
import datetime as datetime

In [90]:
# Load the raw CSV export; later cells select their working columns from `df`.
df = pd.read_csv("FinalFinal.csv")

In [91]:
# Select the non-join columns plus the columns the ML steps need.
columns = ['MNL_FEAT_1', 'MNL_MATE_1', 'MNL_LENGTH', 'MNL_INSTAL',
       'Width', 'NEAR_DIST', 'MUSYM', 'ARTCLASS', 'BLOCKNBR', 
       'SPEEDLIMIT', 'SURFACEWID', 'SURFACETYP', 'SLOPE_PCT', 'Size', 'DATE']
# Take an explicit copy: later cells assign columns on `non_joins`, and writing
# into a bare slice of `df` raises SettingWithCopyWarning and leaves it
# ambiguous whether `df` itself is modified.
non_joins = df[columns].copy()
# non_joins.head()

In [92]:
# Reduce the install and break timestamps to their calendar year.
# `.dt.year` is the vectorized accessor (no per-row Python lambda); missing
# timestamps (NaT) come out as NaN. `.assign` builds a new frame, so nothing
# is written into a slice of `df`.
non_joins = non_joins.assign(
    MNL_INSTAL=pd.to_datetime(non_joins['MNL_INSTAL'], format='%m/%d/%Y %H:%M:%S').dt.year,
    DATE=pd.to_datetime(non_joins['DATE'], format='%m/%d/%Y %H:%M:%S').dt.year,
)

#non_joins.head()

In [93]:
# Keep only rows that have a break year (DATE is not NaN), overwrite 'Width'
# with the 'Size' value, then drop 'Size'.
# Built as one chain so the result is a fresh frame: assigning a column on the
# boolean-filtered slice directly triggers SettingWithCopyWarning.
breaks_df = (
    non_joins[non_joins['DATE'].notna()]
    .assign(Width=lambda rows: rows['Size'])
    .drop(columns='Size')
)
# breaks_df.head()

## ML Pseudo Steps
#### 1) assign binary target variable (1 = no recorded break, 0 = broken) --> can be done on the raw data set
#### 2) create dummy variables for categorical columns
* make sure soil column ['MUSYM'] is categorical
#### 3) list of years (and cycle through those)
#### 4) will have six year groups :
* train on [2009, 2010, 2011], test on [2012, 2013, 2014]
* then move on to train on [2010, 2011, 2012], test on [2013, 2014, 2015], etc ...
#### 5) create function to create subset of data
* want to include where there is NOT a break year (those will be our non-broken positive examples)
* want to include where break year is in time frame of what we want
* exclude installs AFTER time frame window
* based on time window, calculate appropriate age of pipes 
    * (select beginning year of time frame --> ex: [2009, 2010, 2011] subtract install year from 2009)


In [94]:
# Work on an independent copy. A plain `pseudo_df = non_joins` only aliases the
# same object, so the in-place DATE fill in the next cell would also rewrite
# `non_joins` and break the earlier `notna()` filter if that cell is re-run.
pseudo_df = non_joins.copy()
# pseudo_df.head()

In [95]:
# Missing break years become the sentinel 0. NaN compares unequal to
# everything (so `NaN != 0` is True); without the fill the mask below would
# also select rows that have no break year.
# Rows with a break year (DATE != 0) take their 'Width' from 'Size';
# afterwards 'Size' is dropped for the whole frame.

pseudo_df['DATE'] = pseudo_df['DATE'].fillna(value=0)
pseudo_df.loc[pseudo_df['DATE'].ne(0), ['Width']] = pseudo_df['Size']
pseudo_df = pseudo_df.drop(columns='Size')
# pseudo_df.head()

In [107]:
# Convert 'Width' from strings to numbers ahead of dummy encoding;
# entries that cannot be parsed become NaN (errors='coerce').
pseudo_df['Width'] = pd.to_numeric(arg=pseudo_df['Width'], errors='coerce')
# pseudo_df['Width'].unique()

array([  8.  ,  12.  ,  18.  ,  30.  ,   6.  ,  24.  ,  15.  ,  27.  ,
        10.  ,  48.  ,  72.  ,  42.  ,  36.  ,   4.  ,  90.  ,   2.  ,
        20.  ,  21.  , 108.  ,  60.  ,  16.  , 102.  ,  54.  ,  96.  ,
        14.  , 120.  ,  66.  ,  84.  , 138.  ,   0.  ,  78.  ,   1.  ,
       114.  , 126.  , 144.  ,   3.  ,   1.5 , 150.  ,  50.  ,  64.  ,
        31.  , 132.  ,    nan,  35.  ,  28.  ,  22.  ,  40.  ,  70.  ,
        33.  ,  32.  ,   0.75,  29.  ,  34.  ,  26.  ,   9.  ,  23.  ,
        44.  , 118.  ,  49.  ,  39.  ,  38.  ,  75.  ,  43.  ,  51.5 ,
        58.  ,  65.  ,  17.  , 176.  ])

In [79]:
# Binary target column:
# TARGET = 1 for pipes with no break year (DATE == 0), TARGET = 0 for broken pipes.
pseudo_df['TARGET'] = pseudo_df['DATE'].eq(0).astype('int64')

In [108]:
# One-hot encode the categorical (string-typed) columns of pseudo_df.
dummy_df = pd.get_dummies(data=pseudo_df)
# dummy_df.columns

In [136]:
def get_training_data(df: pd.DataFrame, start: int, end: int) -> pd.DataFrame:
    """
    Build the training subset of `df` for the year window [start, end].

    A row is kept when both of these hold:
      * 'DATE' is 0 (no break year recorded) or lies in [start, end] inclusive;
      * 'MNL_INSTAL' is <= end (pipes installed after the window are excluded).

    Parameters
    ----------
    df : DataFrame with numeric year columns 'DATE' (0 = no break) and 'MNL_INSTAL'.
    start : first year of the window (inclusive).
    end : last year of the window (inclusive).

    Returns
    -------
    A new DataFrame (independent copy; `df` is not modified) holding the kept
    rows plus an 'AGE' column equal to `start - MNL_INSTAL`.
    """
    # Non-broken examples (DATE == 0) or breaks that fall inside the window.
    never_broke = df['DATE'] == 0
    broke_in_window = (df['DATE'] >= start) & (df['DATE'] <= end)

    # Exclude installs AFTER the time frame window.
    installed_by_end = df['MNL_INSTAL'] <= end

    # Explicit copy so adding 'AGE' below does not write into a slice of the
    # caller's frame (which would raise SettingWithCopyWarning).
    train_df = df[(never_broke | broke_in_window) & installed_by_end].copy()

    # Age is measured from the first year of the window.
    # NOTE: pipes installed after `start` but no later than `end` get a negative AGE.
    train_df['AGE'] = start - train_df['MNL_INSTAL']

    return train_df


In [137]:
# First training window: break years 2009 through 2011.
train_df = get_training_data(df=dummy_df, start=2009, end=2011)