## This notebook describes my imputation procedure for filling in missing values in my edge properties.

In [1]:
# Imports and miscellany

import csv
import itertools
import pickle
import string
import graphHandler as gh
import inventoryCleaner as iC
import networkx as nx
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

%matplotlib inline

# Helper functions

def representsInt(s):
    """Return True when ``int(s)`` succeeds, False otherwise.

    Parameters
    ----------
    s : object
        Usually a string token taken from a range specification.

    Returns
    -------
    bool
        False for non-numeric text (``'A'``, ``''``) and for values that
        ``int`` cannot accept at all (e.g. ``None``).
    """
    try:
        int(s)
    except (TypeError, ValueError):
        # ValueError: text that is not an integer literal.
        # TypeError: objects such as None that int() refuses outright.
        return False
    return True

def buildRange(x):
    """Expand a compact range specification into an explicit list of values.

    The specification is a comma-separated string. Each item is either a
    single value (``'5'``, ``'-1'``, ``'N'``) or an inclusive range written
    with a dash (``'1-4'`` -> 1, 2, 3, 4; ``'A-C'`` -> 'A', 'B', 'C').
    Whitespace around items and range endpoints is ignored, and empty items
    (e.g. from a trailing comma) are skipped.

    Parameters
    ----------
    x : str or missing value
        The specification; a null value (as judged by ``pd.isnull``) means
        "no range given".

    Returns
    -------
    list or float
        A list of ints and/or strings, or ``np.nan`` when ``x`` is null.

    Raises
    ------
    ValueError
        If a dashed item does not split into exactly two endpoints, or a
        numeric range has a non-integer upper endpoint.
    TypeError
        If a non-numeric range endpoint is not a single character.
    """
    if pd.isnull(x):
        return np.nan

    def _toInt(token):
        # Integer value of ``token``, or None when it is not an integer literal.
        try:
            return int(token)
        except ValueError:
            return None

    result = []
    for part in x.split(','):
        part = part.strip()
        if not part:
            # Nothing between two commas / after a trailing comma.
            continue
        whole = _toInt(part)
        if whole is not None:
            # Checked first so that a negative literal such as '-1' is not
            # mistaken for a dashed range.
            result.append(whole)
        elif '-' in part:
            a, b = (endpoint.strip() for endpoint in part.split('-'))
            start = _toInt(a)
            if start is None:
                # Character range, e.g. 'A-C'.
                result.extend(chr(i) for i in range(ord(a), ord(b) + 1))
            else:
                result.extend(range(start, int(b) + 1))
        else:
            result.append(part)
    return result

def existin(x):
    """Return the 'Reasonable range' entries recorded for variable ``x``.

    Reads the notebook-level ``reqs`` table and returns, as a list, the
    'Reasonable range' value of every row whose 'Variable' equals ``x``
    (an empty list when there is no such row).
    """
    matches = reqs['Variable'] == x
    return reqs.loc[matches, 'Reasonable range'].tolist()

def returnProps(net, u, v, prop):
    """Return attribute ``prop`` stored on the edge ``(u, v)`` of ``net``.

    ``net`` is indexed as ``net[u][v]`` to obtain the edge's attribute
    mapping; a missing node, edge or attribute raises ``KeyError``.
    """
    edgeData = net[u][v]
    return edgeData[prop]

def returnNeighborProps(net,u,v,props):
    """Collect the values of ``props`` found on the edges touching ``u`` and ``v``.

    Parameters
    ----------
    net : graph-like
        Must provide ``net.neighbors(node)`` and ``net[a][b][prop]`` lookups.
    u, v : hashable
        End nodes of the edge whose surroundings are inspected.
    props : iterable
        Names of the edge attributes to gather.

    Returns
    -------
    dict
        Maps each name in ``props`` to the list of non-None values seen on
        the edges ``(u, n)`` for every neighbor ``n`` of ``u``, followed by
        the edges ``(v, n)`` for every neighbor ``n`` of ``v``.

    Notes
    -----
    * ``None`` values are skipped. The previous check compared a type object
      with the string ``'NoneType'``, which is never equal, so ``None`` was
      always collected.
    * Edges around ``v`` are read as ``net[v][n]`` so the lookup matches the
      node whose neighbors are being listed; on an undirected graph this is
      the same data as ``net[n][v]``.
    * NOTE(review): the edge ``(u, v)`` itself is included whenever ``v`` is a
      neighbor of ``u`` (and vice versa) - confirm whether that is intended.
    """
    propertyTracker = {el: [] for el in props}
    for node in (u, v):
        for n in net.neighbors(node):
            edgeData = net[node][n]
            for p in props:
                newp = edgeData[p]
                if newp is not None:
                    propertyTracker[p].append(newp)
    return propertyTracker

def getConsensus(props, reqs):
    """Pick one replacement value per property from its candidate values.

    Parameters
    ----------
    props : dict
        Maps a property name to a sequence of candidate values.
    reqs : pandas.DataFrame
        Must contain the columns 'Variable' and 'Categorical'; the first row
        whose 'Variable' equals the property name decides how it is treated.

    Returns
    -------
    dict
        For every key of ``props``: a uniformly random candidate when the
        property is categorical, the mean of the candidates (as a Python
        float) when it is not, and ``np.nan`` when there are no candidates.

    Raises
    ------
    IndexError
        If a property name has no row in ``reqs``.
    """
    bestGuess = {}
    for key in props:
        # .loc is used because the .ix indexer no longer exists in current pandas.
        iscategorical = reqs.loc[reqs['Variable'] == key, 'Categorical']
        treatAsCategorical = iscategorical.values[0]
        candidates = list(props[key])
        if not candidates:
            # Nothing to draw from or to average: report "still missing".
            bestGuess[key] = np.nan
        elif treatAsCategorical:
            # Index into the list rather than handing it to np.random.choice,
            # so the chosen value keeps its original Python type instead of
            # being coerced to a numpy scalar.
            bestGuess[key] = candidates[np.random.randint(len(candidates))]
        else:
            bestGuess[key] = float(np.mean(candidates))
    return bestGuess

In [2]:
# Start by reading the caret-delimited CSV that lists, for every variable in the
# data set, its properties and requirements.

pathtoreqs = '/home/louisf/Documents/Insight/massdriver/docs/road_inventory_params.csv'
reqs = pd.read_csv(pathtoreqs, sep='^')

In [3]:
# Expand every textual range specification into an explicit list of allowed
# values, then keep only the variables that are not flagged as disqualified.
# Note: this overwrites 'Reasonable range' and `reqs`, so run it once per load.
reqs['Reasonable range'] = reqs['Reasonable range'].map(buildRange)
notDisqualified = reqs['Disqualify'] == False
reqs = reqs[notDisqualified]

In [4]:
# Load in my database
fpath = '/home/louisf/Documents/Insight/massdriver/data/raw/shapefile/RoadInventory.geojson'
pdinv = iC.PDInventory('roadInventory', 'test')
pdinv.geojsontodf(fpath)

toy = pdinv.table
# Keep only the columns that are listed exactly once in the requirements table.
# The unwanted columns are gathered first and removed with a single in-place
# drop (toy is the same object as pdinv.table, so that table shrinks as well).
unlistedColumns = [colname for colname in toy.columns
                   if (reqs['Variable'] == colname).sum() != 1]
toy.drop(unlistedColumns, axis=1, inplace=True)
        


In [None]:
# Check on how many of the remaining features exceed some threshold of in-range values.

# Features with fewer in-range values than this are the candidates for removal
# (the original note describes the cut-off as 10% of the values).
MIN_IN_RANGE = 50000

df = pd.DataFrame(index=toy.columns,columns=['value', 'hasbounds'])

for colname in toy.columns:
    nrange = existin(colname)[0]
    # A float here is the NaN that buildRange returns when no range was given.
    hasbounds = type(nrange) != float
    # Write each cell with a single .loc[row, column] call: chained indexing
    # (df.loc[row]['hasbounds'] = ...) may assign to a temporary copy.
    df.loc[colname, 'hasbounds'] = hasbounds
    if hasbounds:
        df.loc[colname, 'value'] = toy[colname].isin(nrange).sum()

# Examine only entries with bounds for removal:

bounded = df.loc[df['hasbounds']==True]
bounded['value'].astype(float).plot(kind='bar')

# Keep only the entries with fewer than MIN_IN_RANGE values within bounds;
# entries without bounds have a NaN count and drop out of this comparison.
df = df[df['value']<MIN_IN_RANGE]

# The indices of df correspond to all of the properties we will drop from our table.

# Based on this inspection, we will drop:
# fromcity
# fromstate
# iristatus
# jurisdicti
# opposingdi
# oppositenu
# tostate

# This removal occurs in road_inventory_params.

In [None]:
# Next, I will load in my graph. I will use the pickle I generated previously.
# NOTE(review): unpickling can execute arbitrary code stored in the file, so only
# load pickles that were produced locally by a trusted run.
picklepath = '/home/louisf/Documents/Insight/massdriver/notebooks/graph_with_risk2.pickle'
graph = nx.read_gpickle(picklepath)

In [None]:
# Keep a separate handle on the object stored in the loaded graph's `net` attribute
# (presumably the underlying networkx graph - confirm against graphHandler).
newNet = graph.net

In [6]:
# Just kidding! I need to generate a new graph with all of the properties I am interested in.
# This rebinds `graph`, replacing whatever an earlier cell loaded from the pickle.

graph = gh.NetworkGenerator()
filepath = '/home/louisf/Documents/Insight/massdriver/data/raw/shapefile/RI_converted.shp'
# The retained `toy` column names are passed as `fields`; the effect of `simplify`
# is defined in graphHandler and is not visible from this notebook.
graph.loadGraph(filepath=filepath, fields=toy.columns.tolist(), simplify=True)
# Left disabled: later cells use graph.net directly.
#newNet = graph.net

good = 481430, bad = 30


In [9]:
# Iterate over each edge. Find the properties on that edge that are missing
# (i.e. whose value is outside the property's reasonable range) and replace
# them with a consensus of the values found on the neighboring edges.
importantProperties = toy.columns.tolist()

# The reasonable range of a property does not change while the loop runs, so
# look each one up once here instead of once per edge. Properties with no row
# in reqs, or with a NaN (float) range, have no bounds and are never imputed.
boundedProperties = []
for props in importantProperties:
    ranges = reqs.loc[reqs['Variable']==props]['Reasonable range'].tolist()
    if len(ranges)>0 and type(ranges[0])!=float:
        boundedProperties.append((props, ranges[0]))

it=0
for u,v in graph.net.edges_iter():
    g = graph.net[u][v]
    missingProps = [props for props, allowed in boundedProperties
                    if g[props] not in allowed]
    if missingProps:
        neighborProps = returnNeighborProps(graph.net, u, v, missingProps)
        newvals = getConsensus(neighborProps, reqs)
        # Values are written back immediately, so edges visited later may draw
        # on values that were themselves imputed earlier in this loop.
        for key, value in newvals.items():
            g[key]=value
    # Lightweight progress report every 10000 edges.
    if(it%10000==0):
        print(it)
    it+=1


0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000
440000
450000
460000
470000


In [8]:
# Display the attribute dict of one edge; u and v are whatever values the most
# recently executed edge loop left behind.
graph.net[u][v]

{'Json': '{ "type": "LineString", "coordinates": [ [ -70.153843479591174, 41.706992727510141 ], [ -70.153932944523177, 41.707016547034755 ], [ -70.154031954609749, 41.70702533890438 ], [ -70.154184807948582, 41.707013804006941 ], [ -70.154399251777008, 41.706976316858956 ], [ -70.154539872460305, 41.706941764806857 ], [ -70.154698006664574, 41.706920762011656 ], [ -70.154996279821475, 41.706888072748512 ], [ -70.155342587035804, 41.706869292625669 ], [ -70.155474978634061, 41.706863231118874 ], [ -70.155515500053895, 41.706871335354428 ] ] }',
 'ShpName': 'RI_converted',
 'Wkb': b'\x00\x00\x00\x00\x02\x00\x00\x00\x0b\xc0Q\x89\xd8\x92Rc\x01@D\xda~\xbc\xd9\x95<\xc0Q\x89\xda\t\x90\xa0B@D\xda\x7f\x84\xa9\x9f`\xc0Q\x89\xdb\xa8\xd7\xe5\x9f@D\xda\x7f\xcej\x04\xd1\xc0Q\x89\xde)\xf4\xebx@D\xda\x7fm\xa7\x03\xe3\xc0Q\x89\xe1\xadf:/@D\xda~3/\xfaV\xc0Q\x89\xe3\xfb4\x89H@D\xda}\x11X\x02\xd0\xc0Q\x89\xe6\x92w\xd8^@D\xda|a(\xda\x0b\xc0Q\x89\xebu\x845o@D\xda{N\xf11\x94\xc0Q\x89\xf1"\x08\xbf\xab@D\xdaz\

In [11]:
# Flatten the graph into one record per edge, keeping only the variables that
# are listed in the requirements table.
dictlist = []

for u,v in graph.net.edges_iter():
    edgeAttributes = graph.net[u][v]
    dictlist.append({var: edgeAttributes[var] for var in reqs['Variable']})
    

In [12]:
# Turn the per-edge records into a table: one row per edge, one column per variable.
newDB = pd.DataFrame(dictlist)

In [13]:
# Persist the per-edge table as CSV for later steps.
newDB.to_csv('/home/louisf/Documents/Insight/massdriver/data/intermediateGraphDB.csv')

In [14]:
# Save the graph, including the edge attributes filled in above, as a pickle.
nx.write_gpickle(graph.net, '/home/louisf/Documents/Insight/massdriver/data/largeGraph_filled.pickle')

In [26]:
# Attempt to export the graph as shapefiles into the data directory.
# NOTE(review): this call failed with the NotImplementedError recorded below
# (no matching OGR SetField overload); presumably at least one edge attribute
# has a type OGR cannot store - confirm and convert before relying on this export.
nx.write_shp(graph.net, '/home/louisf/Documents/Insight/massdriver/data/')

NotImplementedError: Wrong number of arguments for overloaded function 'Feature_SetField'.
  Possible C/C++ prototypes are:
    SetField(OGRFeatureShadow *,int,char const *)
    SetField(OGRFeatureShadow *,char const *,char const *)
    SetField(OGRFeatureShadow *,int,double)
    SetField(OGRFeatureShadow *,char const *,double)
    SetField(OGRFeatureShadow *,int,int,int,int,int,int,float,int)
    SetField(OGRFeatureShadow *,char const *,int,int,int,int,int,float,int)


In [None]:
# Reload a pickled graph into newNet.
# NOTE(review): this reads 'largeGraph.npickle', whereas the graph saved earlier in
# this notebook was written to 'largeGraph_filled.pickle' - confirm the intended file.
# Only unpickle files produced by a trusted run.
newNet = nx.read_gpickle('/home/louisf/Documents/Insight/massdriver/data/largeGraph.npickle')

In [None]:
# For each node on the edge, find the properties of the edges attached to that node.

In [None]:
# Find all properties in that set whose values are within an acceptable range, and use those to impute the missing value.