In [1]:
# Import Statements 
####################
import os 
import sys
import copy
import pandas as pd
import pathlib as pl
import numpy as np
# import matplotlib.pyplot as plt
# from PIL import Image, ImageDraw
import pathlib
import datetime
import itertools as ite 
import math
import calendar

import shapefile  #conda install -c conda-forge pyshp    # (version should be 2.0)
from shapely.geometry import Point   #conda install -c conda-forge shapely
from shapely.geometry import shape


# Number of time-of-day bins per calendar day. The data-generation cells below
# build one image per (date, bin) pair, with bins numbered 0 .. NUM_TIME_BINS_PER_DAY-1.
NUM_TIME_BINS_PER_DAY = 24

In [2]:
_="""

Define any useful functions 

"""

def cleanURL(url):
    """Return `url` normalised through pathlib as a forward-slash path string.

    The path is parsed with ``pathlib.Path`` and rendered with ``as_posix()``,
    so on Windows the backslash separators of a raw path become forward slashes.
    """
    return str(pathlib.Path(url).as_posix())


def getDF(loc, sheetname):
    """Read one Excel sheet and strip surrounding whitespace from every string cell.

    Parameters
    ----------
    loc : anything accepted by ``pd.read_excel`` as its first argument (path, buffer, ...).
    sheetname : sheet name or index, passed straight through to ``pd.read_excel``.

    Returns
    -------
    pandas.DataFrame
        The sheet contents; cells whose type is exactly ``str`` are stripped,
        all other cells are left untouched.
    """
    dataframe = pd.read_excel(loc, sheetname)

    def _strip_if_str(x):
        return x.strip() if type(x) is str else x

    #https://stackoverflow.com/questions/40950310/strip-trim-all-strings-of-a-dataframe
    # DataFrame.applymap is deprecated in favour of DataFrame.map (pandas >= 2.1).
    # The check is made on the class so a column that happens to be called "map"
    # cannot be mistaken for the method on older pandas versions.
    if hasattr(pd.DataFrame, 'map'):
        return dataframe.map(_strip_if_str)
    return dataframe.applymap(_strip_if_str)

def printNulls(df):
    """Return the null count per column, limited to columns that contain a null.

    Despite the name nothing is printed: the result is a Series indexed by
    column name, and it is empty when `df` has no null cells.
    """
    null_mask = df.isnull()
    columns_with_nulls = df.columns[null_mask.any()]
    return null_mask[columns_with_nulls].sum()


def writeDFToFile(dfs, path_): #dfs is an array of dataframes and their sheet names , path needs to have
    """Write several DataFrames to one timestamped ``.xlsx`` workbook.

    Parameters
    ----------
    dfs : iterable of sequences whose element 0 is a DataFrame and element 1 is
        the sheet name it should be written to.
    path_ : str
        Output path prefix; ``"YYYY-MM-DD HH-MM-SS.xlsx"`` is appended to it.

    The workbook is written through a context manager, so it is saved and
    closed even if one sheet fails (``ExcelWriter.save()`` is not available in
    current pandas releases).
    """
    # strftime avoids slicing str(now) at ".", which raises ValueError whenever
    # the microsecond field happens to be 0 and is therefore not printed.
    current_date_time = datetime.datetime.now().strftime("%Y-%m-%d %H-%M-%S")
    task4_fileoutput = path_+current_date_time+".xlsx"

    with pd.ExcelWriter(task4_fileoutput) as writer:
        for df_tuple in dfs:
            df = df_tuple[0]
            sheetName = df_tuple[1]
            df.to_excel(writer, sheet_name=sheetName)
    print("file written to :       " + task4_fileoutput)



In [3]:
# sf.shapes()[0].__geo_interface__
# feature = sf.shapeRecords()[0]
# feature.record
# feature.shape.shapeTypeName

##check if all the shapes are valid polygons in a shapefile 
def checkIfShapeFileFilledWithPolygons(sfile):
    """Assert that every shape returned by ``sfile.shapes()`` has shapeType 5.

    Per the original note, shapeType 5 corresponds to shapeTypeName 'POLYGON'.
    Raises AssertionError on the first shape of a different type; returns None.
    """
    polygon_shape_type = 5
    assert all(shape_.shapeType == polygon_shape_type for shape_ in sfile.shapes())


#https://gis.stackexchange.com/questions/250172/finding-out-if-coordinate-is-within-shapefile-shp-using-pyshp
def return_Category_A_Point_Belongs_In(colIndex, sfile, point):  # (longitude,latitude) wise 
    """Find the first shapefile feature whose polygon contains `point`.

    Parameters
    ----------
    colIndex : index of the record field holding the category value.
    sfile : shapefile reader exposing ``shapeRecords()``.
    point : ``(longitude, latitude)`` pair.

    Returns
    -------
    (category, shape) for the first containing feature, or ``(None, None)``
    when no feature contains the point.
    """
    # The original body read the notebook-global `sf` here and ignored the
    # `sfile` argument, so it failed (or used the wrong file) unless a global
    # named `sf` happened to exist.
    features = sfile.shapeRecords()
    if not features:
        return None, None

    candidate = Point(point)  # built once instead of once per feature
    for feature in features:
        shape_boundary = feature.shape
        if candidate.within(shape(shape_boundary)):
            return feature.record[colIndex], shape_boundary
    #if none found, it equals none 
    return None, None

def getShapeThatCorrespondsToCategory(colIndex, category, sfile):
    """Return the shape of the first record whose field `colIndex` equals `category`.

    Records and shapes are matched by position. Returns None when no record
    carries the requested category.
    """
    matchIndex = next(
        (idx for idx, rec in enumerate(sfile.records()) if rec[colIndex] == category),
        None,
    )
    if matchIndex is None:
        return None
    return sfile.shapes()[matchIndex]


#optimized for 30x speed up over other functions 
#list_of_categories must be list of ints
#colIndex must be an integer
def maskGISPolygonsData(colIndex, list_of_categories, iHeight, iWidth,sf ): # sf is shapes file 
    """Rasterise shapefile polygons into one binary mask layer per requested category.

    Parameters
    ----------
    colIndex : int
        Index of the record field that holds the category value (cast with int()).
    list_of_categories : list of int
        Categories to build masks for, in layer order.
    iHeight, iWidth : number
        Image size in pixels; both are truncated with int().
    sf : shapefile reader exposing ``records()`` and ``shapes()``.

    Returns
    -------
    lookup_ : dict
        Maps mask layer index (1-based) to the category stored in that layer.
    MASKS : numpy.ndarray, shape (len(list_of_categories) + 1, iHeight, iWidth)
        Layer 0 marks pixels that fell inside none of the polygons; layer i+1
        marks pixels inside the polygon of ``list_of_categories[i]``.

    NOTE(review): the pixel-to-coordinate conversion reads `longMultiplier`,
    `latMultiplier`, `longMin2` and `latMin2`, which are not parameters of this
    function; they must already exist as notebook globals when it is called.
    """
    iHeight = int(iHeight)
    iWidth = int(iWidth)

    # Category value of every record, in shapefile order (position == shape index).
    records = sf.records()
    actualCategoriesFoundInShapeFile = []
    for record in records:
        category = int(record[colIndex])
        actualCategoriesFoundInShapeFile.append(category)

    shapesOfShapeFile = sf.shapes()
    # now that we have the categories the data provides, we loop through the categories the data has
    # if there are categories that the data has that shape file does not => leave None in that list 
    shapes= []
    #shapes = sf.shapes()
    list_ = list_of_categories
    lookup_ = {}

    list_ = list_of_categories
    locationInShapeFile = None 
    #look up dictionary 
    for i, ele in enumerate(list_):
        if(ele in actualCategoriesFoundInShapeFile):
            # .index() returns the first record carrying this category.
            locationInShapeFile = actualCategoriesFoundInShapeFile.index(ele)
            shapes.append(shapesOfShapeFile[locationInShapeFile])
        else: 
            shapes.append(None) 
        lookup_[i+1] = ele #we are saying that this category resides in i+1 in the Masks table  

    # NOTE(review): this only holds when list_of_categories has no positions
    # dropped from lookup_; both are filled once per element, so it always passes.
    assert(len(shapes) == len(lookup_))
    #set the masks
    MASKS= np.zeros((len(lookup_)+1, iHeight, iWidth)) # we add a category layer and hence we added the +1 -> this allows us to have a -1 category 

    ## look up dictionary for points 
    pointsLU = {} # points look up 

    # Pre-build one shapely Point per pixel centre so it is not rebuilt per shape.
    for row in range(iHeight):
        for col in range(iWidth):

            colMid = col+.5 # make the latitude and longitude be in the middle of the pixel
            # Row 0 is mapped to the largest latitude (vertical flip).
            rowMid = iHeight - row - 1 +.5 # double check for this to work 
            #rowMid = row +.5
            
            longVal = (colMid-0.)*(1./longMultiplier)+longMin2
            latVal = (rowMid -0.)*(1./latMultiplier)+latMin2
            #point_ = (longVal,latVal)

            #plug in latVal and longVal
            pointsLU[(row, col)] = Point((longVal, latVal))


    #now for each category, apply the row col to get the mask 

    for ishape, shape_ in enumerate(shapes):
        if(type(shape_) == type(None)): #we didn't find a shape for the element at this index
            print("found no shape")
            continue # no shape -> this category cannot compete in the code 
        boundary = shape(shape_)
        for row in range(iHeight):
            for col in range(iWidth):
                # A pixel already claimed by an earlier category is skipped, so
                # the first matching category in list order wins.
                if(len(np.where(MASKS[:,row,col] ==1)[0]) == 1):
                    continue
            
                #now check if point exists within boundary 
                if(pointsLU[(row, col)].within(boundary)):  
                    MASKS[ishape+1, row, col] = 1.   # shift by +1 because at 0 we will have -1 layer
    
    # Everything still unassigned goes to layer 0 (the "no category" layer).
    for row in range(iHeight):
        for col in range(iWidth):
            if(len(np.where(MASKS[:,row,col] ==1)[0]) == 0): # if nothing found for this row, col we put it in the null category 
                MASKS[0, row, col] = 1.
        
    
    return lookup_ , MASKS 

def convertImageToCategoryMask(colIndex, categoryVal, sfile ,iHeight , iWidth ,latMultiplier, longMultiplier, latMin2,longMin2 ):
    """Build a binary (iHeight, iWidth) mask of the pixels inside one category's polygon.

    Parameters
    ----------
    colIndex : index of the record field holding the category value.
    categoryVal : category whose polygon should be rasterised.
    sfile : shapefile reader passed to getShapeThatCorrespondsToCategory.
    iHeight, iWidth : image size in pixels; truncated with int() so the float
        sizes used elsewhere in this notebook (256.) are accepted, as
        convertImageToCategoryMask2 already does.
    latMultiplier, longMultiplier : pixels per degree along each axis.
    latMin2, longMin2 : latitude / longitude of the image origin.

    Returns
    -------
    numpy.ndarray of zeros and ones. All zeros when the shapefile has no
    record for `categoryVal`.

    NOTE(review): row 0 is mapped to the smallest latitude here, whereas
    maskGISPolygonsData flips the rows (row 0 = largest latitude). Confirm
    which orientation callers expect.
    """
    # np.zeros and range() both reject floats, so coerce before first use.
    iHeight = int(iHeight)
    iWidth = int(iWidth)
    mask = np.zeros(shape=(iHeight, iWidth))
    shape_that_points_should_be_in = getShapeThatCorrespondsToCategory(colIndex, categoryVal, sfile)

    #if None is returned, the shapefile didn't have that category and therefore this category has no mask, no point belongs
    if shape_that_points_should_be_in is None:
        return mask # KLUDGE: should we have a mask of -1's since this category is not available?  or -1 when point belongs to no category

    boundary = shape(shape_that_points_should_be_in)

    for row in range(iHeight):
        # the latitude of a pixel centre depends only on the row: compute it once per row
        latVal = (row + .5)*(1./latMultiplier)+latMin2
        for col in range(iWidth):
            longVal = (col + .5)*(1./longMultiplier)+longMin2  # middle of the pixel
            if Point((longVal, latVal)).within(boundary):
                mask[row][col] = 1

    return mask

def convertImageToCategoryMask2(colIndex,  sfile ,iHeight  , iWidth ,latMultiplier, longMultiplier, latMin2,longMin2 ):
    """Build an (iHeight, iWidth) image holding, per pixel, the category its centre falls in.

    Each pixel centre is converted to (longitude, latitude) and looked up with
    return_Category_A_Point_Belongs_In. Pixels outside every polygon get -1;
    all other pixels get the category cast to int.
    """
    nRows = int(iHeight)
    nCols = int(iWidth)
    mask = np.zeros(shape=(nRows, nCols))

    # np.ndindex walks the grid in the same row-major order as nested loops.
    for row, col in np.ndindex(nRows, nCols):
        # coordinates of the middle of the pixel (rows are not flipped here)
        pixelCentre = (
            (col + .5)*(1./longMultiplier)+longMin2,
            (row + .5)*(1./latMultiplier)+latMin2,
        )
        category, _boundary = return_Category_A_Point_Belongs_In(colIndex, sfile, pixelCentre)
        mask[row][col] = -1 if category is None else int(category)

    return mask

# # Test 1
# r,s = return_Category_A_Point_Belongs_In(2, sf, point_) # for community areas -> the 2nd column gives the comm area
# print(r)
# print(s)
# ############works!!!  returns category 25 which is the right value for community area for that point!!!!! 


# # Test 2
# sampleRecord = sf.records()[14]
# realShape = sf.shapes()[14]
# colWanted = sampleRecord[2] # for Community area -> look at column 2 
# shape_ = getShapeThatCorrespondsToCategory(2, colWanted, sf)

# # shape_.equals(realShape)
# # print(realShape.__geo_interface__['coordinates'])
# # print(shape_.__geo_interface__['coordinates'])
# a =shape_.__geo_interface__['coordinates']
# b =realShape.__geo_interface__['coordinates']
# print(a ==b ) #should be true



# Step 1:  Load in the final_dataframe dataset (the crimes that have been filtered/preprocessed)

In [4]:
# final_dataframe.to_csv(cleanURL(r'C:\Users\j70514\Documents\Data Science Stuff\DeepLearning_cs230\CNN_data_crunch\rev2_ 11 30 2018\data generated checkpoint/final_dataframe_after_removing_extreme_vals.csv') , sep = ',' )
# final_dataframe = pd.read_csv(cleanURL(r'C:\Users\j70514\Documents\Data Science Stuff\DeepLearning_cs230\CNN_data_crunch\rev2_ 11 30 2018\data generated checkpoint/final_dataframe_after_removing_extreme_vals.csv'))
# Load the pre-filtered crimes table and parse its timestamp column.
crimes_csv_path = cleanURL(r'C:\Users\User\Documents\CS230 Project\new_github\final_dataframe_after_removing_extreme_vals.csv')
final_dataframe = pd.read_csv(crimes_csv_path)
final_dataframe['Date'] = pd.to_datetime(final_dataframe['Date'], format ='%Y-%m-%d %H:%M:%S')  #python changes the format when it starts up


# Bounding box of the data; every later raster uses these as its extent.
latMin2, latMax2 = final_dataframe['Latitude'].min(), final_dataframe['Latitude'].max()
longMin2, longMax2 = final_dataframe['Longitude'].min(), final_dataframe['Longitude'].max()
print(latMax2 - latMin2)
print(longMax2 - longMin2)


0.3565254240000044
0.24557258000000104


# Step 2. Convert latitude and longitude to pixel values, perform checks

In [5]:
############
# Step 2. Generate Image Size and Scaling Multipliers to convert to image format. Have a 256 by 256 datasize. 
############

# Kept as floats because later cells divide by them and call int(...) on use.
iHeight = 256.
iWidth = 256.

# pixel/degree ratios
latMultiplier = (iHeight- 0.)/(latMax2 - latMin2)  
longMultiplier = (iWidth- 0.)/(longMax2 - longMin2)

# Convert latitude / longitude to integer bin numbers and clamp them to the
# valid index range BEFORE flipping the rows. A point at exactly latMax2 lands
# in bin 256; flipping that unclamped gave latPixel == -1, which numpy treats
# as "last row", i.e. the northernmost crimes were drawn on the bottom edge.
latBin = ((final_dataframe.Latitude - latMin2)*latMultiplier).astype(np.int64).clip(0, int(iHeight) - 1)
longBin = ((final_dataframe.Longitude - longMin2)*longMultiplier).astype(np.int64).clip(0, int(iWidth) - 1)

# matrix row 0 is the top of the image, so the largest latitude maps to row 0
final_dataframe['latPixel'] = int(iHeight) - 1 - latBin
final_dataframe['longPixel'] = longBin

# every pixel coordinate must now be a valid, non-negative array index
assert final_dataframe['latPixel'].between(0, int(iHeight) - 1).all()
assert final_dataframe['longPixel'].between(0, int(iWidth) - 1).all()

print('done')

done


# Step 3. Get ready for generating the different types of data

In [6]:
###########
# Step 3. Get ready for the Data Generation Loop 
###########

# First and last calendar day present in the crimes table.
mindate = final_dataframe.Date.min().date()
maxdate = final_dataframe.Date.max().date()
delta = maxdate - mindate

#https://stackoverflow.com/questions/7274267/print-all-day-dates-between-two-dates
# every day from mindate to maxdate inclusive, and every time-of-day bin index
datesInDays = [mindate + datetime.timedelta(days=offset) for offset in range(delta.days + 1)]
timeOfDays = list(range(NUM_TIME_BINS_PER_DAY))

# Cartesian product of days and time bins: one [day, timeBin] entry per image to generate.
dates_and_timeOfDays = [[day, timeBin] for day, timeBin in ite.product(datesInDays, timeOfDays)]

#check we got all possible combinations 
assert len(dates_and_timeOfDays) == NUM_TIME_BINS_PER_DAY * len(datesInDays)

print(len(dates_and_timeOfDays))
print('done')

155928
done


# Step 4. Perform Additional Checks, etc. before the different data are generated

In [7]:
# The time-slot filters further down compare these columns against plain ints,
# so make sure they were read as integers.
assert final_dataframe['TIME_OF_DAY'].dtype in (np.dtype('int32'), np.dtype('int64'))
assert all(
    final_dataframe[columnName].dtype == np.dtype('int64')
    for columnName in ('DAY', 'MONTH', 'YEAR')
)


# Independent copy of the full (day, time-of-day) list.
bk_day_timeOfDay = copy.deepcopy(dates_and_timeOfDays)   # [datetime.date(2001, 1, 1), 0] to [datetime.date(2018, 10, 15), 23]


######################################################################################################################################
#https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
def chunks(l, n):
    """Yield consecutive slices of `l`, each of length `n` (the last may be shorter)."""
    yield from (l[start:start + n] for start in range(0, len(l), n))

In [8]:
# Encode the crime type as an integer code.
# NOTE(review): categorical codes start at 0, and the output images below are
# initialised with zeros, so the first category looks identical to "no crime"
# in those images. Confirm whether the codes should be shifted by one.
final_dataframe['categoryType'] =  pd.Categorical(final_dataframe['Primary Type'])
final_dataframe['categoryCode'] = final_dataframe['categoryType'].cat.codes            # df.cc.astype('category').cat.codes  https://stackoverflow.com/questions/38088652/pandas-convert-categories-to-numbers

# For faster filtering, remove some columns.
# 'Unnamed: 0' / 'Unnamed: 0.1' are index columns left over from CSV round
# trips and are not always present; list.remove() raised ValueError when any
# of them was missing, so filter the names instead.
columnsNotNeeded = ('ID', 'Unnamed: 0', 'Unnamed: 0.1', 'Latitude', 'Longitude')  # lat/long: have the pixelated values already
all_cols = [columnName for columnName in final_dataframe.columns.tolist() if columnName not in columnsNotNeeded]
fDF = final_dataframe[all_cols].copy()

Load the masks that were generated in another Python notebook.

In [12]:
# Load all the pre-computed area masks and their lookup tables.
# Each *_MASK is a NumPy array read from a .npy file; each *_MASK_LOOKUP is a CSV.
# The community-area lookup is used below through its 'key' (mask layer index)
# and 'value' (real community area number) columns.
# NOTE(review): the other three lookups presumably share that layout — confirm.
# NOTE(review): these are absolute paths under a different user profile than the
# crimes CSV loaded earlier; they will not resolve on another machine.
COMM_AREA_MASK = np.load(cleanURL(r'C:\Users\j70514\Documents\Data Science Stuff\DeepLearning_cs230\CNN_data_crunch\data_masks\commArea.npy'))
COMM_AREA_MASK_LOOKUP = pd.read_csv(cleanURL(r'C:\Users\j70514\Documents\Data Science Stuff\DeepLearning_cs230\CNN_data_crunch\data_masks\commAreaL.csv'))

# police beats
BEATS_AREA_MASK = np.load(cleanURL(r'C:\Users\j70514\Documents\Data Science Stuff\DeepLearning_cs230\CNN_data_crunch\data_masks\beatArea.npy'))
BEATS_AREA_MASK_LOOKUP = pd.read_csv(cleanURL(r'C:\Users\j70514\Documents\Data Science Stuff\DeepLearning_cs230\CNN_data_crunch\data_masks\beatAreaL.csv'))

# police districts
DISTRICT_AREA_MASK = np.load(cleanURL(r'C:\Users\j70514\Documents\Data Science Stuff\DeepLearning_cs230\CNN_data_crunch\data_masks\districtArea.npy'))
DISTRICT_AREA_MASK_LOOKUP = pd.read_csv(cleanURL(r'C:\Users\j70514\Documents\Data Science Stuff\DeepLearning_cs230\CNN_data_crunch\data_masks\districtAreaL.csv'))

# wards
WARD_AREA_MASK = np.load(cleanURL(r'C:\Users\j70514\Documents\Data Science Stuff\DeepLearning_cs230\CNN_data_crunch\data_masks\wardArea.npy'))
WARD_AREA_MASK_LOOKUP = pd.read_csv(cleanURL(r'C:\Users\j70514\Documents\Data Science Stuff\DeepLearning_cs230\CNN_data_crunch\data_masks\wardAreaL.csv'))


# Generate data starting with Time Related Data

In [None]:
# The time index 
#### dates [month, day, year, time of day]
# One float row per (day, time-of-day) slot, in the same order as dates_and_timeOfDays.
dates_data = np.array(
    [[day.month, day.day, day.year, timeBin] for day, timeBin in dates_and_timeOfDays],
    dtype=np.float64,
).reshape(len(dates_and_timeOfDays), 4)
loc = r'C:\Users\j70514\Documents\Data Science Stuff\DeepLearning_cs230\CNN_data_crunch\rev2_ 11 30 2018\outputs_data 12 1 2018\dates_data.npy'
np.save(cleanURL(loc), dates_data)

# Generate Outputs (crime categories on 256 by 256 grid)

In [None]:
#Outputs
# One (iHeight, iWidth) image per (day, time-of-day) slot, written in batches of
# BATCHSIZE slots. A pixel holds the category code of a crime recorded there in
# that slot and 0 elsewhere.
# NOTE(review): category code 0 is also a real category, so it cannot be told
# apart from "no crime" in these images — confirm this is intended.
BATCHSIZE = 3000
listOfChunks = [x_ for x_ in chunks(bk_day_timeOfDay, BATCHSIZE)]

# Index the crimes once by time slot instead of re-filtering the whole frame for
# every slot. `.indices` maps (YEAR, MONTH, DAY, TIME_OF_DAY) to row positions.
slotRowPositions = fDF.groupby(['YEAR', 'MONTH', 'DAY', 'TIME_OF_DAY'], sort=False).indices
latPixelValues = fDF['latPixel'].values
longPixelValues = fDF['longPixel'].values
categoryCodeValues = fDF['categoryCode'].values

# The loop works on `chunk` directly. It used to assign each chunk to the global
# `dates_and_timeOfDays`, leaving that list truncated to the final batch for any
# cell run afterwards.
for chunkIndex, chunk in enumerate(listOfChunks):
    batchNumber = chunkIndex + 1
    print("Batch number : ", batchNumber, " Index in listOfchunks: ", chunkIndex)
    fileNameTosaveNPY = cleanURL(r'C:\Users\j70514\Documents\Data Science Stuff\DeepLearning_cs230\CNN_data_crunch\rev2_ 11 30 2018\outputs_data 12 1 2018\y__c' + str(batchNumber) + '_.npy')

    images_y = np.zeros(shape = (len(chunk), int(iHeight),int(iWidth)))

    print("started ", batchNumber,  datetime.datetime.now())
    for i, (dt, tofD) in enumerate(chunk):
        if(i%1000 == 0):
            print(i)
            print("In the middle", datetime.datetime.now())

        positions = slotRowPositions.get((dt.year, dt.month, dt.day, tofD))
        if positions is None:
            continue  # no crime in this slot: the image stays all zeros

        # Keep original row order so that, when several crimes share a pixel,
        # the same (last) one wins as with the previous row-filter version.
        positions = np.sort(positions)
        images_y[i, latPixelValues[positions], longPixelValues[positions]] = categoryCodeValues[positions]

    np.save(fileNameTosaveNPY, images_y)

    print("finished ", batchNumber,  datetime.datetime.now())



# Generate Socioeconomic Data

Get the data ready, make assertions, initialize the image

In [None]:
#Socio Economic data 
socio_csv_path = cleanURL(r'C:\Users\j70514\Documents\Data Science Stuff\DeepLearning_cs230\CNN_data_crunch\rev2_ 11 30 2018\data generated checkpoint/socio_economic.csv')
socioeconomic_dataframe = pd.read_csv(socio_csv_path)

# The community-area join below compares integers, so all three key columns must be int64.
int64_dtype = np.dtype(np.int64)
assert socioeconomic_dataframe['Community Area'].dtype == int64_dtype
assert COMM_AREA_MASK_LOOKUP['value'].dtype == int64_dtype
assert COMM_AREA_MASK_LOOKUP['key'].dtype == int64_dtype
assert len(socioeconomic_dataframe) == 77 #that is the number of categories we have 


#print(set(COMM_AREA_MASK_LOOKUP['value'].tolist())  ^ set(socioeconomic_dataframe['Community Area'].tolist()))
#assert( set(COMM_AREA_MASK_LOOKUP['value'].tolist()) > set(socioeconomic_dataframe['Community Area'].tolist()))

#7 images, each one will have len(COMM_AREA_MASK) channels/masks and each channel/mask is 256,256
# (areas missing from the socio-economic table simply keep their zeros)
socio_image_shape = (7, len(COMM_AREA_MASK), int(iHeight), int(iWidth))
socio_economic_image = np.zeros(shape=socio_image_shape)


In [None]:
# One output layer per socio-economic indicator, in this order (layer 0 .. 6).
# To normalize -> normalize on that layer.
SOCIO_LAYER_COLUMNS = [
    'PERCENT OF HOUSING CROWDED',
    'PERCENT HOUSEHOLDS BELOW POVERTY',
    'PERCENT AGED 16+ UNEMPLOYED',
    'PERCENT AGED 25+ WITHOUT HIGH SCHOOL DIPLOMA',
    'PERCENT AGED UNDER 18 OR OVER 64',
    'PER CAPITA INCOME ',
    'HARDSHIP INDEX',
]

for socioImgIndex in range(len(COMM_AREA_MASK)):
    if(socioImgIndex == 0):
        socio_economic_image[:, 0] = COMM_AREA_MASK[0]  # all 7 layers should have this 
        continue

    # translate the mask layer index into the real community area number
    realCommunityAreaNumber = COMM_AREA_MASK_LOOKUP.loc[(COMM_AREA_MASK_LOOKUP['key']  == socioImgIndex), 'value'].values[0]

    areaRows = socioeconomic_dataframe.loc[(socioeconomic_dataframe['Community Area']  == realCommunityAreaNumber)]
    if len(areaRows) == 0:
        # if we don't find the community area in the dataframe, we don't have the data, just keep zeros.
        # Only this case is skipped. The previous bare `except: continue` also
        # hid misspelled column names and non-numeric values, silently leaving
        # zero layers; those now raise.
        continue

    for layerIndex, columnName in enumerate(SOCIO_LAYER_COLUMNS):
        socio_economic_image[layerIndex, socioImgIndex] = COMM_AREA_MASK[socioImgIndex]*(areaRows[columnName].values[0])

# done for loop, sum up all seven images (collapse the per-area channels)
socio_economic_image_result = np.zeros(shape=(7, int(iHeight), int(iWidth)))
for ind in range(len(socio_economic_image_result)):
    socio_economic_image_result[ind] = np.sum(socio_economic_image[ind], axis = 0)


fileNameTosaveNPY_socio = cleanURL(r'C:\Users\j70514\Documents\Data Science Stuff\DeepLearning_cs230\CNN_data_crunch\rev2_ 11 30 2018\outputs_data 12 1 2018\x__socio_.npy')
np.save(fileNameTosaveNPY_socio, socio_economic_image_result)


# Generate Businesses Data

Get data. Initialize the dataset

In [None]:
businessesLoc = cleanURL(r'C:\Users\j70514\Documents\Data Science Stuff\DeepLearning_cs230\CNN_data_crunch\rev2_ 11 30 2018\Businesses.xlsx')
businesses = getDF(loc = businessesLoc, sheetname = 'Sheet1')
# make sure there are no na values. The report has to be printed: as a bare
# expression in the middle of a cell its result was silently discarded.
print(printNulls(businesses))

# one binary layer per business flag
businessesImg = np.zeros(shape=(5, int(iHeight), int(iWidth)))

Clean up the data

In [None]:
#below we are placing the columns in the correct order 
businesses = businesses[['Start Date', 'End Date', 'Location', 'Food Service', 'Tobacco Sale', 'Alcohol Consumption', 'Package Store', 'Gas Station']].copy()
# Drop the first and last character of Location, then split it into lat / long.
# NOTE(review): this cell is not idempotent — running it twice strips two more
# characters from Location. Re-run the load cell first.
businesses['Location']=  businesses['Location'].str[1:-1]
businesses[['lat', 'long']] = businesses['Location'].str.split(', ', expand = True)
businesses['long'] = businesses['long'].astype(np.float64)
businesses['lat'] = businesses['lat'].astype(np.float64)

## filter out anything that is outside latmin2, latmax2, longmin2, longmax2
businesses = businesses.loc[(businesses['lat'] < latMax2) & (businesses['lat'] > latMin2)].copy()
businesses = businesses.loc[(businesses['long'] < longMax2) & (businesses['long'] > longMin2)].copy()


# Convert latitude / longitude to integer pixel bins. Rows are flipped because
# matrix row 0 is the top of the image; results are clamped to valid indices.
# Integer arithmetic throughout avoids the previous int -> float -> int round trip.
latBin = ((businesses['lat'] - latMin2)*latMultiplier).astype(np.int64)
longBin = ((businesses['long'] - longMin2)*longMultiplier).astype(np.int64)
businesses['latPixel'] = (int(iHeight) - 1 - latBin).clip(0, int(iHeight) - 1)
businesses['longPixel'] = longBin.clip(0, int(iWidth) - 1)


# Check how many rows sit on the image border. These were bare expressions, so
# only the last one was ever displayed; print all four.
print("longPixel == 255 :", int((businesses['longPixel'] == 255).sum()))
print("latPixel  == 255 :", int((businesses['latPixel'] == 255).sum()))
print("longPixel == 0   :", int((businesses['longPixel'] == 0).sum()))
print("latPixel  == 0   :", int((businesses['latPixel'] == 0).sum()))


Execute loop

In [None]:
#layer 0  'Food Service'
#layer 1  'Tobacco Sale'
#layer 2   'Alcohol Consumption'
#layer 3   'Package Store'
#layer 4   'Gas Station'
businesses = businesses[['Start Date', 'End Date', 'Location', 'Food Service', 'Tobacco Sale', 'Alcohol Consumption', 'Package Store', 'Gas Station', 'lat', 'long', 'latPixel', 'longPixel']].copy()

# Positional column indexes for the .iat lookups in the loop.
cols = businesses.columns.tolist()
latPixelIndex = cols.index('latPixel')
longPixelIndex = cols.index('longPixel')

fsPixelIndex = cols.index('Food Service')
tsPixelIndex = cols.index('Tobacco Sale')
acPixelIndex = cols.index('Alcohol Consumption')
psPixelIndex = cols.index('Package Store')
gsPixelIndex = cols.index('Gas Station')

# The position in this list is the output layer number.
flagColumnIndexes = [fsPixelIndex, tsPixelIndex, acPixelIndex, psPixelIndex, gsPixelIndex]

#assert(businesses['Location'].dtype ==np.dtype('o'))
for rowIndex in range(len(businesses)):
    #convert the point
    pointLat = businesses.iat[rowIndex, latPixelIndex]
    pointLong =  businesses.iat[rowIndex, longPixelIndex]

    # Only the FIRST flag that is set is recorded for a business.
    # NOTE(review): a business with several flags set therefore appears in one
    # layer only — confirm this is intended rather than marking every set flag.
    for layerIndex, flagColumnIndex in enumerate(flagColumnIndexes):
        if( businesses.iat[rowIndex, flagColumnIndex] == True):
            businessesImg[layerIndex, pointLat , pointLong] = 1
            break

fileNameTosaveNPY_businesses = cleanURL(r'C:\Users\j70514\Documents\Data Science Stuff\DeepLearning_cs230\CNN_data_crunch\rev2_ 11 30 2018\outputs_data 12 1 2018\x__businesses_.npy')
np.save(fileNameTosaveNPY_businesses, businessesImg)


# Generate Buildings Data

Clean up dataframe, perform checks

In [None]:
#Buildings 
buildingsLoc = cleanURL(r'C:\Users\j70514\Documents\Data Science Stuff\DeepLearning_cs230\CNN_data_crunch\BuildingsParsed.csv')
buildings= pd.read_csv(buildingsLoc)
assert(buildings.lat.dtype == np.dtype('float64'))
assert(buildings.long.dtype == np.dtype('float64'))

# make sure there are no na values. The report has to be printed: as a bare
# expression in the middle of a cell its result was silently discarded.
print(printNulls(buildings))

# keep only buildings strictly inside the bounding box of the crime data
buildings = buildings.loc[(buildings['lat'] < latMax2) & (buildings['lat'] > latMin2)].copy()
buildings = buildings.loc[(buildings['long'] < longMax2) & (buildings['long'] > longMin2)].copy()


# Convert latitude / longitude to integer pixel bins. Rows are flipped because
# matrix row 0 is the top of the image; results are clamped to valid indices.
# Integer arithmetic throughout avoids the previous int -> float -> int round trip.
latBin = ((buildings['lat'] - latMin2)*latMultiplier).astype(np.int64)
longBin = ((buildings['long'] - longMin2)*longMultiplier).astype(np.int64)
buildings['latPixel'] = (int(iHeight) - 1 - latBin).clip(0, int(iHeight) - 1)
buildings['longPixel'] = longBin.clip(0, int(iWidth) - 1)


# Check how many rows sit on the image border. These were bare expressions, so
# only the last one was ever displayed; print all four.
# have to make sure if we want the rows that end up on the border
print("longPixel == 255 :", int((buildings['longPixel'] == 255).sum()))
print("latPixel  == 255 :", int((buildings['latPixel'] == 255).sum()))
print("longPixel == 0   :", int((buildings['longPixel'] == 0).sum()))
print("latPixel  == 0   :", int((buildings['latPixel'] == 0).sum()))



Perform additional checks, merge the misspelled "UNNHABITABLE" condition into "UNINHABITABLE", and initialize the data image.

In [None]:
#remove extra column
buildings.loc[(buildings['Condition'] == 'UNNHABITABLE'), 'Condition'] = 'UNINHABITABLE' 

assert(buildings['Stories'].dtype ==np.dtype('int64'))
assert(buildings['Units'].dtype ==np.dtype('int64'))
assert(buildings['Square Footage'].dtype ==np.dtype('int64'))

#intialize the data 
buildingsImg = np.zeros(shape=(10, int(iHeight), int(iWidth)))

Generate the data

In [None]:
#layer 0  'UNINHABITABLE' or 'UNNHABITABLE'
#layers 1-3  'SOUND'              : Stories, units, sq footage
#layers 4-6  'NEEDS MINOR REPAIR' : Stories, units, sq footage
#layers 7-9  'NEEDS MAJOR REPAIR' : Stories, units, sq footage
#
# NOTE(review): when several buildings share a pixel, the values are overwritten
# rather than accumulated; the last building in the frame wins.

#layer 0: binary marker for uninhabitable buildings
uninhabitable = buildings.loc[(buildings['Condition'] == 'UNINHABITABLE')].copy()
buildingsImg[0][uninhabitable.latPixel.values, uninhabitable.longPixel.values] = 1

#layers 1-9: three attribute layers for each remaining condition
for groupIndex, condition in enumerate(('SOUND', 'NEEDS MINOR REPAIR', 'NEEDS MAJOR REPAIR')):
    subset = buildings.loc[(buildings['Condition'] == condition)].copy()
    rowVals = subset.latPixel.values
    colVals = subset.longPixel.values
    firstLayer = 1 + 3*groupIndex
    for layerOffset, attribute in enumerate(('Stories', 'Units', 'Square Footage')):
        buildingsImg[firstLayer + layerOffset][rowVals, colVals] = subset[attribute].values


fileNameTosaveNPY_buildings= cleanURL(r'C:\Users\j70514\Documents\Data Science Stuff\DeepLearning_cs230\CNN_data_crunch\rev2_ 11 30 2018\outputs_data 12 1 2018\x__buildings_.npy')
np.save(fileNameTosaveNPY_buildings, buildingsImg)


# Generate L Entries Data 

Ingest data, clean and perform checks

In [None]:
# L- entries 
LentriesLoc = cleanURL(r'C:\Users\j70514\Documents\Data Science Stuff\DeepLearning_cs230\CNN_data_crunch\LEntriesParsed.csv')
Lentries= pd.read_csv(LentriesLoc)
assert(Lentries.lat.dtype == np.dtype('float64'))
assert(Lentries.long.dtype == np.dtype('float64'))

#the preprocessing required Kludge: probably make this a function 

# make sure there are no na values. The report has to be printed: as a bare
# expression in the middle of a cell its result was silently discarded.
print(printNulls(Lentries))

#filter out values that don't belong to our rectangles 
Lentries = Lentries.loc[(Lentries['lat'] < latMax2) & (Lentries['lat'] > latMin2)].copy()
Lentries = Lentries.loc[(Lentries['long'] < longMax2) & (Lentries['long'] > longMin2)].copy()


# Convert latitude / longitude to integer pixel bins. Rows are flipped because
# matrix row 0 is the top of the image; results are clamped to valid indices.
# Integer arithmetic throughout avoids the previous int -> float -> int round trip.
latBin = ((Lentries['lat'] - latMin2)*latMultiplier).astype(np.int64)
longBin = ((Lentries['long'] - longMin2)*longMultiplier).astype(np.int64)
Lentries['latPixel'] = (int(iHeight) - 1 - latBin).clip(0, int(iHeight) - 1)
Lentries['longPixel'] = longBin.clip(0, int(iWidth) - 1)


# Check how many rows sit on the image border. These were bare expressions, so
# only the last one was ever displayed; print all four.
print("longPixel == 255 :", int((Lentries['longPixel'] == 255).sum()))
print("latPixel  == 255 :", int((Lentries['latPixel'] == 255).sum()))
print("longPixel == 0   :", int((Lentries['longPixel'] == 0).sum()))
print("latPixel  == 0   :", int((Lentries['latPixel'] == 0).sum()))

# Convert the Date column to datetime and get the first and last days of the dataset

In [None]:
# change the Date strings (month/day/year) into datetimes
Lentries['Date'] = pd.to_datetime(Lentries['Date'], format ='%m/%d/%Y')


# Get the first and the last day from bk_day_timeOfDay.
# NOTE(review): bk_day_timeOfDay is defined in another cell ("below" per the
# original comment), so this cell does not run on a fresh kernel until that
# cell has been executed.
firstDay = bk_day_timeOfDay[0][0]
lastDay = bk_day_timeOfDay[-1][0]
delta = lastDay - firstDay
daysdelta = delta.days
#lastDay-Lentries.Date.max().date() --> difference of 107 days => we could shave that off if we'd like, I am going to keep empty images

# Initialize the data image: (day, line channel, row, col).
# The dtype is spelled np.int16 because the file imports numpy only as `np`;
# the previous `numpy.int16` raised NameError.
# NOTE(review): the first axis has `daysdelta` slots, so lastDay itself gets
# no image; int16 cannot hold values above 32767 -- confirm both are intended.
LEntriesImg = np.zeros(shape = (daysdelta, 8, int(iHeight), int(iWidth)) , dtype=np.int16)


Loop over every day and write that day's L entries into the image

In [None]:
# Channel layout of LEntriesImg (axis 1): the position in this tuple is the
# layer index, the value is the boolean flag column in Lentries.
#layer 0 - Green Line
#layer 1 - Red Line
#layer 2 - Brown
#layer 3 - Purple
#layer 4 - Yellow
#layer 5 - Blue
#layer 6 - Orange
#layer 7 - pink
lineColumns = ('Green Line', 'Red Line', 'Brown Line', 'Purple Line',
               'Yellow Line', 'Blue Line', 'Orange Line', 'Pink Line')
intermediateDf = None

# The calendar parts of the Date column do not change between iterations,
# so extract them once instead of once per day.
entryDay = Lentries['Date'].dt.day
entryMonth = Lentries['Date'].dt.month
entryYear = Lentries['Date'].dt.year

for day in range(daysdelta):
    if(day%1000==0):
        print(day)  # coarse progress marker
    date_ = firstDay+datetime.timedelta(day)

    # rows recorded on this calendar day
    intermediateDf = Lentries.loc[(entryDay == date_.day) & (entryMonth == date_.month) & (entryYear == date_.year)].copy()

    # filter on the line and write the entries at each station's pixel
    for layer, lineColumn in enumerate(lineColumns):
        onLine = intermediateDf.loc[intermediateDf[lineColumn]==True]
        LEntriesImg[day, layer, onLine.latPixel.values, onLine.longPixel.values] = onLine.Entries.values


# Persist the (day, line, row, col) image as a .npy file.
fileNameTosaveNPY_L_entries= cleanURL(r'C:\Users\j70514\Documents\Data Science Stuff\DeepLearning_cs230\CNN_data_crunch\rev2_ 11 30 2018\outputs_data 12 1 2018\x__Lentries_2.npy')
np.save(fileNameTosaveNPY_L_entries, LEntriesImg)

# Generate Waterways Data

In [9]:
import shapefile  #conda install -c conda-forge pyshp    # (version should be 2.0)
from shapely.geometry import Point   #conda install -c conda-forge shapely
from shapely.geometry import shape
import shapely.wkt

In [10]:
# Read the waterways CSV and find the position of the 'Outline' column.
waterwayCsvPath = cleanURL(r'C:\Users\User\Documents\CS230 Project\new_github\WaterwaysCSVlast.csv')
waterway = pd.read_csv(waterwayCsvPath)

waterwayColumns = waterway.columns.tolist()
polygonIndex = waterwayColumns.index('Outline')
print(waterwayColumns)
print(polygonIndex)

# show the first rows as the cell output
waterway.head()
# waterway.loc[waterway.index <43]

['Unnamed: 0', 'Outline']
1


Unnamed: 0.1,Unnamed: 0,Outline
0,0,MULTIPOLYGON (((-87.69979372946463 41.84283213...
1,1,MULTIPOLYGON (((-87.67565819832085 41.84189684...
2,2,MULTIPOLYGON (((-87.67531615296008 41.84125308...
3,3,MULTIPOLYGON (((-87.67216765378741 41.84140721...
4,4,MULTIPOLYGON (((-87.66433144398182 41.84009604...


In [11]:
# Show, per column of `waterway` that contains at least one null, how many
# nulls it has (printNulls returns that per-column count).
printNulls(waterway)
# waterway.iat[47,1]

Series([], dtype: float64)

In [12]:
# Parse the outline of the first row as a sample geometry.
# The column is looked up through polygonIndex (the position of 'Outline')
# instead of a hard-coded 1, to agree with the other cells.
# NOTE(review): despite the names `point_` and `P`, the displayed rows hold
# MULTIPOLYGON WKT, so P is presumably a multipolygon rather than a point.
point_ =  waterway.iat[0, polygonIndex]
# print(point_)
P = shapely.wkt.loads(point_)
# print(type(P))


In [34]:
# Collect the row positions whose outline cannot be used: either the WKT
# does not parse, or it does not contain exactly one polygon.
badIndices=[]
for rIndex in range(len(waterway)):
    stringMultipolygon = waterway.iat[rIndex,polygonIndex]
    multiPolygon = None
    try:
        multiPolygon = shapely.wkt.loads(stringMultipolygon)
    except Exception:
        # `except Exception` rather than a bare `except`, so that
        # KeyboardInterrupt / SystemExit still stop the loop.
        print('polygon not parsed correctly', rIndex)
        badIndices.append(rIndex)
        continue
    # Multi-part geometries are not iterable in Shapely 2.x; `.geoms` works on
    # both 1.x and 2.x. A single-part geometry counts as one polygon.
    polygons = list(getattr(multiPolygon, 'geoms', [multiPolygon]))
    # plain condition instead of assert/except used as control flow
    if len(polygons) != 1:
        badIndices.append(rIndex)
        print('more than one polygon found : ', rIndex)
print('done')

more than one polygon found :  290
done


In [35]:
# Display the row positions collected in badIndices.
badIndices

[290]

In [36]:
#remove any bad indices
# badIndices holds row *positions* (they come from range(len(waterway))), so
# build a positional mask rather than matching them against index labels;
# for the default RangeIndex both give the same rows.
keepRows = ~np.isin(np.arange(len(waterway)), badIndices)
waterwayCleaned = waterway.loc[keepRows].copy()
print(len(waterwayCleaned), 'of', len(waterway), 'rows kept')

# Parse each remaining outline and keep its first polygon.
waterPolygons = []
for rIndex in range(len(waterwayCleaned)):
    stringMultipolygon = waterwayCleaned.iat[rIndex,polygonIndex]
    multiPolygon = shapely.wkt.loads(stringMultipolygon)
    # Multi-part geometries are not iterable in Shapely 2.x; `.geoms` works on
    # both 1.x and 2.x. A single-part geometry is used as-is.
    polygons = list(getattr(multiPolygon, 'geoms', [multiPolygon]))
    waterPolygons.append(polygons[0])


In [37]:
# Check whether the sample geometry P lies within the first water polygon.
# The value is neither assigned nor the last expression of the cell, so it
# is evaluated and discarded.
P.within(shape(waterPolygons[0]))
# Height and width (in pixels) assigned for the cells that follow.
# NOTE(review): this rebinds iHeight / iWidth if they were set earlier -- confirm
# 256 matches the value used when the pixel multipliers were computed.
iHeight = 256
iWidth = 256

In [38]:
# Rasterise the water polygons: waterwayMask[row, col] is 1 when the centre
# of that pixel falls within any polygon in waterPolygons, otherwise 0.
waterwayMask = np.zeros(shape=(int(iHeight), int(iWidth)))

# shape(...) does not depend on the pixel, so convert every polygon once
# instead of once per (pixel, polygon) pair.
waterShapes = [shape(wPolygon) for wPolygon in waterPolygons]

for row in range(int(iHeight)):
    # Row 0 is the top of the image, i.e. the highest latitude, hence the flip.
    rowMid = iHeight - row - 1 +.5 # double check for this to work
    #rowMid = row +.5
    latVal = (rowMid -0.)*(1./latMultiplier)+latMin2

    for col in range (int(iWidth)):
        colMid = col+.5 # make the latitude and longitude be in the middle of the pixel
        longVal = (colMid-0.)*(1./longMultiplier)+longMin2
        point_ = Point((longVal, latVal))

        # any() stops at the first polygon that contains the pixel centre
        if any(point_.within(waterShape) for waterShape in waterShapes):
            waterwayMask[row, col] = 1

In [None]:
# Load a previously saved waterway array from disk.
# NOTE(review): this rebinds `waterway`, which other cells use as the CSV
# DataFrame; re-running those cells after this one will fail. Consider a
# distinct name once no other cell depends on this binding.
waterway = np.load(cleanURL(r'C:\Users\User\Documents\CS230 Project\new_github\waterway.npy'))



# Rebuild waterPolygons from the cleaned outlines (first polygon of each row).
waterPolygons = []
for rIndex in range(len(waterwayCleaned)):
    stringMultipolygon = waterwayCleaned.iat[rIndex,polygonIndex]
    multiPolygon = shapely.wkt.loads(stringMultipolygon)
    # Multi-part geometries are not iterable in Shapely 2.x; `.geoms` works on
    # both 1.x and 2.x. A single-part geometry is used as-is.
    polygons = list(getattr(multiPolygon, 'geoms', [multiPolygon]))
    waterPolygon = polygons[0]
    waterPolygons.append(waterPolygon)



In [39]:
# Write waterwayMask to disk as a .npy file at the normalised path.
np.save(cleanURL(r'C:\Users\User\Documents\CS230 Project\new_github\waterway_12_6_2018.npy') , waterwayMask)

In [40]:
# Completion marker printed once the preceding cells have run.
print('done')

done
