In [2]:
import import_ipynb
from readdrivingdata import read_driving_data
from readdrivingdata import read_NC_driving_data

# Main driving dataset; later cells read its 'TELat', 'TELong', 'TEtime' and 'uid' columns.
# NOTE(review): presumably a pandas DataFrame -- confirm against the readdrivingdata notebook.
pdata = read_driving_data()
# Driving dataset for the NC participants; later cells read its 'tripEndLat', 'tripEndLon',
# 'date' and 'id' columns.
pdata_NC = read_NC_driving_data()

In [3]:
import os
import warnings

import numpy as np
import pandas as pd
import skmob
from geopy.geocoders import GoogleV3
from skmob.measures.individual import home_location

# The Google Maps API key is read from the GOOGLE_MAPS_API_KEY environment variable so the
# credential is never stored in (or shared with) the notebook. If the variable is not set,
# this line raises KeyError immediately instead of failing later on the first geocoding call.
geolocator = GoogleV3(api_key=os.environ["GOOGLE_MAPS_API_KEY"])

In [4]:
def homelocfinder(data, lat_col, long_col, date_col, id_col, rounding = None):
    '''
    This function takes the input of a driving dataset with latitude, longitude, datetime, and user_id columns, and uses the
    skmob home_location function to find each individual's home coordinates (latitude and longitude) as the most common location
    visited or returned to at night. It also gives the option of rounding latitude and longitude values before passing them
    to the internal skmob function, so that GPS points that differ only in the later decimal places are counted as the same
    location.

    The input dataframe is never modified: when rounding is requested, the rounded coordinates are written to a copy, so
    the caller keeps the full-precision values and can call this function again with a different "rounding".

    Arguments required are:

    data (dataframe) = driving dataset, with four columns containing latitude, longitude, datetime, and user id, respectively
    lat_col (str) =  name of the column in "data" that contains latitude values
    long_col (str) =  name of the column in "data" that contains longitude values
    date_col (str) =  name of the column in "data" that contains date-time values
    id_col (str) =  name of the column in "data" that contains participant identification codes
    rounding (int, optional) = number of decimal points to round latitude and longitude values in "data" to;
                               None (default) leaves the coordinates as they are

    Returns:
    the dataframe produced by skmob's home_location for the given trajectories (one row per user with the user id
    and the home latitude/longitude)

    Other requirements include the skmob library and environment.

    '''

    if rounding is not None:
        # Work on a copy: assigning the rounded columns straight into "data" would permanently
        # overwrite the caller's coordinates with the lower-precision values.
        data = data.copy()
        data[lat_col] = np.round(data[lat_col], decimals = rounding)
        data[long_col] = np.round(data[long_col], decimals = rounding)

    tdf = skmob.TrajDataFrame(data, latitude = lat_col, longitude = long_col, datetime = date_col, user_id = id_col)

    return home_location(tdf)

    
def homeaddressfinder(hloc_df):
    '''
    This function takes the input of a dataset with user_id, home latitude, and home longitude columns (in that order).
    It uses Geopy's GoogleV3 API (through the notebook-level "geolocator") to reverse-geocode each row's coordinates
    and stores the result in an 'address' column of the provided dataframe.

    The lookup is best-effort: if the geocoder raises for a row, or has no match for the coordinates, that row's
    address is left as an empty string (a warning is emitted for errors) and processing continues with the next row.

    Arguments required are:
    hloc_df (dataframe) = dataset whose first three columns contain user id, home latitude, and home longitude,
                            respectively... e.g. the dataframe returned from homelocfinder

    Returns:
    the same dataframe object (modified in place) with the 'address' column added/reset

    Other requirements include the GoogleV3 library from geopy.geocoders.

    '''

    hloc_df['address'] = ''
    # Look the column up by name instead of assuming it landed at position 3, so dataframes
    # that carry extra columns are still written to the right place.
    address_pos = hloc_df.columns.get_loc('address')

    for i in range(len(hloc_df)):
        lat = hloc_df.iloc[i, 1]
        long = hloc_df.iloc[i, 2]
        try:
            location = geolocator.reverse(f"{lat}, {long}")
        except Exception as err:
            # Deliberately broad: a per-row failure can be a ValueError for unparsable
            # coordinates as well as a geocoder service error, and one bad row should
            # not throw away the addresses already found.
            warnings.warn(f"Reverse geocoding failed for row {i} ({lat}, {long}): {err}", stacklevel = 2)
            continue

        # The geocoder returns None when it has no match; keep the blank address
        # rather than storing the literal text 'None'.
        if location is not None:
            hloc_df.iloc[i, address_pos] = str(location)

    return hloc_df


def homeaddressfiller(hloc_df, hloc_df2):
    '''
    This function works similarly to homeaddressfinder, however it is meant to FILL hloc_df if home coordinates/
    addresses are missing for some individuals. It takes the input of two dataframes with home locations,
    where the first one (hloc_df) is more desirable, i.e. more precise/accurate, but is missing some rows of location
    information, and the second one (hloc_df2) is less desirable but used to fill in the missing rows in hloc_df.

    A row of hloc_df counts as missing when its 'lat' value equals 0. Rows are matched between the two dataframes
    by their 'uid' value, so the dataframes do not need to share the same index or row order. A participant that
    cannot be found in hloc_df2 is skipped with a warning. If the geocoder raises or has no match for the new
    coordinates, the coordinates are still filled in but the existing address is left unchanged.

    Arguments required are:
    hloc_df (dataframe) = dataset with four columns containing user id ('uid'), home latitude ('lat'), home longitude,
    and home addresses, respectively... e.g. a dataframe returned from homeaddressfinder BUT missing some home locations
    hloc_df2 (dataframe) = dataset with three columns containing user id ('uid'), home latitude, and home longitude,
    respectively... e.g. a dataframe returned from homelocfinder BUT with no home locations missing

    Returns:
    the same hloc_df object (modified in place) with the missing rows filled

    Other requirements include the GoogleV3 library from geopy.geocoders.

    '''
    ## positions (not index labels) of the rows in hloc_df with missing home locations
    missing_rows = np.flatnonzero((hloc_df['lat'] == 0).to_numpy())

    for row in missing_rows:
        uid = hloc_df['uid'].iloc[row]

        ## position of this participant in hloc_df2; compared directly so non-numeric ids work too
        source_rows = np.flatnonzero((hloc_df2['uid'] == uid).to_numpy())
        if source_rows.size == 0:
            warnings.warn(f"No replacement home location found for participant {uid}", stacklevel = 2)
            continue
        source = source_rows[0]

        lat = hloc_df2.iloc[source, 1]
        long = hloc_df2.iloc[source, 2]

        hloc_df.iloc[row, 1] = lat    ## places new home lat/long
        hloc_df.iloc[row, 2] = long   ##  in original hloc_df dataframe

        try:
            location = geolocator.reverse(f"{lat}, {long}")
        except Exception as err:
            # Best-effort, matching homeaddressfinder: one failed lookup must not abort the fill.
            warnings.warn(f"Reverse geocoding failed for participant {uid} ({lat}, {long}): {err}", stacklevel = 2)
            continue

        # None means the geocoder had no match; do not store the literal text 'None'.
        if location is not None:
            hloc_df.iloc[row, 3] = str(location)

    return hloc_df

In [5]:
#### Home locations for the main dataset.
### Latitude/longitude values are rounded before the home search so that GPS readings of the same
## place that differ only in the later decimal places are counted together. The fifth decimal
# place corresponds to approximately 1.1 m, which should be more than enough to locate a residence.

### step 1 -- skmob: home coordinates (lat/long) per participant
hloc_df = homelocfinder(
    pdata,
    lat_col = 'TELat',
    long_col = 'TELong',
    date_col = 'TEtime',
    id_col = 'uid',
    rounding = 5,
)

### step 2 -- Google API: home addresses from those coordinates
hloc_df = homeaddressfinder(hloc_df)

100%|██████████| 246/246 [00:01<00:00, 150.07it/s]


In [6]:
### Fallback for the participants that no home location was found for: build a second version of
## the home-location table from less precise coordinates (four decimal places, corresponding to
# approximately 11 m) and use it to fill the gaps with homeaddressfiller.

### step 1 -- skmob: home coordinates (lat/long)... easiest to just repeat for all participants
hloc_df2 = homelocfinder(
    pdata,
    lat_col = 'TELat',
    long_col = 'TELong',
    date_col = 'TEtime',
    id_col = 'uid',
    rounding = 4,
)

### step 2 -- Google API: fill the missing rows of hloc_df from hloc_df2
hloc_df = homeaddressfiller(hloc_df, hloc_df2)


#### ultimately, hloc_df and hloc_df2 have essentially the same addresses; however, this method allows us to use
#### the more precise latitude/longitude values for most participants (since 11 m may lead to some inaccuracies
#### in determining the exact home address) while also locating approximate home addresses for any participants
#### that the function was not initially able to find home addresses for


100%|██████████| 246/246 [00:01<00:00, 169.84it/s]


In [7]:
### Home locations for the NC participants. This data is processed separately from the main dataset
## b/c some discrepancies occurred when both were combined first... perhaps because pdata has
# date+time and pdata_NC has only date; also, no rounding seems to be needed to find all addresses

### step 1 -- skmob: home coordinates (lat/long)
hloc_df_NC = homelocfinder(
    pdata_NC,
    lat_col = 'tripEndLat',
    long_col = 'tripEndLon',
    date_col = 'date',
    id_col = 'id',
)

### step 2 -- Google API: home addresses from those coordinates
hloc_df_NC = homeaddressfinder(hloc_df_NC)


100%|██████████| 39/39 [00:00<00:00, 449.66it/s]


In [8]:
### Stack the main and NC home-location tables into one frame with a fresh 0..n-1 index
hloc_full = pd.concat([hloc_df, hloc_df_NC], ignore_index = True)

### Save the combined table.
## NOTE(review): the destination is an absolute, machine-specific path
hloc_full.to_csv(r'C:\Users\maria\OneDrive\Documents\RESEARCH\Data files\participant home addresses.csv', index=False)