This notebook reproduces Sections 3.2.2 and 3.2.3 of the paper, as well as Tables 1, 8, and 9.

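Acadia National Park is used as an example. To run the analysis for Yosemite National Park instead, switch the `input_url` and `position_url` assignments in cell [4] (and the `table_stats` call in cell [3]). Note that the outputs saved below were produced with the Yosemite inputs.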

### Extracting trip sequences from geotagged photos

In [1]:
!pip install -r requirements.txt



In [2]:
import pandas as pd
import numpy as np
import string

In [3]:
# to reproduce Table 1
# Per-park photo CSVs; either URL can be passed to table_stats() below,
# which counts the rows and the distinct values of the `owner` column.
acadia_ttl = "https://raw.githubusercontent.com/meilinshi/Socially-aware-Huff-model/main/Data/acadia_NP.csv"
yosemite_ttl = "https://raw.githubusercontent.com/meilinshi/Socially-aware-Huff-model/main/Data/yosemite_NP.csv"

def table_stats(input):
    """Print the number of photos and of distinct users in a photo CSV.

    Parameters
    ----------
    input : str or file-like
        Anything accepted by ``pd.read_csv``; the table must contain an
        ``owner`` column.

    The counts are printed to stdout; nothing is returned.
    """
    photos = pd.read_csv(input)
    n_photos = len(photos.index)          # one row per photo
    n_users = photos['owner'].nunique()   # distinct photo owners
    print("Num_of_photos:", n_photos, ", Num_of_users:", n_users)

# Summarise one park per run; swap which line is commented out to report the other park.
#table_stats(acadia_ttl)
table_stats(yosemite_ttl)

Num_of_photos: 50384 , Num_of_users: 3653


In [4]:
# INPUT
# Photo data retrieved with the Flickr API; after clustering, each photo is assigned a cluster_id.
acadia_url = "https://raw.githubusercontent.com/meilinshi/Socially-aware-Huff-model/main/Data/acadia_NP_cluster.csv"
yosemite_url = "https://raw.githubusercontent.com/meilinshi/Socially-aware-Huff-model/main/Data/yosemite_NP_cluster.csv"

# position of each attraction in the park
acadia_position = "https://raw.githubusercontent.com/meilinshi/Socially-aware-Huff-model/main/Data/acadia_NP_coords.csv"
yosemite_position = "https://raw.githubusercontent.com/meilinshi/Socially-aware-Huff-model/main/Data/yosemite_NP_coords.csv"


# Park selection: keep exactly one input_url and one position_url assignment active,
# and make sure both refer to the same park.
#input_url = acadia_url
input_url = yosemite_url

#position_url = acadia_position
position_url = yosemite_position

In [5]:
# Load the clustered photo table selected in the configuration cell.
# Each row is one photo that already carries a numeric cluster id.
df = pd.read_csv(input_url)

# data pre-processing: derive calendar fields from the capture timestamp
df['datetaken'] = pd.to_datetime(df['datetaken'])
df['date'] = df['datetaken'].dt.date
df['year'] = df['datetaken'].dt.year
df['month'] = df['datetaken'].dt.month
df['time'] = df['datetaken'].dt.time

# Relabel numeric cluster ids as letters (0 -> 'a', 1 -> 'b', ...).
# NOTE(review): only ids 0..25 map to lowercase letters; other ids yield other characters.
df['Cluster'] = df['Cluster'].map(lambda cluster_id: chr(ord('a') + cluster_id))
df.head()

Unnamed: 0,index,id,owner,datetaken,latitude,longitude,title,accuracy,views,Cluster,date,year,month,time
0,6,38784602561,64964567@N00,2010-01-29 11:58:50,37.744147,-119.589889,Winter: Yosemite National Park: Yosemite Fall...,16.0,457,u,2010-01-29,2010,1,11:58:50
1,7,38067175464,64964567@N00,2010-01-29 12:07:45,37.743444,-119.589681,Winter: Yosemite National Park,16.0,376,u,2010-01-29,2010,1,12:07:45
2,8,38067167694,64964567@N00,2010-01-29 13:30:22,37.744572,-119.584432,Winter: Yosemite National Park - Mule Deer Buck,16.0,507,s,2010-01-29,2010,1,13:30:22
3,9,23919352207,64964567@N00,2010-01-29 14:23:14,37.750008,-119.595631,Winter: Yosemite National Park,16.0,347,t,2010-01-29,2010,1,14:23:14
4,10,38783821091,64964567@N00,2010-01-29 14:25:10,37.749791,-119.595867,Winter: Yosemite National Park - Yosemite Fal...,16.0,489,t,2010-01-29,2010,1,14:25:10


In [6]:
from datetime import timedelta
import collections

# preprocess: sort values and set owners as index
def clean_data(input):
    """Select the trip-relevant columns, sort them, and index the rows by owner.

    Parameters
    ----------
    input : pandas.DataFrame
        Must contain the columns ``owner``, ``year``, ``month``, ``date`` and
        ``Cluster``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with columns ``year``, ``month``, ``date``, ``Cluster``,
        sorted ascending by owner, year, month and date, and indexed by
        ``owner``.
    """
    columns = ['owner', 'year', 'month', 'date', 'Cluster']
    # Chaining (instead of inplace=True on a column slice) yields a fresh frame
    # and avoids pandas' SettingWithCopyWarning.
    return (
        input[columns]
        .sort_values(['owner', 'year', 'month', 'date'], ascending=True)
        .set_index(['owner'])
    )


# construct trips by getting temporally-ordered sequence of photo
# a time threshold of 4 days is used to distinguish separate trips from the same user
def split_trip(input, max_gap_days=4):
    """Assign a per-user trip id to every row based on gaps between photo dates.

    Rows are processed per user (the frame's index) in their existing row
    order. A new trip starts whenever the gap to the user's previous row
    exceeds ``max_gap_days``.

    Parameters
    ----------
    input : pandas.DataFrame
        Indexed by user, with a ``date`` column (``datetime.date`` objects or
        anything ``pd.to_datetime`` accepts). Rows are expected to be in
        chronological order within each user. The frame is not modified.
    max_gap_days : int, default 4
        Largest gap, in days, that still belongs to the same trip. The default
        follows the original choice (average length of stay in both parks).

    Returns
    -------
    pandas.DataFrame
        A copy of ``input`` with two extra columns: ``duration`` (days since
        the same user's previous row, 0 for a user's first row) and
        ``trip_id`` (1-based trip counter per user).
    """
    # Work on a copy: avoids mutating the caller's frame and the
    # SettingWithCopyWarning raised when that frame is itself a slice.
    result = input.copy()
    owners = result.index.to_numpy()

    # Normalise to datetime64 so that date objects and timestamps both work,
    # and use a positional index so grouping by the owners array lines up.
    dates = pd.Series(pd.to_datetime(result['date'].to_numpy()))

    # Gap to the same user's previous row; NaN for each user's first row.
    # Grouping (rather than list position) keeps values aligned with their rows
    # even when a user's rows are not contiguous.
    gap_days = dates.groupby(owners, sort=False).diff().dt.days

    starts_new_trip = (gap_days > max_gap_days).astype('int64')
    trip_id = starts_new_trip.groupby(owners, sort=False).cumsum() + 1

    result['duration'] = gap_days.fillna(0).astype('int64').to_numpy()
    result['trip_id'] = trip_id.to_numpy()
    return result

# form dataframe for constructed trips, i.e., od: [a,a,a,b,c]
def get_OD_trips(input):
    """Collapse photo rows into one row per (user, trip).

    Parameters
    ----------
    input : pandas.DataFrame
        Indexed by a two-level index (user, trip id) and containing the
        columns ``month`` and ``Cluster``.

    Returns
    -------
    pandas.DataFrame
        Columns ``user``, ``month`` (month of the trip's first row),
        ``trip_id`` and ``od`` (array of the trip's cluster labels in row
        order, e.g. [a, a, a, b, c]). Trips appear in order of first
        appearance in ``input``.
    """
    users, months, trip_ids, sequences = [], [], [], []
    # Grouping on the index levels always yields a DataFrame per trip, so trips
    # with a single photo (or a fully unique index) are handled like any other.
    for (owner, trip), rows in input.groupby(level=[0, 1], sort=False):
        users.append(owner)
        months.append(rows['month'].iloc[0])
        trip_ids.append(trip)
        sequences.append(rows['Cluster'].to_numpy())

    trips = pd.DataFrame()
    trips['user'] = users
    trips['month'] = months
    trips['trip_id'] = trip_ids
    # explicit object dtype keeps one array per cell even when all trips have equal length
    trips['od'] = pd.Series(sequences, dtype=object)
    return trips

# removing consecutive duplicates from the od list, i.e., u_od: [a,b,c]
from itertools import groupby  
def get_unique_OD_trips(input):
    """Add a ``u_od`` column holding each trip's sequence without consecutive repeats.

    For example an ``od`` of [a, a, a, b, c] becomes a ``u_od`` of [a, b, c].
    Only adjacent duplicates are collapsed; a place revisited later is kept.

    Parameters
    ----------
    input : pandas.DataFrame
        Must contain an ``od`` column of iterables. The ``u_od`` column is
        added to this frame in place.

    Returns
    -------
    pandas.DataFrame
        The same ``input`` object, now with the ``u_od`` column of lists.
    """
    collapsed = [[place for place, _ in groupby(sequence)] for sequence in input['od']]
    # Assign the whole column at once: the former per-row chained assignment
    # (input['u_od'][i] = ...) triggers SettingWithCopyWarning, does not write
    # through under pandas Copy-on-Write, and assumed a 0..n-1 index.
    input['u_od'] = pd.Series(collapsed, index=input.index, dtype=object)
    return input

In [7]:
# Keep one record per (owner, year, date, cluster): repeated photos of the same
# place by the same owner on the same day are dropped, keeping the first.
OD = df.drop_duplicates(subset=['owner', 'year', 'date', 'Cluster'], keep='first')

# Sort per owner, assign trip ids, then index the rows by (owner, trip_id).
full = split_trip(clean_data(OD)).set_index('trip_id', append=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [8]:
# One row per (user, trip): `od` is the visit sequence as recorded, `u_od` the
# same sequence with consecutive repeats collapsed.
NP_trips = get_unique_OD_trips(get_OD_trips(full))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


### Calculating visiting probabilities from trip sequences

In [9]:
# get trip segments from sequences i.e., [a,b] and [b,c] from [a,b,c]
from itertools import permutations

num_cluster = df['Cluster'].nunique()
alphabet_string = string.ascii_lowercase
alphabet_list = list(alphabet_string[:num_cluster])
key = alphabet_list

# every ordered pair of distinct clusters, e.g. ['a', 'b'] and ['b', 'a']
output = [list(pair) for pair in permutations(key, 2)]

# Row-major flattening of a num_cluster x num_cluster grid of lookup strings:
# off-diagonal cells hold "origin, destination", diagonal cells hold the single label.
outstr = [
    origin if origin == destination else ', '.join((origin, destination))
    for origin in key
    for destination in key
]

# number of photos in each cluster
photos_num = [len(df[df['Cluster'] == label]) for label in alphabet_list]

# count number of trips from the segments
def get_flow(df, keys):
    """Count how often each key occurs across all ``u_od`` entries of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``u_od`` column whose entries support ``.count(key)``
        (the notebook passes ', '-joined strings such as "a, b, c").
    keys : list of str
        Patterns to count, e.g. "a, b" for a move from a to b.

    Returns
    -------
    pandas.DataFrame
        Indexed by key, with a single column ``0`` holding the total count.
    """
    tallies = {pattern: 0 for pattern in keys}
    for sequence in df['u_od']:
        for pattern in keys:
            tallies[pattern] += sequence.count(pattern)
    return pd.DataFrame.from_dict(tallies, orient='index')

In [10]:
# construct flow matrix based on trip segments
# calculate number of incoming, outgoing and cross_boundary trips
def flow_matrix(df):
    """Arrange flattened segment counts into an origin x destination flow table.

    Relies on the notebook-level names ``num_cluster``, ``alphabet_list``,
    ``photos_num`` and ``position`` being defined before the call.

    Parameters
    ----------
    df : pandas.DataFrame
        Single-column frame of counts in row-major order (entry
        ``j + num_cluster * i`` is the flow from cluster i to cluster j), as
        returned by ``get_flow(..., outstr)``.

    Returns
    -------
    pandas.DataFrame
        Indexed by place name, with one column per cluster letter plus
        ``total_out`` (row sums), ``total_in`` (column sums),
        ``cross_boundary`` (their sum) and ``photos``.
    """
    dim = num_cluster
    # Reshape in one step instead of assigning length-1 arrays element by
    # element (implicit array-to-scalar conversion is deprecated in NumPy).
    counts = np.asarray(df.values).reshape(-1)[:dim * dim].reshape(dim, dim).astype(int)
    # the diagonal holds single-cluster visit counts, which are not flows
    np.fill_diagonal(counts, 0)

    res = pd.DataFrame(data=counts, columns=alphabet_list)
    res['total_out'] = res.sum(axis=1)
    # column sums; [:dim] drops the sum of the total_out column added just above
    res['total_in'] = res.sum(axis=0)[:dim].values
    res['cross_boundary'] = res['total_out'].values + res['total_in'].values
    res['photos'] = photos_num
    res['Places'] = position['Clusters from Data'].values  # get cluster names
    return res.set_index('Places')


# calculate visiting probabilities from the flow matrix
def prob_matrix(df):
    """Turn a flow table into row-wise visiting probabilities.

    Each of the first ``df.shape[0]`` columns is divided by the row's
    ``total_out``; undefined ratios (NaN) are replaced by 0. Relies on the
    notebook-level names ``alphabet_list`` and ``position``.

    Parameters
    ----------
    df : pandas.DataFrame
        Flow table as returned by ``flow_matrix``.

    Returns
    -------
    pandas.DataFrame
        Probabilities with one column per cluster letter, indexed by place name.
    """
    n_places = df.shape[0]
    flows = df.iloc[:, :n_places]
    shares = flows.div(df.total_out, axis=0).fillna(0)
    table = pd.DataFrame(shares, columns=alphabet_list)
    table['Places'] = position['Clusters from Data'].values
    return table.set_index('Places')


# take subset of data by month
def subset_data(input, month):
    """Return the rows of ``input`` whose ``month`` column equals ``month``."""
    in_month = input['month'].eq(month)
    return input.loc[in_month]


# split the flow matrix into month
def split_fmatrix(trips, month):
    """Build the flow table for the trips of a single month.

    Filters ``trips`` with ``subset_data``, counts segments with ``get_flow``
    using the notebook-level ``outstr`` keys, and returns ``flow_matrix`` of
    those counts.
    """
    monthly_trips = subset_data(trips, month)
    return flow_matrix(get_flow(monthly_trips, outstr))

In [11]:
# Turn each collapsed visit sequence into a ', '-separated string (e.g. "a, b, c"),
# the form in which get_flow() counts segments.
# Entries that are already strings are left untouched, so re-running this cell
# does not re-join them character by character.
NP_trips['u_od'] = [x if isinstance(x, str) else ', '.join(x) for x in NP_trips['u_od']]
NP_trips

Unnamed: 0,user,month,trip_id,od,u_od
0,100000053@N05,3,1,"[m, q, u, s, t, l]","m, q, u, s, t, l"
1,100103417@N06,8,1,[p],p
2,10016118@N04,12,1,"[n, u]","n, u"
3,100173096@N06,5,1,[i],i
4,100184521@N08,12,1,[n],n
...,...,...,...,...,...
3421,99731606@N03,10,1,"[g, d]","g, d"
3422,9987846@N08,10,1,[l],l
3423,99949513@N00,10,1,[o],o
3424,99949513@N00,4,2,"[u, t, j, l]","u, t, j, l"


In [12]:
# Attraction table for the selected park; flow_matrix() and prob_matrix() read
# its 'Clusters from Data' column to label the rows.
position = pd.read_csv(position_url)
# (Longitude, Latitude) tuple for each attraction
position['coord'] = list(zip(position.Longitude, position.Latitude))

# flow matrix of all trips in the National Park
# columns ['total_out', 'total_in', 'photos'] reproduce Tables 8 and 9
flow = get_flow(NP_trips,outstr)
fmatrix = flow_matrix(flow)
fmatrix

Unnamed: 0_level_0,a,b,c,d,e,f,g,h,i,j,...,p,q,r,s,t,u,total_out,total_in,cross_boundary,photos
Places,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Mariposa Grove of Giant Sequoias,0,1,1,3,10,1,4,5,1,7,...,2,5,1,2,5,6,135,135,270,1787
Tioga Lake,2,0,0,45,1,18,12,8,0,1,...,0,1,0,0,0,4,111,111,222,1054
Tuolumne Grove,0,1,0,1,0,7,2,12,0,4,...,3,1,0,6,2,0,65,53,118,555
Tuolumne Meadows,5,37,7,0,0,33,31,3,1,7,...,2,3,1,1,0,1,151,165,316,1630
Yosemite West,5,2,1,0,0,2,1,1,1,1,...,0,2,2,2,0,1,35,31,66,674
Olmsted Point,2,16,10,27,0,0,45,6,4,6,...,3,3,1,1,0,5,168,165,333,890
Tenaya Lake,2,25,1,41,0,26,0,3,1,7,...,1,2,0,2,0,2,123,128,251,626
Wildcat Falls,3,1,2,2,1,2,2,0,1,5,...,4,6,0,3,4,4,147,110,257,724
Mirror Lake,7,1,0,2,0,4,1,1,0,14,...,9,5,9,14,6,12,134,150,284,875
Vernal Falls,10,3,2,8,0,7,3,1,15,0,...,15,5,10,14,5,20,205,229,434,2349


In [13]:
# probability matrix of all trips in the National Park:
# each row of the flow matrix divided by that row's total_out
pmatrix = prob_matrix(fmatrix)
pmatrix

Unnamed: 0_level_0,a,b,c,d,e,f,g,h,i,j,...,l,m,n,o,p,q,r,s,t,u
Places,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Mariposa Grove of Giant Sequoias,0.0,0.007407,0.007407,0.022222,0.074074,0.007407,0.02963,0.037037,0.007407,0.051852,...,0.296296,0.051852,0.022222,0.185185,0.014815,0.037037,0.007407,0.014815,0.037037,0.044444
Tioga Lake,0.018018,0.0,0.0,0.405405,0.009009,0.162162,0.108108,0.072072,0.0,0.009009,...,0.027027,0.045045,0.045045,0.045045,0.0,0.009009,0.0,0.0,0.0,0.036036
Tuolumne Grove,0.0,0.015385,0.0,0.015385,0.0,0.107692,0.030769,0.184615,0.0,0.061538,...,0.123077,0.123077,0.061538,0.076923,0.046154,0.015385,0.0,0.092308,0.030769,0.0
Tuolumne Meadows,0.033113,0.245033,0.046358,0.0,0.0,0.218543,0.205298,0.019868,0.006623,0.046358,...,0.039735,0.013245,0.013245,0.039735,0.013245,0.019868,0.006623,0.006623,0.0,0.006623
Yosemite West,0.142857,0.057143,0.028571,0.0,0.0,0.057143,0.028571,0.028571,0.028571,0.028571,...,0.171429,0.085714,0.0,0.142857,0.0,0.057143,0.057143,0.057143,0.0,0.028571
Olmsted Point,0.011905,0.095238,0.059524,0.160714,0.0,0.0,0.267857,0.035714,0.02381,0.035714,...,0.035714,0.083333,0.047619,0.053571,0.017857,0.017857,0.005952,0.005952,0.0,0.029762
Tenaya Lake,0.01626,0.203252,0.00813,0.333333,0.0,0.211382,0.0,0.02439,0.00813,0.056911,...,0.02439,0.00813,0.02439,0.00813,0.00813,0.01626,0.0,0.01626,0.0,0.01626
Wildcat Falls,0.020408,0.006803,0.013605,0.013605,0.006803,0.013605,0.013605,0.0,0.006803,0.034014,...,0.231293,0.217687,0.163265,0.07483,0.027211,0.040816,0.0,0.020408,0.027211,0.027211
Mirror Lake,0.052239,0.007463,0.0,0.014925,0.0,0.029851,0.007463,0.007463,0.0,0.104478,...,0.126866,0.067164,0.044776,0.08209,0.067164,0.037313,0.067164,0.104478,0.044776,0.089552
Vernal Falls,0.04878,0.014634,0.009756,0.039024,0.0,0.034146,0.014634,0.004878,0.073171,0.0,...,0.117073,0.063415,0.073171,0.131707,0.073171,0.02439,0.04878,0.068293,0.02439,0.097561


In [14]:
# Split the trips by calendar month and build one probability matrix per month.
# The output is provided in the data folder --> acadia_pmatrix_example
monthly_pmatrices = {}  # month number (1-12) -> probability matrix
for i in range(1, 13):
    df_sub = split_fmatrix(NP_trips, i)
    pmatrix_sub = prob_matrix(df_sub)
    # keep every month's result instead of overwriting it on the next iteration
    monthly_pmatrices[i] = pmatrix_sub
    # optional export, one CSV per month:
    #pmatrix_sub.to_csv('acadia_NP_cluster_prob_matrix_'+str(i)+'.csv')