This file is used to reproduce Section 3.2.2 and 3.2.3 in the paper as well as Table 1, Table 8 and 9.

Acadia National Park is used as an example. For Yosemite National Park, replace the input and position url in Chunk [4] and run the chunks below it.

### Extracting trip sequences from geotagged photos

In [1]:
!pip install -r requirements.txt



In [2]:
import pandas as pd
import numpy as np
import string

In [3]:
# to reproduce Table 1
acadia_ttl = "https://raw.githubusercontent.com/meilinshi/Socially-aware-Huff-model/main/Data/acadia_NP.csv"
yosemite_ttl = "https://raw.githubusercontent.com/meilinshi/Socially-aware-Huff-model/main/Data/yosemite_NP.csv"

def table_stats(input):
    df = pd.read_csv(input)
    total_pts = df.shape[0] # number of photos
    total_user = df['owner'].nunique() #number of users
    print("Num_of_photos:",total_pts, ", Num_of_users:", total_user)

table_stats(acadia_ttl)
#table_stats(yosemite_ttl)

Num_of_photos: 34933 , Num_of_users: 1879


In [4]:
# INPUT
# data retrieved using Flickr API after clustering, each photo is assigned with a cluster_id
acadia_url = "https://raw.githubusercontent.com/meilinshi/Socially-aware-Huff-model/main/Data/acadia_NP_cluster.csv"
yosemite_url = "https://raw.githubusercontent.com/meilinshi/Socially-aware-Huff-model/main/Data/yosemite_NP_cluster.csv"

# position of each attraction in park
acadia_position = "https://raw.githubusercontent.com/meilinshi/Socially-aware-Huff-model/main/Data/acadia_NP_coords.csv"
yosemite_position = "https://raw.githubusercontent.com/meilinshi/Socially-aware-Huff-model/main/Data/yosemite_NP_coords.csv"


input_url = acadia_url 
#input_url = yosemite_url

position_url = acadia_position
#position_url = yosemite_position

In [5]:
# The input csv is the data retrieved using Flickr API after clustering
# each photo is assigned with a cluster_id

df = pd.read_csv(input_url)

# data pre-processing
df['datetaken'] = pd.to_datetime(df['datetaken'])
df['date'] = [d.date() for d in df['datetaken']]
df['year'] = pd.DatetimeIndex(df['date']).year
df['month'] = pd.DatetimeIndex(df['date']).month
df['time'] = [d.time() for d in df['datetaken']]
df.Cluster = [chr(ord('a') + x) for x in df.Cluster ]
df.head()

Unnamed: 0,index,id,owner,datetaken,latitude,longitude,title,accuracy,views,Cluster,date,year,month,time
0,0,8918787381,74212514@N04,2010-01-10 15:50:46,44.354492,-68.051204,Acadia National Park,12.0,793,a,2010-01-10,2010,1,15:50:46
1,1,29498596186,74212514@N04,2010-01-10 16:03:20,44.354492,-68.051204,Maine - Acadia National Park,12.0,5829,a,2010-01-10,2010,1,16:03:20
2,2,8919396564,74212514@N04,2010-01-10 16:15:59,44.354492,-68.051204,DSC03484,12.0,55,a,2010-01-10,2010,1,16:15:59
3,3,8918780331,74212514@N04,2010-01-10 16:31:06,44.354492,-68.051204,DSC03491,12.0,57,a,2010-01-10,2010,1,16:31:06
4,4,8918778905,74212514@N04,2010-01-10 16:42:40,44.354492,-68.051204,DSC03498,12.0,67,a,2010-01-10,2010,1,16:42:40


In [6]:
from datetime import timedelta
import collections

# preprocess: sort values and set owners as index
def clean_data(input):
    input = input[['owner','year','month','date','Cluster']]
    input.sort_values(['owner', 'year','month','date'], ascending=True, inplace=True)
    input.set_index(['owner'],inplace=True)
    return input


# construct trips by getting temporally-ordered sequence of photo
# a time threshold of 4 days is used to distinguish separate trips from the same user
def split_trip(input):
    delta, trip_id = [],[]
    counts = collections.Counter(input.index)
    unique_users_list = input.index.unique()
    
    for i in range(len(unique_users_list)):
        user = unique_users_list[i]
        subset = input.loc[user]

        if counts[user] == 1:
            delta.append(0)
            trip_id.append(1)
        else:
            delta.append(0)
            trip_id.append(1)
            for j in range(counts[user]-1):
                length = subset['date'].values[j+1] - subset['date'].values[j]
                delta.append(length.days)
                if length.days > 4: #time threshold: average length of stay in both NPs
                    trip_id.append(trip_id[-1] + 1)
                else:
                    trip_id.append(trip_id[-1])
    input.loc[:,'duration'] = delta
    input.loc[:,'trip_id'] = trip_id
    return input

# form dataframe for constructed trips, i.e., od: [a,a,a,b,c]
def get_OD_trips(input):
    trips = pd.DataFrame()
    user,year,month, trip_id, od = [],[],[],[],[]
    index = input.index.unique()   
    for i in range (len(index)):
        user.append(index[i][0])
        month.append(input.loc[index[i]]['month'].values[0])
        trip_id.append(index[i][1])
        od.append(input.loc[index[i]]['Cluster'].values)   
    trips['user'] = user
    trips['month'] = month
    trips['trip_id'] = trip_id
    trips['od'] = od   
    return trips

# removing consecutive duplicates from the od list, i.e., u_od: [a,b,c]
from itertools import groupby  
def get_unique_OD_trips(input):
    input['u_od'] = ""
    for i in range(len(input)): 
        res = [i[0] for i in groupby(input['od'][i])] 
        input['u_od'][i] = res
    return input

In [7]:
# remove duplicate within same owner, same date, same place id
OD = df.drop_duplicates(['owner','year','date','Cluster'],keep='first')

full = clean_data(OD)
full = split_trip(full)
full.set_index('trip_id',append=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [8]:
NP_trips = get_OD_trips(full)
NP_trips = get_unique_OD_trips(NP_trips)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


### Calculating visiting probabilities from trip sequences

In [9]:
# get trip segments from sequences i.e., [a,b] and [b,c] from [a,b,c]
from itertools import permutations

num_cluster = df.Cluster.nunique()
alphabet_string = string.ascii_lowercase
alphabet_list = list(alphabet_string)[:num_cluster]
key  = alphabet_list
output = sum([list(map(list, permutations(key, 2)))], [])
outstr = [', '.join(output[i]) for i in range(len(output))]
for i in range(len(key)):
    outstr.insert(i*(len(key)+1), key[i])

# number of total in each cluster
photos_num = [df[df.Cluster == x].shape[0] for x in alphabet_list]

# count number of trips from the segments
def get_flow(df, keys):
    dicts = dict.fromkeys(keys, 0) 
    for i in range(len(df['u_od'])):
        values = df['u_od'].iloc[i]
        for j in range(len(keys)):
            dicts[keys[j]] += values.count(keys[j])
    res = pd.DataFrame.from_dict(dicts, orient='index')
    return res

In [10]:
# construct flow matrix based on trip segments
# calculate number of incoming, outgoing and cross_boundary trips
def flow_matrix(df):
    dim = num_cluster
    flow_matrix = np.zeros((dim, dim), int)
    for i in range(dim):
        for j in range(dim):
            if i == j:
                flow_matrix[i][j] = 0
            else:
                flow_matrix[i][j] = df.values[j+dim*i]
    res = pd.DataFrame(data=flow_matrix, columns = alphabet_list)    
    res['total_out'] = res.sum(axis=1)
    res['total_in']= res.sum(axis=0)[:dim].values
    res['cross_boundary'] = res.loc[:,'total_out'].values+res.loc[:,'total_in'].values
    res['photos'] = photos_num
    res['Places'] = position['Clusters from Data'].values # get cluster names
    res = res.set_index('Places')
    return res


# calculate visiting probabilities from the flow matrix
def prob_matrix(df):
    pmatrix = df.iloc[:,:df.shape[0]].div(df.total_out, axis=0)
    pmatrix = pmatrix.fillna(0)
    pmatrix_df = pd.DataFrame(pmatrix, columns = alphabet_list)
    pmatrix_df['Places'] = position['Clusters from Data'].values
    pmatrix_df = pmatrix_df.set_index('Places')
    return pmatrix_df


# take subset of data by month
def subset_data(input,month):
    subset = input[input['month'] == month]
    return subset


# split the flow matrix into month
def split_fmatrix(trips, month):
    subset = subset_data(trips, month)
    flow = get_flow(subset,outstr) 
    return flow_matrix(flow)

In [11]:
NP_trips['u_od'] = [', '.join(x) for x in NP_trips['u_od']]
NP_trips

Unnamed: 0,user,month,trip_id,od,u_od
0,10016118@N04,10,1,"[j, k]","j, k"
1,100256002@N06,10,1,"[b, j]","b, j"
2,100327756@N02,5,1,"[e, e, e, e]",e
3,100508820@N04,10,1,"[e, j, m, b, g, l]","e, j, m, b, g, l"
4,100523630@N04,8,1,[f],f
...,...,...,...,...,...
1944,9965983@N05,6,1,"[g, j, m, l, k, e, b, c, c]","g, j, m, l, k, e, b, c"
1945,99693431@N07,7,1,"[c, c]",c
1946,99693431@N07,12,2,"[c, c]",c
1947,99718142@N07,10,1,[e],e


In [12]:
position = pd.read_csv(position_url)
position['coord'] = list(zip(position.Longitude, position.Latitude))

# flow matrix of all trips in the National Park
# column['total_out','total_in','photos'] to reproduce Table 8 and 9
flow = get_flow(NP_trips,outstr)
fmatrix = flow_matrix(flow)
fmatrix

Unnamed: 0_level_0,a,b,c,d,e,f,g,h,i,j,k,l,m,total_out,total_in,cross_boundary,photos
Places,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
Schoodic Institute,0,13,6,1,11,1,8,0,0,3,3,2,5,53,64,117,1119
Bass Harbor,11,0,32,9,62,12,52,4,6,24,12,15,21,260,288,548,2298
Southwest Harbor,2,42,0,6,27,3,15,4,1,4,1,2,2,109,111,220,723
Northeast Harbor,5,15,8,0,13,1,7,0,2,10,1,2,3,67,76,143,605
Bar Harbor,20,59,24,21,0,17,117,3,12,49,15,40,56,433,357,790,6259
Wild Gardens of Acadia,1,3,1,2,10,0,5,1,1,6,4,11,15,60,66,126,550
Cadillac Mountain,8,55,12,12,102,16,0,0,14,51,12,24,43,349,345,694,3285
Penobscot Peak,2,3,3,2,2,0,0,0,0,2,0,1,1,16,15,31,776
Bubble Rock,1,16,5,0,18,1,13,2,0,17,3,3,4,83,89,172,703
Jordan Pond,6,36,4,10,48,5,51,1,44,0,3,7,12,227,250,477,1250


In [13]:
# probability matrix of all trips in the National Park
pmatrix = prob_matrix(fmatrix)
pmatrix

Unnamed: 0_level_0,a,b,c,d,e,f,g,h,i,j,k,l,m
Places,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Schoodic Institute,0.0,0.245283,0.113208,0.018868,0.207547,0.018868,0.150943,0.0,0.0,0.056604,0.056604,0.037736,0.09434
Bass Harbor,0.042308,0.0,0.123077,0.034615,0.238462,0.046154,0.2,0.015385,0.023077,0.092308,0.046154,0.057692,0.080769
Southwest Harbor,0.018349,0.385321,0.0,0.055046,0.247706,0.027523,0.137615,0.036697,0.009174,0.036697,0.009174,0.018349,0.018349
Northeast Harbor,0.074627,0.223881,0.119403,0.0,0.19403,0.014925,0.104478,0.0,0.029851,0.149254,0.014925,0.029851,0.044776
Bar Harbor,0.046189,0.136259,0.055427,0.048499,0.0,0.039261,0.270208,0.006928,0.027714,0.113164,0.034642,0.092379,0.12933
Wild Gardens of Acadia,0.016667,0.05,0.016667,0.033333,0.166667,0.0,0.083333,0.016667,0.016667,0.1,0.066667,0.183333,0.25
Cadillac Mountain,0.022923,0.157593,0.034384,0.034384,0.292264,0.045845,0.0,0.0,0.040115,0.146132,0.034384,0.068768,0.123209
Penobscot Peak,0.125,0.1875,0.1875,0.125,0.125,0.0,0.0,0.0,0.0,0.125,0.0,0.0625,0.0625
Bubble Rock,0.012048,0.192771,0.060241,0.0,0.216867,0.012048,0.156627,0.024096,0.0,0.204819,0.036145,0.036145,0.048193
Jordan Pond,0.026432,0.15859,0.017621,0.044053,0.211454,0.022026,0.22467,0.004405,0.193833,0.0,0.013216,0.030837,0.052863


In [14]:
# split the trips and generate probability matrix for each month
# the output is provided in the data folder --> acadia_pmatrix_example

for i in range(1,13):
    df_sub = pd.DataFrame()
    df_sub = split_fmatrix(NP_trips,i)
    pmatrix_sub = prob_matrix(df_sub)
    #pmatrix_sub.to_csv('acadia_NP_cluster_prob_matrix_'+str(i)+'.csv')