This file is used to reproduce Section 5 in the paper, Table 2-6.

Acadia National Park is used here as an example.

In [2]:
!pip install -r requirements.txt

Collecting numpy==1.20.2
  Using cached numpy-1.20.2-cp37-cp37m-manylinux2010_x86_64.whl (15.3 MB)
Collecting pandas==1.2.4
  Downloading pandas-1.2.4-cp37-cp37m-manylinux1_x86_64.whl (9.9 MB)
[K     |████████████████████████████████| 9.9 MB 4.5 MB/s eta 0:00:01
[?25hCollecting patsy==0.5.1
  Downloading patsy-0.5.1-py2.py3-none-any.whl (231 kB)
[K     |████████████████████████████████| 231 kB 50.6 MB/s eta 0:00:01
[?25hCollecting scikit-learn==0.24.1
  Downloading scikit_learn-0.24.1-cp37-cp37m-manylinux2010_x86_64.whl (22.3 MB)
[K     |████████████████████████████████| 22.3 MB 63.5 MB/s eta 0:00:01
[?25hCollecting scipy==1.6.3
  Downloading scipy-1.6.3-cp37-cp37m-manylinux1_x86_64.whl (27.4 MB)
[K     |████████████████████████████████| 27.4 MB 36.7 MB/s eta 0:00:01     |███████████████▊                | 13.5 MB 36.7 MB/s eta 0:00:01
[?25hCollecting sklearn==0.0
  Downloading sklearn-0.0.tar.gz (1.1 kB)
Collecting statsmodels==0.12.2
  Downloading statsmodels-0.12.2-cp37-cp37m

In [3]:
import pandas as pd
import numpy as np
import re
import string
import scipy
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from scipy import stats
import statsmodels.api as sm

In [4]:
def split_date(df):
    df['datetaken'] = pd.to_datetime(df['datetaken'])
    df['date'] = [d.date() for d in df['datetaken']]
    df['year'] = pd.DatetimeIndex(df['date']).year
    df['month'] = pd.DatetimeIndex(df['date']).month
    return df

def subset_data(input,month):
    subset = input[input['month'] == month]
    return subset

In [5]:
acadia_url = "https://raw.githubusercontent.com/meilinshi/Socially-aware-Huff-model/main/Data/acadia_NP_cluster.csv"
position_url = "https://raw.githubusercontent.com/meilinshi/Socially-aware-Huff-model/main/Data/acadia_NP_coords.csv"

position = pd.read_csv(position_url) 
position['coord'] = list(zip(position.Latitude, position.Longitude))

df = pd.read_csv(acadia_url)
df = split_date(df) 
df.head()

Unnamed: 0,index,id,owner,datetaken,latitude,longitude,title,accuracy,views,Cluster,date,year,month
0,0,8918787381,74212514@N04,2010-01-10 15:50:46,44.354492,-68.051204,Acadia National Park,12.0,793,0,2010-01-10,2010,1
1,1,29498596186,74212514@N04,2010-01-10 16:03:20,44.354492,-68.051204,Maine - Acadia National Park,12.0,5829,0,2010-01-10,2010,1
2,2,8919396564,74212514@N04,2010-01-10 16:15:59,44.354492,-68.051204,DSC03484,12.0,55,0,2010-01-10,2010,1
3,3,8918780331,74212514@N04,2010-01-10 16:31:06,44.354492,-68.051204,DSC03491,12.0,57,0,2010-01-10,2010,1
4,4,8918778905,74212514@N04,2010-01-10 16:42:40,44.354492,-68.051204,DSC03498,12.0,67,0,2010-01-10,2010,1


### Construct distance matrix and attractiveness matrix

In [4]:
# Calculate travel distance (in km) using google map distance matrix api
# import googlemaps
# API_key = 'xxxxx'
# gmaps = googlemaps.Client(key=API_key)

def get_dist_matrix(df):

    destinations = df.coord
    names = df['Clusters from Data'].values
    
    dim = len(destinations)
    dist_matrix = np.zeros((dim, dim), float)
    
    
    for i in range(dim):
        actual_distance = []
        origin = destinations[i]        
        for destination in destinations:
            result = gmaps.distance_matrix(origin, destination, mode='driving')['rows'][0]['elements'][0]['distance']['value']
            result = result/1000
            actual_distance.append(result)
        dist_matrix[i] = actual_distance
        
    res = pd.DataFrame(data=dist_matrix, index = names, columns=names)
    return res


# generate attractiveness matrix
def attr_matrix(df, month):
    attr_matrix = pd.DataFrame()
    df = subset_data(df, month)
    
    attr_matrix['Places'] = position['Clusters from Data'].values
    attr_matrix['photo_views'] = df.groupby(['Cluster'])['views'].agg('sum')
    attr_matrix['num_uploaders'] = df.groupby(['Cluster'])['owner'].nunique()
    attr_matrix['num_of_photos'] = df.groupby(['Cluster']).size()
    attr_matrix['avg_view_per_user'] = attr_matrix['photo_views']/attr_matrix['num_uploaders']
    attr_matrix['avg_view_per_photo'] = attr_matrix['photo_views']/attr_matrix['num_of_photos']
    
    # Different measurements of attractiveness in Section 5.1.2 and Table 2
    #attr_matrix['total_attr'] = attr_matrix['num_of_photos'] #Aj1
    #attr_matrix['total_attr'] = attr_matrix['num_uploaders'] #Aj2
    attr_matrix['total_attr'] = attr_matrix['num_of_photos'] * attr_matrix['avg_view_per_user'] #Aj3
    
    attr_matrix = attr_matrix.fillna(0)
    attr_matrix['total_attr_log'] = np.log(attr_matrix['total_attr']+1)
    attr_matrix = attr_matrix.set_index('Places')
    return attr_matrix


# attractiveness without temporal component, used for model comparison in Section 5.1.3 and Table 3
def attr_matrix_all(df):
    attr_matrix = pd.DataFrame() 
    attr_matrix['Places'] = position['Clusters from Data'].values
    attr_matrix['photo_views'] = df.groupby(['Cluster'])['views'].agg('sum')
    attr_matrix['num_uploaders'] = df.groupby(['Cluster'])['owner'].nunique()
    attr_matrix['num_of_photos'] = df.groupby(['Cluster']).size()
    attr_matrix['avg_view_per_user'] = attr_matrix['photo_views']/attr_matrix['num_uploaders']
    attr_matrix['avg_view_per_photo'] = attr_matrix['photo_views']/attr_matrix['num_of_photos']
    attr_matrix['total_attr'] = attr_matrix['num_of_photos'] * attr_matrix['avg_view_per_user']
    attr_matrix['total_attr_log'] = np.log(attr_matrix['total_attr'])
    attr_matrix = attr_matrix.set_index('Places')
    return attr_matrix

# to include the neighboring effect
# select K neighbors
def neighbors(dest, dist_matrix, K):
    destinations = dist_matrix.index.values
    dist_tp = np.transpose(dist_matrix)
    neighbors = dist_tp.nsmallest(10, [dest])[1:K+1].index.values   
    return neighbors

# calculate centrality score based on K neighbors, attraction matrix and distance matrix
def centrality(dest, attr_matrix, K):
    neighbor_lst = neighbors(dest, dist_matrix, K)
    c = 0
    dist = 0
    for p in neighbor_lst:
        c += attr_matrix.loc[p]['total_attr_log']/dist_matrix.loc[dest][p]
        dist += dist_matrix.loc[dest][p]
        c = c/dist
    return c

In [5]:
# generate a distance matrix by distance matrix API
# dist_matrix = get_dist_matrix(position)

# Here a download version of distance matrix is used.
dist_matrix_url = "https://raw.githubusercontent.com/meilinshi/Socially-aware-Huff-model/main/Data/acadia_NP_dist_matrix.csv"
dist_matrix = pd.read_csv(dist_matrix_url,index_col=0)
dist_matrix.head()

Unnamed: 0,Schoodic Institute,Bass Harbor,Southwest Harbor,Northeast Harbor,Bar Harbor,Wild Gardens of Acadia,Cadillac Mountain,Penobscot Peak,Bubble Rock,Jordan Pond,Boulder Beach,Thunder Hole,Sand Beach
Schoodic Institute,0.0,81.026,74.132,73.382,73.512,77.424,84.215,69.457,82.548,85.141,82.609,83.429,84.342
Bass Harbor,84.206,0.0,8.08,26.813,29.721,32.39,34.466,22.888,32.798,35.392,37.576,38.395,39.309
Southwest Harbor,77.312,8.08,0.0,19.919,22.828,25.497,27.572,15.994,25.905,28.498,30.682,31.502,32.415
Northeast Harbor,76.562,26.813,19.919,0.0,17.66,14.353,22.404,3.925,20.737,23.33,14.331,15.151,16.064
Bar Harbor,76.747,29.791,22.898,17.73,0.0,4.416,10.117,13.805,8.45,11.043,9.602,10.422,11.335


### Ordinary Least Squares (OLS) Calibration

In [6]:
def getComplement(item, lst):
    results = []
    for num in lst:
        if num != item: 
            results.append(num)
    return results

# OLS dependent variable
def read_actual(pmatrix, origin):
    num = 0
    denom = 0
    result = []
    places = position['Clusters from Data'].values
    dests = getComplement(origin, places)   
    actual_pmatrix = pd.read_csv(pmatrix, index_col=0)
    for i in range(len(dests)):
        num = actual_pmatrix.loc[origin].values[i]
        denom = np.mean(actual_pmatrix.loc[origin])
        result.append(num/denom)
    return result

# OLS independent variables
# attractiveness (including Social Influence), distance, centrality
def log_transform_x(origin,K,month):
    X1, X2, X3 = [],[],[]
    total_centrality = 0
    places = position['Clusters from Data'].values
    dests = getComplement(origin, places)
    attr_mat = attr_matrix(df, month)
    #attr_mat = attr_matrix_all(df) # attr_matrix without temporal component, used for model comparison in Section 5.1.3 and Table 3
    for dest in dests:
        total_centrality += centrality(dest, attr_mat, K)
        X1.append(attr_mat.loc[dest]['total_attr_log']/np.mean(attr_mat['total_attr_log']))
        X2.append(dist_matrix.loc[origin][dest]/ np.mean(dist_matrix.loc[origin]))
        X3.append(centrality(dest, attr_mat, K)/(total_centrality/len(dests)))
    var_table = pd.DataFrame()
    X1 = [x + 1 for x in X1]
    X3 = [x + 1 for x in X3]
    var_table['x1'] = np.nan_to_num(np.log(X1))
    var_table['x2'] = np.nan_to_num(np.log(X2))
    var_table['x3'] = np.nan_to_num(np.log(X3))
    return var_table

In [21]:
# acadia probability matrix example for Y values
places = position['Clusters from Data'].values

acadia_pmatrix_url = "https://raw.githubusercontent.com/meilinshi/Socially-aware-Huff-model/main/Data/acadia_pmatrix_example/acadia_NP_cluster_prob_matrix_"
pmatrix_lst = [acadia_pmatrix_url+str(i)+".csv" for i in range(1,13)]

In [46]:
# for all trips in park (Section 5.1.2 and 5.1.3, Table 2 and 3)
Y_res = []
for place in places:
    for file in pmatrix_lst:
        Y = read_actual(file, place)
        log_Y = np.nan_to_num(np.log(Y))
        Y_res = np.append(Y_res, np.round(log_Y,10)) #reading from url seems to have a rounding issue (just in case)
        
        
# for place related trips in park (Section 5.2 and 5.3, Table 4, 5, 6)
# Y_res = []
# place_name = places[1] # pick a specific place, i.e. 1 is Bass Harbor
# for file in pmatrix_lst:
    # Y = read_actual(file, place_name)
    # log_Y = np.nan_to_num(np.log(Y))
    # Y_res = np.append(Y_res, np.round(log_Y,10))

  
  return bound(*args, **kwds)


In [47]:
var_table = []
for place in places:
    for i in range(1,13): #The range here can be changed to summer/winter months (Section 5.3 and Table 6)
        tbl = log_transform_x(place,2,i)
        var_table.append(tbl)
df_allmonth = pd.concat(var_table)

#### An example of all trips in Acadia National Park,  $R^2$ = 0.753

In [48]:
## fit a OLS model on the three parameters
df_allmonth['Y'] = Y_res
df_allmonth = df_allmonth[df_allmonth.Y > 0]
df_allmonth = df_allmonth[df_allmonth.x1 != 0]

#X = df_allmonth[['x1', 'x2']] # for model comparison in Section 5.1.3 and Table 3
X = df_allmonth[['x1', 'x2','x3']]
Y = df_allmonth['Y']

results = sm.OLS(Y,X).fit()
print('Parameters: ', results.params)
print('R2: ', results.rsquared)
print('MSE ',results.mse_resid)
print('AIC: ', results.aic)

Parameters:  x1    1.019533
x2   -0.107049
x3    0.145088
dtype: float64
R2:  0.7528155298087753
AIC:  709.6799332827493
