In [None]:
import pandas as pd
import numpy as np
import re
import string
import scipy
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from scipy import stats
import statsmodels.api as sm
import glob

#### Construct distance matrix and attractiveness matrix

In [1]:
# Calculate travel distance (in km) using the Google Maps Distance Matrix API.
import os

import googlemaps

# The API key is a credential and must not live in the notebook source.
# Export GOOGLE_MAPS_API_KEY in the environment before starting the kernel.
# NOTE(review): the key that used to be hard-coded here has been exposed to
# anyone with access to this file and should be revoked/rotated.
API_key = os.environ.get('GOOGLE_MAPS_API_KEY')
if not API_key:
    raise RuntimeError(
        'Set the GOOGLE_MAPS_API_KEY environment variable to a valid '
        'Google Maps API key before running this cell.'
    )
gmaps = googlemaps.Client(key=API_key)

def get_dist_matrix(df):
    """Build the pairwise driving-distance matrix (in km) for the places in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``coord`` column (each value is passed unchanged to the
        Distance Matrix API as origin/destination) and a
        ``'Clusters from Data'`` column whose values label the rows and
        columns of the result.

    Returns
    -------
    pandas.DataFrame
        Square frame indexed and labelled by place name; entry ``[a, b]`` is
        the driving distance from ``a`` to ``b`` divided by 1000.

    Notes
    -----
    One API request is issued per ordered pair, i.e. ``len(df) ** 2`` calls.
    A response element without a ``'distance'`` entry raises ``KeyError``.
    """
    # Materialise the coordinates as a plain list so origins are picked by
    # position. Indexing the Series with ``df.coord[i]`` is label-based and
    # fails (or picks the wrong row) when ``df`` has a non-default index,
    # e.g. after filtering.
    destinations = list(df.coord)
    names = df['Clusters from Data'].values

    dim = len(destinations)
    dist_matrix = np.zeros((dim, dim), float)

    for i, origin in enumerate(destinations):
        row_km = []
        for destination in destinations:
            response = gmaps.distance_matrix(origin, destination, mode='driving')
            raw_value = response['rows'][0]['elements'][0]['distance']['value']
            # Convert the API value to km (see the comment on the client setup).
            row_km.append(raw_value / 1000)
        dist_matrix[i] = row_km

    return pd.DataFrame(data=dist_matrix, index=names, columns=names)

# Build the per-place attractiveness table for one month.
def attr_matrix(df, month):
    """Aggregate photo statistics per cluster into an attractiveness table.

    ``df`` is first reduced with ``subset_data(df, month)`` (defined outside
    this cell), then grouped by its ``Cluster`` column.

    Returned columns (index: ``Places``, taken from the module-level
    ``position['Clusters from Data']``):

    * ``photo_views``        - sum of ``views`` per cluster
    * ``num_uploaders``      - number of distinct ``owner`` values per cluster
    * ``num_of_photos``      - number of rows per cluster
    * ``avg_view_per_user``  - ``photo_views / num_uploaders``
    * ``avg_view_per_photo`` - ``photo_views / num_of_photos``
    * ``total_attr``         - ``num_of_photos * avg_view_per_user``
    * ``total_attr_log``     - ``log(total_attr + 1)``

    Missing values are replaced by 0 before the log column is computed.
    """
    monthly = subset_data(df, month)
    per_cluster = monthly.groupby(['Cluster'])

    table = pd.DataFrame()
    table['Places'] = position['Clusters from Data'].values

    # NOTE(review): the grouped Series below are aligned on the table's
    # 0..n-1 positional index, so this presumably relies on ``Cluster``
    # holding those integer positions -- confirm against the data. Any
    # cluster label that does not match ends up as NaN and is zero-filled.
    table['photo_views'] = per_cluster['views'].agg('sum')
    table['num_uploaders'] = per_cluster['owner'].nunique()
    table['num_of_photos'] = per_cluster.size()

    views = table['photo_views']
    table['avg_view_per_user'] = views / table['num_uploaders']
    table['avg_view_per_photo'] = views / table['num_of_photos']
    table['total_attr'] = table['num_of_photos'] * table['avg_view_per_user']

    table = table.fillna(0)
    # +1 keeps the log finite for places with zero attractiveness.
    table['total_attr_log'] = np.log(table['total_attr'] + 1)
    return table.set_index('Places')

In [2]:
# to include the neighboring effect
# select K neighbors
def neighbors(dest, dist_matrix, K):
    """Return the labels of the ``K`` places closest to ``dest``.

    Parameters
    ----------
    dest : label
        Row/column label of the place whose neighbours are wanted.
    dist_matrix : pandas.DataFrame
        Square distance matrix labelled by place on both axes; distances are
        read from the ``dest`` row.
    K : int
        Number of neighbours to return. Fewer are returned when the matrix
        holds fewer than ``K`` other places.

    Returns
    -------
    numpy.ndarray
        Neighbour labels ordered from nearest to farthest.
    """
    from_dest = dist_matrix.loc[dest]
    # Remove ``dest`` explicitly instead of assuming it sorts first: another
    # place at distance 0 could otherwise push ``dest`` into its own
    # neighbour list.
    others = from_dest.drop(dest)
    # Ask for exactly K rows; a fixed pre-selection size would silently cap K.
    return others.nsmallest(K).index.values

# calculate centrality score based on K neighbors, attraction matrix and distance matrix
def centrality(dest, attr_matrix, K):
    """Centrality score of ``dest`` derived from its ``K`` nearest neighbours.

    The score is ``sum_p(total_attr_log[p] / d(dest, p)) / sum_p(d(dest, p))``
    over the neighbours ``p`` returned by ``neighbors``. Distances come from
    the module-level ``dist_matrix``.

    Parameters
    ----------
    dest : label
        Place to score.
    attr_matrix : pandas.DataFrame
        Attractiveness table indexed by place, with a ``total_attr_log`` column.
    K : int
        Number of neighbours to include.

    Returns
    -------
    float
        The centrality score, or 0 when there are no neighbours.
    """
    neighbor_lst = neighbors(dest, dist_matrix, K)
    if len(neighbor_lst) == 0:
        # Nothing to accumulate; also avoids a 0/0 below.
        return 0

    dist_from_dest = dist_matrix.loc[dest]
    weighted_attr = 0
    total_dist = 0
    for p in neighbor_lst:
        d = dist_from_dest[p]
        weighted_attr += attr_matrix.loc[p]['total_attr_log'] / d
        total_dist += d

    # Normalise once, after every neighbour has been accumulated. Dividing
    # inside the loop re-divided the earlier terms by each later running
    # total, so the score depended on the accumulation order.
    # NOTE(review): the intended formula is not documented in this notebook;
    # confirm that a single normalisation by total distance is what is meant.
    return weighted_attr / total_dist

#### Ordinary Least Squares (OLS) Calibration

In [3]:
def getComplement(item, lst):
    """Return a new list with every element of ``lst`` that is not equal to ``item``.

    Order is preserved and all occurrences of ``item`` are removed.
    """
    return [candidate for candidate in lst if candidate != item]

# OLS dependent variable
def read_actual(pmatrix, origin):
    """Read the observed matrix and return the ``origin`` row scaled by its mean.

    Parameters
    ----------
    pmatrix : str or path-like
        CSV file whose first column is the row index; the ``origin`` row is used.
    origin : label
        Row to read.

    Returns
    -------
    list
        One value per destination (every place in the module-level
        ``position['Clusters from Data']`` except ``origin``), each divided by
        the mean of the whole ``origin`` row.

    Notes
    -----
    When every destination label is also a column of the CSV, values are
    looked up by label so that entry ``i`` belongs to the ``i``-th
    destination. Taking the first ``len(dests)`` columns by position shifts
    every destination listed after ``origin`` by one whenever the matrix has
    a column for ``origin`` itself. If the labels do not match, the positional
    lookup is kept as a fallback.
    NOTE(review): the CSV layout is not visible from this notebook -- confirm
    that its columns are labelled by place name.
    """
    places = position['Clusters from Data'].values
    dests = getComplement(origin, places)
    actual_pmatrix = pd.read_csv(pmatrix, index_col=0)

    row = actual_pmatrix.loc[origin]
    row_mean = np.mean(row)  # loop-invariant, computed once

    if all(d in row.index for d in dests):
        observed = [row[d] for d in dests]
    else:
        row_values = row.values
        observed = [row_values[i] for i in range(len(dests))]

    return [value / row_mean for value in observed]

# OLS independent variables
# attractiveness (including Social Influence), distance, centrality
def log_transform_x(origin,K,month):
    """Build the log-transformed regressors for one origin and month.

    For every destination (each place in the module-level
    ``position['Clusters from Data']`` except ``origin``) three ratios are
    formed and logged:

    * ``x1`` - ``log(total_attr_log / mean(total_attr_log) + 1)``
    * ``x2`` - ``log(distance(origin, dest) / mean(distance row of origin))``
    * ``x3`` - ``log(centrality(dest) / mean centrality over all destinations + 1)``

    NaN results are replaced via ``np.nan_to_num``.

    Reads the module-level ``df``, ``dist_matrix`` and ``position``.

    Parameters
    ----------
    origin : label
        Origin place.
    K : int
        Number of neighbours used by ``centrality``.
    month :
        Passed through to ``attr_matrix``.

    Returns
    -------
    pandas.DataFrame
        Columns ``x1``, ``x2``, ``x3``; one row per destination, in
        destination order.
    """
    places = position['Clusters from Data'].values
    dests = getComplement(origin, places)
    attr_mat = attr_matrix(df, month)

    mean_attr = np.mean(attr_mat['total_attr_log'])
    dist_from_origin = dist_matrix.loc[origin]
    mean_dist = np.mean(dist_from_origin)

    # Evaluate centrality once per destination, then normalise by the mean
    # over ALL destinations. Normalising inside the accumulation loop used a
    # partial running total, so each row was scaled by a different,
    # order-dependent denominator (unlike x1 and x2, which use full means).
    cent = np.array([centrality(dest, attr_mat, K) for dest in dests], dtype=float)
    mean_cent = cent.mean() if cent.size else 1.0

    X1 = [attr_mat.loc[dest]['total_attr_log'] / mean_attr + 1 for dest in dests]
    X2 = [dist_from_origin[dest] / mean_dist for dest in dests]
    X3 = cent / mean_cent + 1

    var_table = pd.DataFrame()
    var_table['x1'] = np.nan_to_num(np.log(X1))
    var_table['x2'] = np.nan_to_num(np.log(X2))
    var_table['x3'] = np.nan_to_num(np.log(X3))
    return var_table

## Fit an OLS model on the three log-transformed predictors.
# NOTE(review): ``df_allmonth`` and ``Y_res`` are not defined in any cell
# visible here; they must already exist in the kernel. This cell also rebinds
# ``df_allmonth`` to the filtered frame.
df_allmonth['Y'] = Y_res
# Keep only observations with a strictly positive response.
df_allmonth = df_allmonth.loc[df_allmonth['Y'] > 0]

Y = df_allmonth['Y']
X = df_allmonth[['x1', 'x2', 'x3']]

# No constant column is added to X, so the model is fitted without an intercept.
results = sm.OLS(Y, X).fit()
print(results.summary())

In [None]:
# Classic Huff model (with time), used as the comparison baseline.
def Huff_Model(df, origin, dest, alpha, beta, month):
    """Share of ``dest`` among all destinations reachable from ``origin``.

    The utility of a place is
    ``total_attr_log ** alpha * distance(origin, place) ** beta``, with
    attractiveness taken from ``attr_matrix(df, month)`` and distances from the
    module-level ``dist_matrix``.

    Returns the utility of ``dest`` (0 when its distance from ``origin`` is
    not positive) divided by the summed utilities of every place in the
    module-level ``position['Clusters from Data']`` other than ``origin``.
    """
    places = position['Clusters from Data'].values
    dests = getComplement(origin, places)
    attr_mat = attr_matrix(df, month)

    def utility(place):
        # attractiveness^alpha * distance^beta for a single place
        return (attr_mat.loc[place]['total_attr_log']**alpha) * (dist_matrix.loc[origin][place]**beta)

    if dist_matrix.loc[origin][dest] > 0:
        numer = utility(dest)
    else:
        numer = 0

    denom = sum(utility(candidate) for candidate in dests)
    return numer/denom

## SA-Huff Model with time and centrality
def Huff_Model_with_centrality(df, origin, dest, alpha, beta, theta, month, K):
    """Share of ``dest`` among all destinations, including a centrality term.

    The utility of a place is
    ``total_attr_log ** alpha * distance(origin, place) ** beta
    * centrality(place, attr_mat, K) ** theta``, with attractiveness taken from
    ``attr_matrix(df, month)`` and distances from the module-level
    ``dist_matrix``.

    Returns the utility of ``dest`` (0 when its distance from ``origin`` is
    not positive) divided by the summed utilities of every place in the
    module-level ``position['Clusters from Data']`` other than ``origin``.
    """
    places = position['Clusters from Data'].values
    dests = getComplement(origin, places)
    attr_mat = attr_matrix(df, month)

    def utility(place):
        # attractiveness^alpha * distance^beta * centrality^theta for one place
        return attr_mat.loc[place]['total_attr_log']**alpha * (dist_matrix.loc[origin][place]**beta) * centrality(place, attr_mat, K)**theta

    if dist_matrix.loc[origin][dest] > 0:
        numer = utility(dest)
    else:
        numer = 0

    denom = sum(utility(candidate) for candidate in dests)
    return numer/denom