#### Extracting trip sequences from geotagged photos

In [1]:
# input is the data retrieved using Flickr API after clustering
# each photo has been assigned a cluster_id

from datetime import timedelta
import collections

# preprocess: sort values and set owners as index
def clean_data(input):
    """Keep the trip-relevant columns, sort them and index the rows by owner.

    Parameters
    ----------
    input : pandas.DataFrame
        Photo records containing at least the columns ``owner``, ``year``,
        ``month``, ``date`` and ``Cluster``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns ``year``, ``month``, ``date`` and
        ``Cluster``, sorted ascending by ``owner``, ``year``, ``month``,
        ``date`` and indexed by ``owner``. The frame passed in is not
        modified.
    """
    columns = ['owner', 'year', 'month', 'date', 'Cluster']
    # Chain non-inplace operations: sorting / re-indexing a column subset
    # in place can raise pandas' SettingWithCopyWarning and leaves a frame
    # that warns again on every later column assignment.
    return (
        input[columns]
        .sort_values(['owner', 'year', 'month', 'date'], ascending=True)
        .set_index(['owner'])
    )


# construct trips by building the temporally ordered sequence of photos for each user
# a time threshold of 4 days is used to distinguish separate trips from the same user
def split_trip(input, max_gap_days=4):
    """Annotate every photo with the gap to the previous photo and a trip id.

    Parameters
    ----------
    input : pandas.DataFrame
        Photo records indexed by owner (first index level) and holding a
        ``date`` column. Rows of one owner are expected to be in
        chronological order.
    max_gap_days : int, optional
        A gap of more than this many days between two consecutive photos of
        the same owner starts a new trip. Defaults to 4, the threshold the
        original code hard-coded.

    Returns
    -------
    pandas.DataFrame
        The same ``input`` object, with two columns added in place:
        ``duration`` (whole days since the owner's previous photo, 0 for the
        owner's first photo) and ``trip_id`` (1-based trip counter that
        restarts for every owner).
    """
    # Local import so the function does not rely on a module-level alias.
    import pandas as pd

    # Normalise to datetime64 so that columns of datetime.date objects and
    # genuine timestamp columns are both handled; subtracting raw
    # numpy datetime64 values yields timedelta64, which has no `.days`.
    dates = pd.to_datetime(input['date'])

    # Per-owner difference to the previous row; the first row of each owner
    # has no predecessor (NaT) and is reported as a gap of 0 days.
    gap_days = (
        dates.groupby(level=0, sort=False)
        .diff()
        .dt.days
        .fillna(0)
        .astype(int)
    )

    # Every gap above the threshold opens a new trip; the running count of
    # such gaps per owner, plus one, is the trip id.
    starts_new_trip = (gap_days > max_gap_days).astype(int)
    trip_number = starts_new_trip.groupby(level=0, sort=False).cumsum() + 1

    # Assign positionally: the owner index contains duplicates, so aligning
    # on labels is not an option.
    input['duration'] = gap_days.to_numpy()
    input['trip_id'] = trip_number.to_numpy()
    return input

# form dataframe for constructed trips, i.e., od: [a,a,a,b,c]
def get_OD_trips(input):
    """Collapse photo rows into one row per trip.

    Parameters
    ----------
    input : pandas.DataFrame
        Photo records with a MultiIndex whose first level is the user and
        whose second level is the trip id, and with the columns ``month``
        and ``Cluster``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct index key, in order of first appearance, with
        the columns ``user``, ``month`` (month of the trip's first row),
        ``trip_id`` and ``od`` (array of the trip's ``Cluster`` values in
        row order, consecutive duplicates included, e.g. [a, a, a, b, c]).
    """
    users, months, trip_ids, ods = [], [], [], []

    # A single groupby pass replaces one `.loc` lookup per key. It also
    # handles trips that consist of a single photo, for which `.loc`
    # returns a Series and `['month'].values[0]` fails.
    all_levels = list(range(input.index.nlevels))
    for key, photos in input.groupby(level=all_levels, sort=False):
        users.append(key[0])
        months.append(photos['month'].values[0])
        trip_ids.append(key[1])
        ods.append(photos['Cluster'].values)

    trips = pd.DataFrame()
    trips['user'] = users
    trips['month'] = months
    trips['trip_id'] = trip_ids
    trips['od'] = ods
    return trips

# removing consecutive duplicates from the od list, i.e., u_od: [a,b,c]
from itertools import groupby  
def get_unique_OD_trips(input):
    """Add a ``u_od`` column holding each trip's sequence without repeats.

    Consecutive duplicates in ``od`` are collapsed, e.g. [a, a, a, b, c]
    becomes [a, b, c]; non-adjacent repeats such as [a, b, a] are kept.

    Parameters
    ----------
    input : pandas.DataFrame
        Trip table with an ``od`` column of iterable cluster sequences.

    Returns
    -------
    pandas.DataFrame
        The same ``input`` object with the list-valued column ``u_od``
        added in place.
    """
    # Build the whole column at once. Element-wise chained assignment
    # (`input['u_od'][i] = ...`) may write to a temporary copy, and looking
    # rows up by `input['od'][i]` only works with a default 0..n-1 index.
    input['u_od'] = [
        [cluster for cluster, _ in groupby(sequence)]
        for sequence in input['od']
    ]
    return input

#### Calculating visiting probabilities from trip sequences

In [None]:
# get trip segments from sequences i.e., [a,b] and [b,c] from [a,b,c]
from itertools import permutations

# One lowercase letter per cluster found in the module-level `input` frame.
num_cluster = input.Cluster.nunique()
alphabet_string = string.ascii_lowercase
alphabet_list = list(alphabet_string[:num_cluster])
key = alphabet_list

# Every ordered pair of distinct cluster letters, as two-element lists.
output = [list(pair) for pair in permutations(key, 2)]

# Row-major labels of a num_cluster x num_cluster grid: "x, y" for the
# segment x -> y, and the bare letter on the diagonal.
outstr = [
    origin if origin == destination else ', '.join((origin, destination))
    for origin in key
    for destination in key
]
    
# count number of trips from the segments
def get_flow(df, keys):
    """Count how often each key occurs across all trip sequences.

    Parameters
    ----------
    df : pandas.DataFrame
        Trip table with a ``u_od`` column. Each entry is either a sequence
        of cluster labels (e.g. ``['a', 'b', 'c']``) or a string in which
        the labels are joined by ``', '`` (e.g. ``'a, b, c'``).
    keys : list
        Labels to count. A key of the form ``'x, y'`` denotes the trip
        segment x -> y; any other key denotes a single cluster.

    Returns
    -------
    pandas.DataFrame
        One row per distinct key (index) and a single column ``0`` holding
        the total count over all trips.
    """
    dicts = dict.fromkeys(keys, 0)
    for values in df['u_od']:
        if isinstance(values, str):
            # String sequences: substring counting already matches segments.
            for k in keys:
                dicts[k] += values.count(k)
            continue

        # Label sequences: `list.count('x, y')` can never match a pair of
        # neighbouring labels, so count consecutive pairs explicitly.
        sequence = list(values)
        visits = collections.Counter(sequence)
        segments = collections.Counter(zip(sequence, sequence[1:]))
        for k in keys:
            parts = k.split(', ') if isinstance(k, str) else [k]
            if len(parts) == 2:
                dicts[k] += segments[tuple(parts)]
            else:
                dicts[k] += visits[k]
    res = pd.DataFrame.from_dict(dicts, orient='index')
    return res

In [None]:
# construct flow matrix based on trip segments
# calculate number of incoming, outgoing and cross_boundary trips
def flow_matrix(df):
    """Arrange flat segment counts into a square origin-destination table.

    Relies on the module-level names ``num_cluster``, ``alphabet_list`` and
    ``position``.

    Parameters
    ----------
    df : pandas.DataFrame
        Flat counts in row-major order, i.e. entry ``j + num_cluster * i``
        is the number of trips from cluster ``i`` to cluster ``j``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``Places`` (taken from ``position['Clusters from Data']``)
        with one column per cluster letter plus ``total_out`` (row sums),
        ``total_in`` (column sums) and ``cross_boundary`` (their sum).
        Diagonal entries are forced to 0.
    """
    dim = num_cluster

    # Flatten before reshaping: for a one-column frame `df.values[k]` is a
    # length-1 array, and storing that in a scalar cell relies on an
    # array-to-scalar conversion that NumPy has deprecated.
    counts = np.asarray(df.values).reshape(-1)[:dim * dim]
    matrix = counts.reshape(dim, dim).astype(int)
    # Entries on the diagonal are single-cluster counts, not flows.
    np.fill_diagonal(matrix, 0)

    res = pd.DataFrame(data=matrix, columns=alphabet_list)
    res['total_out'] = res.sum(axis=1)
    # Column sums of the flow columns only; `total_out` is sliced away.
    res['total_in'] = res.sum(axis=0)[:dim].values
    res['cross_boundary'] = res.loc[:, 'total_out'].values + res.loc[:, 'total_in'].values
    res['Places'] = position['Clusters from Data'].values  # get cluster names
    res = res.set_index('Places')
    return res

# calculate visiting probabilities from the flow matrix
def prob_matrix(df):
    """Turn a flow table into row-wise visiting probabilities.

    Each of the first ``df.shape[0]`` columns is divided by the row's
    ``total_out``; undefined ratios (NaN) are replaced by 0. The result is
    restricted to the columns named in the module-level ``alphabet_list``
    and indexed by ``Places`` taken from ``position['Clusters from Data']``.
    """
    n_places = df.shape[0]
    flows = df.iloc[:, :n_places]
    shares = flows.div(df.total_out, axis=0).fillna(0)

    result = pd.DataFrame(shares, columns=alphabet_list)
    result['Places'] = position['Clusters from Data'].values
    return result.set_index('Places')