In [None]:
# Split the raw data into city level data

import pandas as pd
import numpy as np
import pickle

# Load the first POI ("spots") subset; later cells read its 'lat', 'lng',
# 'checkins_count' and 'id' columns.
poi_data_1 = pd.read_csv('yongliu_gowalla_data/gowalla_spots_subset1.csv')
print(poi_data_1.shape[0], 'rows of data')
poi_data_1

In [None]:
import chardet

# Sniff the encoding from the first 200,000 bytes of the file
with open('yongliu_gowalla_data/gowalla_spots_subset2.csv', 'rb') as f:
    raw_data = f.read(200000)
result = chardet.detect(raw_data)

# Report what chardet found
encoding = result['encoding']
print(f"Detected encoding: {encoding}")

# NOTE: the detected value is only printed; the file is always read with the
# fixed 'Windows-1252' encoding below.
poi_data_2 = pd.read_csv(
    'yongliu_gowalla_data/gowalla_spots_subset2.csv',
    encoding='Windows-1252',
)
print(poi_data_2.shape[0], 'rows of data')
poi_data_2

# The row counts of poi_data_1 and poi_data_2 should be the same, which indicates that the two subsets are consistent (compare the two counts printed above)

In [None]:
# For every distinct, non-missing 'city_state' value keep the text before the
# first comma (the city part); missing values (NaN) are skipped.
city_all = [
    city_state.split(',')[0]
    for city_state in set(poi_data_2['city_state'])
    if not pd.isna(city_state)
]
len(city_all)

In [None]:
# Chicago selection
# POIs are selected by latitude/longitude: only those within 25 km of the
# city-centre reference point below are kept.
# (Berlin reference point, not used here: Berlin Cathedral,
#  52.5190838018783, 13.401522103237626)
chi_lat_lon_center = (
    41.87950259199219,  # latitude  - Art Institute of Chicago
    -87.6225409728181,  # longitude - Art Institute of Chicago
)

# Formula for calculating distance based on longitude and latitude
import math

def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    Args:
        lat1: Latitude of the first point, in degrees.
        lon1: Longitude of the first point, in degrees.
        lat2: Latitude of the second point, in degrees.
        lon2: Longitude of the second point, in degrees.

    Returns:
        The haversine distance in km, computed with an Earth radius of
        6371.0 km. A NaN coordinate yields a NaN distance.
    """
    # Convert latitude and longitude from degrees to radians
    lat1 = math.radians(lat1)
    lon1 = math.radians(lon1)
    lat2 = math.radians(lat2)
    lon2 = math.radians(lon2)

    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = math.sin(dlat / 2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2)**2

    # Floating-point rounding can leave `a` a hair above 1 for (near-)antipodal
    # points, which would make sqrt(1 - a) raise ValueError. The comparison is
    # False for NaN, so a NaN value is left untouched and still propagates.
    if a > 1.0:
        a = 1.0

    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    # Radius of Earth in kilometers. Use 3956 for miles. Determines return value units.
    R = 6371.0

    # The distance returned is in km
    return R * c

# Attach to every POI its distance (km) to the Chicago reference point
poi_data_1['distance_to_center'] = poi_data_1.apply(
    lambda poi: haversine(poi['lat'], poi['lng'], *chi_lat_lon_center),
    axis=1,
)

# Keep only the POIs that lie no farther than 25 km from the centre and
# renumber the surviving rows from 0
chi_filtered_poi_1 = (
    poi_data_1
    .loc[lambda frame: frame['distance_to_center'] <= 25]
    .reset_index(drop=True)
)
len(chi_filtered_poi_1)

In [None]:
# Drop the POIs whose 'checkins_count' is below 10. (For the Berlin data this
# step is not needed, because the total amount is not large.)
chi_filtered_poi_1 = chi_filtered_poi_1.loc[chi_filtered_poi_1['checkins_count'] >= 10]
chi_filtered_poi_1_list = chi_filtered_poi_1['id'].tolist()
len(chi_filtered_poi_1_list)

In [None]:
# Ids of the retained Chicago POIs; users are found through the check-ins
# recorded at these POIs.
poi_list = chi_filtered_poi_1['id'].tolist()

# Load the full check-in log
check_in_data = pd.read_csv('yongliu_gowalla_data/gowalla_checkins.csv')
check_in_data

In [None]:
# Restrict the check-in log to check-ins whose 'placeid' is one of the
# retained Chicago POIs
check_in_chi = check_in_data.loc[check_in_data['placeid'].isin(poi_list)]
print(check_in_chi.shape[0])
check_in_chi

In [None]:
# Distinct users with at least one check-in at a retained Chicago POI
user_list = [*set(check_in_chi['userid'])]
len(user_list)

In [None]:
# Load the friendship table; the filtering in the following cells reads its
# 'userid1' and 'userid2' columns.
friend_ship = pd.read_csv('yongliu_gowalla_data/gowalla_friendship.csv')
friend_ship

In [None]:
# Keep only the friendship rows whose two endpoints are both in user_list,
# then renumber the surviving rows from 0
filtered_friend_ship = friend_ship.loc[
    friend_ship['userid1'].isin(user_list) & friend_ship['userid2'].isin(user_list)
].reset_index(drop=True)
print(filtered_friend_ship.shape[0], 'friendship edges')

In [None]:
# Number of friendship rows left after the user filter
len(filtered_friend_ship)

In [None]:
# Display the filtered friendship table
filtered_friend_ship

In [None]:
# Create a new column 'user_pair': the two user ids of a row as a sorted tuple,
# so that (a, b) and (b, a) map to the same key
filtered_friend_ship['user_pair'] = filtered_friend_ship.apply(lambda row: tuple(sorted([row['userid1'], row['userid2']])), axis=1)

# Every row whose 'user_pair' occurs more than once (keep=False marks all copies)
duplicates = filtered_friend_ship[filtered_friend_ship.duplicated('user_pair', keep=False)]

# If duplicates exist, output the duplicate records
if not duplicates.empty:
    print("Duplicate user pairs exist:", len(duplicates))  # Indicates that this friendship is stored bilaterally
    print(duplicates)
else:
    print("No duplicate user pairs.")

# Count the distinct pairs directly. len(duplicates)/2 is only correct when
# every friendship is stored exactly twice, and it prints as a float.
print(filtered_friend_ship['user_pair'].nunique(), " unique pairs, as a hypergraph does not require duplicate pairs")


In [None]:
# One row per friendship: the first row of each 'user_pair' stays, every later
# repeat of the same pair is removed
friend_ship_unique = filtered_friend_ship.loc[
    ~filtered_friend_ship.duplicated('user_pair', keep='first')
]
len(friend_ship_unique)

In [None]:
import itertools
# Every user id that occurs as either endpoint of a de-duplicated friendship row
user_list_in_friend_ship = list(
    set(
        itertools.chain(
            friend_ship_unique['userid1'],
            friend_ship_unique['userid2'],
        )
    )
)
print(len(user_list_in_friend_ship), 'appeared in the friend network')

In [None]:
# Second pass over the Chicago check-ins: keep only those made by users that
# appear in the de-duplicated friendship network
check_in_chi_user_in_friend = check_in_chi.loc[
    check_in_chi['userid'].isin(user_list_in_friend_ship)
]
print(check_in_chi_user_in_friend.shape[0], 'check-in records, filtered')

In [None]:
# Display the check-ins that remain after the friendship-network filter
check_in_chi_user_in_friend

In [None]:
# Save the filtered Chicago check-ins and the de-duplicated friendship rows.
# to_csv does not create missing directories, so make sure the output folder
# exists before writing.
import os

os.makedirs('yongliu_gowalla_data/Chicago', exist_ok=True)
check_in_chi_user_in_friend.to_csv('yongliu_gowalla_data/Chicago/check_in_chi_user_in_friend.csv', index=False)
friend_ship_unique.to_csv('yongliu_gowalla_data/Chicago/friend_ship_chi.csv', index=False)

In [None]:
# Final POI pass: keep only the Chicago POIs whose 'id' still occurs as a
# 'placeid' in the filtered check-ins, report the count and save the result
chi_filtered_poi_1 = chi_filtered_poi_1.loc[
    chi_filtered_poi_1['id'].isin(set(check_in_chi_user_in_friend['placeid']))
]
print(
    chi_filtered_poi_1.shape[0],
    "POIs appeared in the check-in records, and the users of these check-ins have a friendship network",
)
chi_filtered_poi_1.to_csv(
    'yongliu_gowalla_data/Chicago/chi_poi_incheckin_and_friend.csv',
    index=False,
)