In [5]:
import pandas as pd
import numpy as np
from math import radians, sin, cos, sqrt, atan2

def calculate_distances(listings_df, venues_df, output_file):
    """Attach the nearest venue and its distance to every listing, then save.

    For each row of ``listings_df`` the great-circle (Haversine) distance to
    every row of ``venues_df`` is computed and the closest venue is kept.

    Args:
        listings_df: DataFrame with ``latitude`` and ``longitude`` columns
            (degrees). It is modified in place: the columns
            ``closest_venue`` and ``distance_to_closest_venue`` are added
            (or overwritten).
        venues_df: DataFrame with ``venue``, ``latitude`` and ``longitude``
            columns (degrees). Its index is not used, so duplicate or
            non-default index labels are fine.
        output_file: Path of the CSV file the updated listings are written
            to (without the index).

    Returns:
        None. Results are delivered through the mutated ``listings_df`` and
        the written CSV file.

    Raises:
        KeyError: If a required column is missing from either frame.
        ValueError: If there are listings but ``venues_df`` has no rows.

    Notes:
        Distances are in kilometres. Venues with missing coordinates are
        ignored. A listing whose own coordinates are missing (or for which
        no venue has usable coordinates) gets a missing ``closest_venue``
        and a NaN distance instead of aborting the whole run.
    """
    earth_radius_km = 6371  # Earth's radius in kilometers

    def haversine_distance(lat1, lon1, lat2, lon2):
        """Vectorised Haversine distance in km; inputs in degrees, broadcastable."""
        lat1, lon1, lat2, lon2 = (
            np.radians(np.asarray(value, dtype=float))
            for value in (lat1, lon1, lat2, lon2)
        )
        dlat = lat2 - lat1
        dlon = lon2 - lon1

        a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
        # Rounding can push `a` marginally outside [0, 1], which would make
        # sqrt(1 - a) invalid; clamp it (NaN passes through unchanged).
        a = np.clip(a, 0.0, 1.0)
        c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
        return earth_radius_km * c

    # Pull the coordinates out once as plain float arrays; this also raises
    # KeyError early when a required column is missing.
    listing_lat = listings_df['latitude'].to_numpy(dtype=float, na_value=np.nan)
    listing_lon = listings_df['longitude'].to_numpy(dtype=float, na_value=np.nan)
    venue_lat = venues_df['latitude'].to_numpy(dtype=float, na_value=np.nan)
    venue_lon = venues_df['longitude'].to_numpy(dtype=float, na_value=np.nan)
    venue_names = venues_df['venue'].to_numpy(dtype=object)

    n_listings = len(listing_lat)
    if n_listings and not len(venue_lat):
        raise ValueError("venues_df is empty; cannot determine a closest venue")

    # Running minimum over venues: one vectorised pass per venue keeps memory
    # at O(n_listings) instead of building an n_listings x n_venues matrix.
    best_distance = np.full(n_listings, np.inf)
    best_position = np.full(n_listings, -1, dtype=np.intp)
    for position, (v_lat, v_lon) in enumerate(zip(venue_lat, venue_lon)):
        candidate = haversine_distance(listing_lat, listing_lon, v_lat, v_lon)
        # Strict `<` keeps the first venue on ties and skips NaN distances,
        # since any comparison with NaN is False.
        closer = candidate < best_distance
        best_distance[closer] = candidate[closer]
        best_position[closer] = position

    # Positional lookup of the venue name; listings without any valid
    # distance keep a missing venue and get a NaN distance.
    matched = best_position >= 0
    closest_venues = np.full(n_listings, None, dtype=object)
    closest_venues[matched] = venue_names[best_position[matched]]
    best_distance[~matched] = np.nan

    # Add closest venue and distance to listings dataframe
    listings_df['closest_venue'] = closest_venues
    listings_df['distance_to_closest_venue'] = best_distance

    # Save updated listings to CSV
    listings_df.to_csv(output_file, index=False)

    print(f"Updated listings saved to {output_file}")

# Input and output locations for the nearest-venue enrichment step.
LISTINGS_CSV = 'raw_data/merged/merged_listings.csv'
VENUES_CSV = 'paris_2024_data/venues_with_corrected_neighborhoods_and_capacity.csv'
SUMMARY_CSV = 'raw_data/merged/summary_venues.csv'

# Read the merged listings and the venue table from disk.
listing_df = pd.read_csv(LISTINGS_CSV)
venue_df = pd.read_csv(VENUES_CSV)

# Annotate each listing with its closest venue and write the result to SUMMARY_CSV.
calculate_distances(listing_df, venue_df, SUMMARY_CSV)

Updated listings saved to raw_data/merged/summary_venues.csv
