In [18]:
import pandas as pd
import numpy as np
import os
import geopandas as gpd
import matplotlib.pyplot as plt
import contextily as ctx
from sklearn.cluster import DBSCAN

In [8]:
# Load the data
df = pd.read_csv('5G - Passive Measurements.csv')
df = df.sample(frac=0.1, random_state=1)

# Parse the 'Time' column to get the hour
df['Hour'] = pd.to_datetime(df['Time'], format='%H:%M:%S.%f').dt.hour

# Convert Latitude and Longitude to GeoDataFrame
df = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.Longitude, df.Latitude), crs="EPSG:4326")

In [69]:
# Plot for each hour and save as an image
for hour in sorted(df['Hour'].unique()):
    hourly_data = df[df['Hour'] == hour]
    
    # Create a plot for the hour
    fig, ax = plt.subplots(figsize=(10, 6))
    hourly_data.plot(ax=ax, color='red', alpha=0.6, markersize=12)
    
    # Set fixed x and y axis limits
    ax.set_xlim([12.28, 12.63])  # Longitude range
    ax.set_ylim([41.78, 42])  # Latitude range
    
    # Add OSM tiles as the background
    ctx.add_basemap(ax, crs=hourly_data.crs.to_string(), source=ctx.providers.OpenStreetMap.Mapnik)
    
    # Customize plot
    plt.title(f'OpenStreetMap Plot for Hour {hour}')
    plt.xlabel('Longitude')
    plt.ylabel('Latitude')
    
    # Save each plot as an image
    plt.savefig(f'/Users/amangupta/Desktop/osm_plot_{hour}.png')
    plt.close()


In [15]:
def apply_dbscan(df, sample_size=None, random_state=None):
    hourly_medians = {}

    if 'Hour' not in df.columns:
        raise ValueError("DataFrame must contain 'Hour' column")

    for hour in sorted(df['Hour'].unique()):
        hourly_data = df[df['Hour'] == hour]

        
        if sample_size and len(hourly_data) > sample_size:
            hourly_data = hourly_data.sample(n=sample_size, random_state=random_state, replace=True)

        
        if len(hourly_data) < 2:
            print(f"Not enough data for hour {hour}, skipping.")
            continue
        
        coordinates = hourly_data[['Longitude', 'Latitude']].values
        
        try:
            db = DBSCAN(eps=0.01, min_samples=100).fit(coordinates)
            hourly_data['Cluster'] = db.labels_ 

            median_values = hourly_data.groupby('Cluster').agg(
                Median_Latitude=('Latitude', 'median'),
                Median_Longitude=('Longitude', 'median'),
                Count=('Cluster', 'size')
            ).reset_index()
            
            
            hourly_medians[hour] = median_values
            
        except Exception as e:
            print(f"Error processing hour {hour}: {e}")
            continue

    return hourly_medians

hourly_median_results = apply_dbscan(df, sample_size=None, random_state=42)

for hour, medians in hourly_median_results.items():
    print(f"Hour: {hour}, Medians:\n{medians}\n")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = 

Hour: 7, Medians:
   Cluster  Median_Latitude  Median_Longitude  Count
0        0         41.87177          12.46557  11664

Hour: 8, Medians:
   Cluster  Median_Latitude  Median_Longitude  Count
0        0        41.898280         12.498160  29620
1        1        41.871405         12.464336  14008

Hour: 9, Medians:
   Cluster  Median_Latitude  Median_Longitude  Count
0        0        41.871405         12.464265  32760
1        1        41.893390         12.494340  30930
2        2        41.876226         12.521511   3152

Hour: 10, Medians:
   Cluster  Median_Latitude  Median_Longitude  Count
0        0        41.893497          12.49415  64059
1        1        41.872525          12.52572  34544
2        2        41.871850          12.46368  37748

Hour: 11, Medians:
   Cluster  Median_Latitude  Median_Longitude  Count
0        0         41.87140         12.464240  40286
1        1         41.87489         12.523029  18117
2        2         41.89325         12.494440  29322

Ho

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [35]:
for hour, medians_values in hourly_median_results.items():
    total_count = median_values['Count'].sum()
    medians_values['Percentage'] = median_values['Count']/total_count
    fig, ax = plt.subplots(figsize=(10, 8))
    # Plot each cluster point
    for _, row in medians_values.iterrows():
        ax.scatter(
            row['Median_Longitude'], 
            row['Median_Latitude'], 
            s=row['Percentage'] * 10000,  # Scale size, adjust as needed
            alpha=0.3,  # Transparency
            color='blue'
        )

    # Set limits
    ax.set_xlim([12.28, 12.63])  # Longitude range
    ax.set_ylim([41.78, 42])      # Latitude range

    # Add OSM basemap
    ctx.add_basemap(ax, crs="EPSG:4326", source=ctx.providers.OpenStreetMap.Mapnik)

    # Add titles and labels
    ax.set_title(f'Cluster Points on OSM Map - Hour {hour}')
    ax.set_xlabel('Longitude')
    ax.set_ylabel('Latitude')

    plt.savefig(os.path.join( f'combined_plot_hour_{hour}.png'))
    plt.close()  # Close the plot to avoid display

In [36]:
for hour in sorted(df['Hour'].unique()):
    hourly_data = df[df['Hour'] == hour]
    
    # Create a plot for the hour
    fig, ax = plt.subplots(figsize=(10, 6))
    hourly_data.plot(ax=ax, color='red', alpha=0.6, markersize=12)

    for _, row in hourly_median_results[hour].iterrows():
        ax.scatter(
            row['Median_Longitude'], 
            row['Median_Latitude'], 
            s=row['Percentage'] * 10000,  # Scale size, adjust as needed
            alpha=0.3,  # Transparency
            color='blue'
        )
    # Set fixed x and y axis limits
    ax.set_xlim([12.28, 12.63])  # Longitude range
    ax.set_ylim([41.78, 42])  # Latitude range

    
    # Add OSM tiles as the background
    ctx.add_basemap(ax, crs=hourly_data.crs.to_string(), source=ctx.providers.OpenStreetMap.Mapnik)
    
    # Customize plot
    plt.title(f'OpenStreetMap Plot for Hour {hour}')
    plt.xlabel('Longitude')
    plt.ylabel('Latitude')
    
    # Save each plot as an image
    plt.savefig(f'combined_plot_{hour}.png')
    plt.close()