# Libraries

In [8]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import geopandas as gpd
import geopandas as gpd
from shapely.geometry import Point

# From CSV to Parquet (no need to rerun)

In [None]:


# 1. Read your large CSV in chunks (optional, but safer for 14 GB):
reader = pd.read_csv('../data/raw_data/breeze/thermal_drift_data/bos/trackpoints.csv', chunksize=1_000_000)

out_file = '../data/raw_data/breeze/thermal_drift_data/bos/trackpoints.parquet'

writer = None
for chunk in reader:
    table = pa.Table.from_pandas(chunk)
    if writer is None:
        # Create writer with schema from first chunk
        writer = pq.ParquetWriter(out_file,
                                  schema=table.schema,
                                  compression='snappy')
    writer.write_table(table)

# don’t forget to close!
if writer:
    writer.close()


In [3]:
# Read just the first few rows and inspect
df = pd.read_parquet(
    '../data/raw_data/breeze/thermal_drift_data/bos/trackpoints.parquet',
    columns=['tripUuid','time','latitude','longitude','elevation']
)
print(df.head())
print(df.dtypes)
print(f"Total rows: {len(df)}")

                           tripUuid                              time  \
0  032c491ed5233848d66b45715a0cf40d  2014-10-04T19:20:15.143000+00:00   
1  032c491ed5233848d66b45715a0cf40d  2014-10-04T19:20:30.286000+00:00   
2  032c491ed5233848d66b45715a0cf40d  2014-10-04T19:20:45.429000+00:00   
3  032c491ed5233848d66b45715a0cf40d  2014-10-04T19:20:57.714000+00:00   
4  032c491ed5233848d66b45715a0cf40d  2014-10-04T19:20:57.736000+00:00   

    latitude  longitude  elevation  
0  42.357646 -71.058108   0.000000  
1  42.357820 -71.058011   0.000000  
2  42.358055 -71.057927   0.000000  
3  42.358525 -71.058137  23.110687  
4  42.358526 -71.058160  23.101101  
tripUuid      object
time          object
latitude     float64
longitude    float64
elevation    float64
dtype: object
Total rows: 125500600


# Filter trajectories intersecting/ within Back Bay

## Boston Neighbourhoods to Back Bay only

In [9]:
# load the Back Bay polygon
nb = gpd.read_file('../data/raw_data/bos/boston_neighborhood_boundaries.json')\
       .to_crs('EPSG:4326')    # make sure it matches your track CRS
back_bay = nb[nb['name']=='Back Bay']
back_bay.to_file('../data/raw_data/bos/back_bay.json', driver='GEOJSON')

Unnamed: 0,OBJECTID,name,acres,neighborhood_id,sqmiles,Shape_Length,Shape_Area,shape_wkt,geometry
10,11,Back Bay,399.314411,2,0.62,0.063374,0.000177,,"POLYGON ((-71.07569 42.35691, -71.07571 42.356..."


## All trajectories - filtered by Back Bay relation

In [2]:
# 1️⃣ Load your full table (already done):
df = pd.read_parquet(
    '../data/raw_data/breeze/thermal_drift_data/bos/trackpoints.parquet',
    columns=['tripUuid','time','latitude','longitude','elevation']
)

# 2️⃣ Force lat/lon into floats
df['latitude']  = pd.to_numeric(df['latitude'],  errors='coerce')
df['longitude'] = pd.to_numeric(df['longitude'], errors='coerce')

# Drop any rows where conversion failed
df = df.dropna(subset=['latitude','longitude'])

# 3️⃣ Read your Back Bay polygon and get its bounding box
poly = gpd.read_file('../data/raw_data/bos/back_bay.json').to_crs('EPSG:4326')
bounds = poly.total_bounds  
print("total_bounds:", bounds, type(bounds), bounds.dtype)
# e.g. array([ -71.06    ,  42.35    ,  -71.05    ,  42.36    ])
poly = poly.geometry.iloc[0]
minx, miny, maxx, maxy = bounds  # these will be floats
# Just to be safe, you can cast explicitly:
minx, miny, maxx, maxy = map(float, (minx, miny, maxx, maxy))

# 4️⃣ Cheap vectorized bbox cut
df_bbox = df[
    (df.longitude >= minx) & (df.longitude <= maxx) &
    (df.latitude  >= miny) & (df.latitude  <= maxy)
]
print(f"After bbox filter: {len(df_bbox)} points (down from {len(df)})")

# 5️⃣ Precise within-poly filter on only that slice
gdf_bbox = gpd.GeoDataFrame(
    df_bbox,
    geometry=gpd.points_from_xy(df_bbox.longitude, df_bbox.latitude),
    crs='EPSG:4326'
)

inside_mask = gdf_bbox.geometry.within(poly)
gdf_backbay = gdf_bbox.loc[inside_mask].copy()
print(f"{len(gdf_backbay)} points truly inside Back Bay")


# 5️⃣ (Optional) Extract tripUuids and then pull full trajectories
backbay_trip_ids = gdf_backbay['tripUuid'].unique()
df_backbay_full = df[df.tripUuid.isin(backbay_trip_ids)]
print(f"{len(backbay_trip_ids)} trajectories intersect Back Bay, total points: {len(df_backbay_full)}")

# 6️⃣ Save out the subset
df_backbay_full.to_parquet('../data/raw_data/breeze/thermal_drift_data/bos/back_bay_trackpoints.parquet', index=False)

total_bounds: [-71.09166804  42.34152804 -71.06989839  42.35697693] <class 'numpy.ndarray'> float64
After bbox filter: 16635659 points (down from 125500600)
12239655 points truly inside Back Bay
37678 trajectories intersect Back Bay, total points: 23371599


## Trips completely within Back Bay

In [3]:
inside_counts = gdf_backbay.groupby('tripUuid').size().rename('inside_count')

# 4️⃣ Compute total counts of points for those trips
total_counts  = df[df.tripUuid.isin(inside_counts.index)] \
                   .groupby('tripUuid').size() \
                   .rename('total_count')

# 5️⃣ Join and select exclusively‐inside trips
counts = pd.concat([total_counts, inside_counts], axis=1).fillna(0)
exclusive_ids = counts.loc[counts.total_count == counts.inside_count].index.tolist()
df_backbay_exclusive = df[df.tripUuid.isin(exclusive_ids)]
print(f"{len(exclusive_ids)} trajectories are completely within Back Bay, total points: {len(df_backbay_exclusive)}")

df_backbay_exclusive.to_parquet('../data/raw_data/breeze/thermal_drift_data/bos/back_bay_exclusive_trackpoints.parquet', index=False)


16596 trajectories are completely within Back Bay, total points: 5398412


In [4]:
df.head()

Unnamed: 0,tripUuid,time,latitude,longitude,elevation
0,032c491ed5233848d66b45715a0cf40d,2014-10-04T19:20:15.143000+00:00,42.357646,-71.058108,0.0
1,032c491ed5233848d66b45715a0cf40d,2014-10-04T19:20:30.286000+00:00,42.35782,-71.058011,0.0
2,032c491ed5233848d66b45715a0cf40d,2014-10-04T19:20:45.429000+00:00,42.358055,-71.057927,0.0
3,032c491ed5233848d66b45715a0cf40d,2014-10-04T19:20:57.714000+00:00,42.358525,-71.058137,23.110687
4,032c491ed5233848d66b45715a0cf40d,2014-10-04T19:20:57.736000+00:00,42.358526,-71.05816,23.101101


In [9]:
# 1. Read just the time column
df = pd.read_parquet(
    '../data/raw_data/breeze/thermal_drift_data/bos/trackpoints.parquet',
    columns=['time']
)


In [None]:
# 2. Ensure it’s a proper datetime type
df['time'] = pd.to_datetime(df['time'], 
                            utc=True, 
                            errors='coerce', # turn unparsables into NaT
                            )


# 3. Inspect how many failed
n_invalid = times['time_parsed'].isna().sum()
print(f"{n_invalid} rows failed to parse and became NaT")

# 4. Drop the bad ones before analysis
valid_times = times['time_parsed'].dropna()

# 5. Find the overall range
min_time = df['time'].min()
max_time = df['time'].max()
print(f"Data spans from {min_time} to {max_time}")

# 6. Which years are present?
years = df['time'].dt.year.unique()
print("Years in dataset:", sorted(years))

# 7. (Optional) Count per year
year_counts = df['time'].dt.year.value_counts().sort_index()
print("\nCounts per year:\n", year_counts)

NameError: name 'times' is not defined

In [11]:
# 3. Inspect how many failed
n_invalid = df['time'].isna().sum()
print(f"{n_invalid} rows failed to parse and became NaT")

# 4. Drop the bad ones before analysis
valid_times = df['time'].dropna()

# 5. Find the overall range
min_time = df['time'].min()
max_time = df['time'].max()
print(f"Data spans from {min_time} to {max_time}")

# 6. Which years are present?
years = df['time'].dt.year.unique()
print("Years in dataset:", sorted(years))

# 7. (Optional) Count per year
year_counts = df['time'].dt.year.value_counts().sort_index()
print("\nCounts per year:\n", year_counts)

69307 rows failed to parse and became NaT
Data spans from 2014-05-19 15:55:22.861000+00:00 to 2015-05-05 00:18:23.291000+00:00
Years in dataset: [np.float64(2014.0), np.float64(2015.0), np.float64(nan)]

Counts per year:
 time
2014.0    80123583
2015.0    45307710
Name: count, dtype: int64
