# Dataset Exploration: Food-Only Georgia Restaurants

This notebook explores the processed Parquet datasets:
- `biz_ga.parquet` - Food businesses in Georgia
- `reviews_ga.parquet` - Reviews for food businesses
- `user_sequences_ga.parquet` - User visit sequences
- `pairs_ga.parquet` - Consecutive visit pairs

In [33]:
import polars as pl
import plotly.express as px
import plotly.graph_objects as go
from pathlib import Path
import numpy as np

# Set up paths
base_dir = Path.cwd().parent
data_dir = base_dir / 'data' / 'processed' / 'ga'

print("✓ Imports successful")
print(f"Data directory: {data_dir}")

✓ Imports successful
Data directory: /Users/istantheman/Forkast/data/processed/ga


## 1. Business Data (`biz_ga.parquet`)

Food-only businesses in Georgia with normalized categories.

In [34]:
# Load business data
biz_df = pl.read_parquet(data_dir / 'biz_ga.parquet')

print(f"Total businesses: {len(biz_df):,}")
print(f"\nSchema:")
print(biz_df.schema)
print(f"\nFirst 5 rows:")
biz_df.head()

Total businesses: 27,710

Schema:
Schema({'gmap_id': String, 'name': String, 'lat': Float32, 'lon': Float32, 'category_main': String, 'category_all': List(String), 'avg_rating': Float32, 'num_reviews': Int32, 'price_bucket': Int8, 'is_closed': Boolean, 'relative_results': List(String)})

First 5 rows:


gmap_id,name,lat,lon,category_main,category_all,avg_rating,num_reviews,price_bucket,is_closed,relative_results
str,str,f32,f32,str,list[str],f32,i32,i8,bool,list[str]
"""0x88f9d2c00c43480b:0x4e2b2546e…","""Circle K""",33.521141,-82.062691,"""coffee""","[""other"", ""coffee""]",3.0,6,1.0,False,"[""0x88f9d2c00c639557:0x203210cfbe61539b"", ""0x88f9d1af6f8d44c1:0x292c8b9b748acb84"", … ""0x88f9d25aa8b108bd:0xf67d3e730aff6d47""]"
"""0x888b57f0dde8f1ed:0xb1dfe7591…","""Bull hibachi 2""",33.031227,-85.056335,"""sushi""","[""sushi"", ""asian""]",4.7,86,,True,"[""0x888b5524187569db:0x188aae4569966c51"", ""0x888b569baac8d037:0x1bba6f11b8e74341"", … ""0x888ca40b7f255eb1:0x3df3f7f41f9be385""]"
"""0x88f5455bb858bd7b:0x511289693…","""Shoney's""",34.204128,-84.766693,"""burger""","[""american"", ""breakfast"", … ""restaurant""]",3.7,1278,1.0,False,"[""0x88f5455b223d26a7:0x586b1cd28c6a4ede"", ""0x88f54f70b010bf73:0x289d39ea3dd27f90"", … ""0x88f54f7752f4497f:0x1ac3376e7db04a9d""]"
"""0x88f5a18b426ebb41:0x29ed65565…","""CMX CinéBistro Peachtree Corne…",33.978939,-84.213974,"""bar""","[""other"", ""bar"", ""restaurant""]",4.2,408,,False,"[""0x88f50a9bf50e45ad:0x737268d3af7ada4b"", ""0x88f5a7e04ca4d117:0xd461e9e5ffb5a3fd"", … ""0x88f5751666d7caa1:0x2ed992fc71c424d9""]"
"""0x88f4f94de304903b:0xbc0a671f1…","""Subway""",33.572979,-84.342773,"""fast_food""","[""fast_food"", ""restaurant"", ""other""]",3.6,94,1.0,True,"[""0x88f4fdd702b38e75:0x175777d2021cf484"", ""0x88f4fbfdc0368501:0x27c4c3a6cdaace48"", … ""0x88f4feb137f4bb21:0x82ed888b45162774""]"


In [35]:
# Business statistics
print("BUSINESS STATISTICS")
print("=" * 60)
print(f"Total businesses: {len(biz_df):,}")
print(f"Closed businesses: {biz_df['is_closed'].sum():,}")
print(f"Average rating: {biz_df['avg_rating'].mean():.2f}")
print(f"Total reviews (from metadata): {biz_df['num_reviews'].sum():,}")
print(f"\nPrice distribution:")
price_dist = biz_df.group_by('price_bucket').agg(pl.len().alias('count')).sort('price_bucket')
for row in price_dist.iter_rows():
    # Handle None/null values
    if row[0] is None or row[0] == 0:
        price_label = 'Unknown'
    else:
        price_label = '$' * row[0]
    print(f"  {price_label:10s}: {row[1]:,}")

BUSINESS STATISTICS
Total businesses: 27,710
Closed businesses: 12,044
Average rating: 4.13
Total reviews (from metadata): 10,500,219

Price distribution:
  Unknown   : 8,481
  $         : 12,035
  $$        : 6,901
  $$$       : 252
  $$$$      : 41


In [36]:
# Category distribution
cat_dist = biz_df.group_by('category_main').agg(pl.len().alias('count')).sort('count', descending=True)
cat_dist_pd = cat_dist.to_pandas()

# Bar chart
fig = px.bar(cat_dist_pd, x='category_main', y='count',
             title='Business Category Distribution (All 24 Food Categories)',
             labels={'category_main': 'Category', 'count': 'Number of Businesses'})
fig.update_xaxes(tickangle=45)
fig.show()

# Print top 15
print("\nTop 15 Categories:")
for i, row in enumerate(cat_dist.head(15).iter_rows(), 1):
    pct = row[1] / len(biz_df) * 100
    print(f"{i:2d}. {row[0]:20s}: {row[1]:6,} ({pct:5.1f}%)")


Top 15 Categories:
 1. restaurant          :  4,490 ( 16.2%)
 2. fast_food           :  3,396 ( 12.3%)
 3. american            :  2,564 (  9.3%)
 4. mexican             :  2,148 (  7.8%)
 5. pizza               :  2,124 (  7.7%)
 6. burger              :  2,066 (  7.5%)
 7. breakfast           :  1,260 (  4.5%)
 8. bar                 :  1,110 (  4.0%)
 9. seafood             :  1,004 (  3.6%)
10. chinese             :    994 (  3.6%)
11. bbq                 :    890 (  3.2%)
12. bakery              :    858 (  3.1%)
13. asian               :    782 (  2.8%)
14. coffee              :    732 (  2.6%)
15. sushi               :    730 (  2.6%)


In [37]:
# Geographic distribution (sample for performance)
sample_size = min(5000, len(biz_df))
biz_sample = biz_df.sample(n=sample_size).to_pandas()

fig = px.scatter_mapbox(
    biz_sample, 
    lat='lat', 
    lon='lon',
    color='category_main',
    hover_name='name',
    hover_data=['avg_rating', 'num_reviews'],
    title=f'Business Locations in Georgia (Sample of {sample_size:,})',
    zoom=6,
    height=600
)
fig.update_layout(mapbox_style='open-street-map')
fig.show()


*scatter_mapbox* is deprecated! Use *scatter_map* instead. Learn more at: https://plotly.com/python/mapbox-to-maplibre/



## 2. Reviews Data (`reviews_ga.parquet`)

All reviews for food businesses only.

In [38]:
# Load reviews (sample for speed)
reviews_df = pl.read_parquet(data_dir / 'reviews_ga.parquet')

print(f"Total reviews: {len(reviews_df):,}")
print(f"\nSchema:")
print(reviews_df.schema)
print(f"\nFirst 5 rows:")
reviews_df.head(5)

Total reviews: 10,339,035

Schema:
Schema({'user_id': String, 'gmap_id': String, 'ts': Datetime(time_unit='us', time_zone=None), 'rating': Int8, 'text': String, 'has_pics': Boolean, 'has_resp': Boolean})

First 5 rows:


user_id,gmap_id,ts,rating,text,has_pics,has_resp
str,str,datetime[μs],i8,str,bool,bool
"""114375629507315554651""","""0x88f5a393fc16aa3b:0xfbed02608…",2016-11-11 21:13:56,5,,False,False
"""114367894567086897658""","""0x88f4e39287d30e4d:0x3e9a8ef8e…",2018-07-06 07:31:33,5,"""Food and customer service was …",False,True
"""106903601813517640261""","""0x88f27c4586ef0f9f:0x4db24ff2f…",2019-11-26 06:55:11,5,"""Love love love chick fil a""",False,False
"""110284863507020939770""","""0x885ffe6a3a61f22b:0xb39fa62d2…",2018-12-01 03:24:08,4,,False,False
"""107745123286042334379""","""0x88f504ebf04de0f3:0x273c842ec…",2016-08-22 19:25:15,5,"""Amazing variety, kind people, …",True,False


In [51]:
# Review statistics
print("REVIEW STATISTICS")
print("=" * 60)
print(f"Total reviews: {len(reviews_df):,}")
print(f"Unique users: {reviews_df['user_id'].n_unique():,}")
print(f"Unique businesses: {reviews_df['gmap_id'].n_unique():,}")
print(f"Reviews with pics: {reviews_df['has_pics'].sum():,}")
print(f"Reviews with responses: {reviews_df['has_resp'].sum():,}")
print(f"Average rating: {reviews_df['rating'].mean():.2f}")
print(f"\nTemporal range:")
print(f"  Earliest: {reviews_df['ts'].min()}")
print(f"  Latest: {reviews_df['ts'].max()}")

REVIEW STATISTICS
Total reviews: 10,339,035
Unique users: 2,546,362
Unique businesses: 27,710
Reviews with pics: 268,688
Reviews with responses: 1,152,839
Average rating: 4.11

Temporal range:
  Earliest: 2001-01-06 00:00:00
  Latest: 2021-09-08 01:43:37


In [40]:
# Rating distribution
rating_dist = reviews_df.group_by('rating').agg(pl.len().alias('count')).sort('rating')
rating_dist_pd = rating_dist.to_pandas()

fig = px.bar(rating_dist_pd, x='rating', y='count',
             title='Review Rating Distribution',
             labels={'rating': 'Rating (1-5 stars)', 'count': 'Number of Reviews'})
fig.show()

# Print distribution
print("Rating Distribution:")
for row in rating_dist.iter_rows():
    pct = row[1] / len(reviews_df) * 100
    print(f"  {row[0]} stars: {row[1]:,} ({pct:.1f}%)")

Rating Distribution:
  1 stars: 907,657 (8.8%)
  2 stars: 468,694 (4.5%)
  3 stars: 1,009,706 (9.8%)
  4 stars: 2,141,839 (20.7%)
  5 stars: 5,811,139 (56.2%)


In [41]:
# Temporal distribution
reviews_by_year = reviews_df.with_columns([
    pl.col('ts').dt.year().alias('year')
]).group_by('year').agg(pl.len().alias('count')).sort('year')

reviews_by_year_pd = reviews_by_year.to_pandas()

fig = px.line(reviews_by_year_pd, x='year', y='count',
              title='Reviews Over Time',
              labels={'year': 'Year', 'count': 'Number of Reviews'},
              markers=True)
fig.show()

## 3. User Sequences (`user_sequences_ga.parquet`)

Individual visits in chronological order for each user.

In [52]:
# Load sequences
sequences_df = pl.read_parquet(data_dir / 'user_sequences_ga.parquet')

print(f"Total visits: {len(sequences_df):,}")
print(f"\nSchema:")
print(sequences_df.schema)
print(f"\nFirst 10 rows (showing a user's journey):")
sequences_df.head(10)

Total visits: 10,339,035

Schema:
Schema({'user_id': String, 'seq_idx': UInt32, 'gmap_id': String, 'ts': Datetime(time_unit='us', time_zone=None), 'category_main': String, 'lat': Float32, 'lon': Float32, 'rating': Int8})

First 10 rows (showing a user's journey):


user_id,seq_idx,gmap_id,ts,category_main,lat,lon,rating
str,u32,str,datetime[μs],str,f32,f32,i8
"""100000007134886560887""",1,"""0x88f5758473a8bc37:0x9367635ab…",2019-05-14 02:00:43,"""burger""",34.070248,-84.276176,5
"""100000007134886560887""",2,"""0x88f5759cc6eaf6a9:0x16a87fcb6…",2019-05-14 02:01:53,"""restaurant""",34.068447,-84.281174,5
"""100000007134886560887""",3,"""0x88f59b65e7b2e153:0x208b4ad7b…",2019-05-14 02:02:41,"""mexican""",34.164898,-84.176422,5
"""100000020958895295779""",1,"""0x886069e02d76edc3:0x3edd94721…",2019-01-18 08:54:52,"""burger""",34.882252,-85.271133,4
"""100000020958895295779""",2,"""0x886068a3bba52f37:0x7a7c672c1…",2019-01-18 08:55:31,"""burger""",34.936829,-85.207268,5
"""100000020958895295779""",3,"""0x8860662a73537617:0x555a12533…",2019-01-18 08:57:22,"""burger""",34.945683,-85.226974,5
"""100000020958895295779""",4,"""0x88606899517f7c39:0x33b774df5…",2019-06-13 16:46:52,"""mexican""",34.938271,-85.214363,4
"""100000020958895295779""",5,"""0x88f4e9c68ee5c59d:0x1e1d13eba…",2019-06-13 16:51:20,"""steakhouse""",33.440693,-84.589668,5
"""100000020958895295779""",6,"""0x88f540feacdfed3f:0x18f90c679…",2019-06-13 16:53:29,"""burger""",34.076595,-84.653023,5
"""100000020958895295779""",7,"""0x88606a5a9dc69973:0x542f5d2fc…",2019-07-29 11:50:08,"""fast_food""",34.823025,-85.241394,5


In [43]:
# Sequence statistics
seq_lengths = sequences_df.group_by('user_id').agg(pl.len().alias('seq_length'))

print("SEQUENCE STATISTICS")
print("=" * 60)
print(f"Total visits: {len(sequences_df):,}")
print(f"Unique users: {sequences_df['user_id'].n_unique():,}")
print(f"Unique businesses: {sequences_df['gmap_id'].n_unique():,}")
print(f"\nSequence length distribution:")
print(f"  Mean: {seq_lengths['seq_length'].mean():.1f}")
print(f"  Median: {seq_lengths['seq_length'].median():.0f}")
print(f"  Max: {seq_lengths['seq_length'].max()}")
print(f"  Users with 2+ visits: {(seq_lengths['seq_length'] > 1).sum():,}")

SEQUENCE STATISTICS
Total visits: 10,339,035
Unique users: 2,546,362
Unique businesses: 27,710

Sequence length distribution:
  Mean: 4.1
  Median: 1
  Max: 648
  Users with 2+ visits: 1,135,876


In [58]:
# Show example user journeys
print("EXAMPLE USER JOURNEYS")
print("=" * 80)

# Find users with 5-10 visits
users_with_visits = seq_lengths.filter(
    (pl.col('seq_length') >= 5)
).head(3)

# users_with_visits = sequences_df.head(3)

for user_id in users_with_visits['user_id'].to_list():
    user_seq = sequences_df.filter(pl.col('user_id') == user_id).sort('seq_idx')
    print(f"\nUser: {user_id}")
    print(f"Total visits: {len(user_seq)}")
    print("Journey:")
    for row in user_seq.iter_rows(named=True):
        print(f"  {row['seq_idx']}. {row['ts'].strftime('%Y-%m-%d %H:%M')} - "
              f"{row['category_main']:15s} (rating: {row['rating']})")
    print("-" * 80)

EXAMPLE USER JOURNEYS

User: 107276180193161971513
Total visits: 31
Journey:
  1. 2018-07-22 15:15 - mexican         (rating: 4)
  2. 2018-09-19 03:44 - burger          (rating: 3)
  3. 2018-12-19 03:34 - restaurant      (rating: 1)
  4. 2019-01-13 23:49 - burger          (rating: 2)
  5. 2019-01-13 23:49 - american        (rating: 5)
  6. 2019-01-13 23:49 - seafood         (rating: 3)
  7. 2019-01-13 23:49 - american        (rating: 3)
  8. 2019-01-13 23:50 - fast_food       (rating: 4)
  9. 2019-08-23 00:07 - bbq             (rating: 4)
  10. 2019-08-23 00:08 - american        (rating: 3)
  11. 2019-08-23 00:08 - restaurant      (rating: 4)
  12. 2019-08-23 00:08 - seafood         (rating: 4)
  13. 2019-08-23 00:08 - seafood         (rating: 5)
  14. 2019-08-23 00:09 - seafood         (rating: 5)
  15. 2019-08-23 00:09 - steakhouse      (rating: 4)
  16. 2019-08-23 00:10 - restaurant      (rating: 4)
  17. 2019-08-23 00:10 - mexican         (rating: 4)
  18. 2019-08-23 00:10 - bbq   

## 4. Consecutive Pairs (`pairs_ga.parquet`)

Visit transitions (src → dst) within 7-day window.

In [45]:
# Load pairs
pairs_df = pl.read_parquet(data_dir / 'pairs_ga.parquet')

print(f"Total pairs: {len(pairs_df):,}")
print(f"\nSchema:")
print(pairs_df.schema)
print(f"\nFirst 5 pairs:")
pairs_df.head()

Total pairs: 4,152,155

Schema:
Schema({'user_id': String, 'src_gmap_id': String, 'dst_gmap_id': String, 'src_ts': Datetime(time_unit='us', time_zone=None), 'dst_ts': Datetime(time_unit='us', time_zone=None), 'delta_hours': Float64, 'src_category_main': String, 'dst_category_main': String, 'src_lat': Float32, 'src_lon': Float32, 'dst_lat': Float32, 'dst_lon': Float32, 'src_rating': Int8, 'dst_rating': Int8})

First 5 pairs:


user_id,src_gmap_id,dst_gmap_id,src_ts,dst_ts,delta_hours,src_category_main,dst_category_main,src_lat,src_lon,dst_lat,dst_lon,src_rating,dst_rating
str,str,str,datetime[μs],datetime[μs],f64,str,str,f32,f32,f32,f32,i8,i8
"""100000007134886560887""","""0x88f5758473a8bc37:0x9367635ab…","""0x88f5759cc6eaf6a9:0x16a87fcb6…",2019-05-14 02:00:43,2019-05-14 02:01:53,0.019444,"""burger""","""restaurant""",34.070248,-84.276176,34.068447,-84.281174,5,5
"""100000007134886560887""","""0x88f5759cc6eaf6a9:0x16a87fcb6…","""0x88f59b65e7b2e153:0x208b4ad7b…",2019-05-14 02:01:53,2019-05-14 02:02:41,0.013333,"""restaurant""","""mexican""",34.068447,-84.281174,34.164898,-84.176422,5,5
"""100000020958895295779""","""0x886069e02d76edc3:0x3edd94721…","""0x886068a3bba52f37:0x7a7c672c1…",2019-01-18 08:54:52,2019-01-18 08:55:31,0.010833,"""burger""","""burger""",34.882252,-85.271133,34.936829,-85.207268,4,5
"""100000020958895295779""","""0x886068a3bba52f37:0x7a7c672c1…","""0x8860662a73537617:0x555a12533…",2019-01-18 08:55:31,2019-01-18 08:57:22,0.030833,"""burger""","""burger""",34.936829,-85.207268,34.945683,-85.226974,5,5
"""100000020958895295779""","""0x88606899517f7c39:0x33b774df5…","""0x88f4e9c68ee5c59d:0x1e1d13eba…",2019-06-13 16:46:52,2019-06-13 16:51:20,0.074444,"""mexican""","""steakhouse""",34.938271,-85.214363,33.440693,-84.589668,4,5


In [46]:
# Pair statistics
print("PAIR STATISTICS")
print("=" * 60)
print(f"Total pairs: {len(pairs_df):,}")
print(f"Unique users: {pairs_df['user_id'].n_unique():,}")
print(f"Unique source businesses: {pairs_df['src_gmap_id'].n_unique():,}")
print(f"Unique destination businesses: {pairs_df['dst_gmap_id'].n_unique():,}")
print(f"\nTime delta (hours):")
print(f"  Mean: {pairs_df['delta_hours'].mean():.1f}")
print(f"  Median: {pairs_df['delta_hours'].median():.1f}")
print(f"  Min: {pairs_df['delta_hours'].min():.2f}")
print(f"  Max: {pairs_df['delta_hours'].max():.1f}")

PAIR STATISTICS
Total pairs: 4,152,155
Unique users: 744,680
Unique source businesses: 26,893
Unique destination businesses: 26,891

Time delta (hours):
  Mean: 16.5
  Median: 0.0
  Min: 0.00
  Max: 168.0


In [59]:
# Time delta distribution
time_bins = pairs_df.with_columns([
    pl.when(pl.col('delta_hours') < 1).then(pl.lit('< 1 hour'))
      .when(pl.col('delta_hours') < 6).then(pl.lit('1-6 hours'))
      .when(pl.col('delta_hours') < 24).then(pl.lit('6-24 hours'))
      .when(pl.col('delta_hours') < 72).then(pl.lit('1-3 days'))
      .otherwise(pl.lit('3-7 days'))
      .alias('time_bin')
]).group_by('time_bin').agg(pl.len().alias('count'))

print("\nTime gap distribution:")
for row in time_bins.iter_rows():
    pct = row[1] / len(pairs_df) * 100
    print(f"  {row[0]:15s}: {row[1]:,} ({pct:.1f}%)")


Time gap distribution:
  < 1 hour       : 3,130,231 (75.4%)
  1-3 days       : 376,796 (9.1%)
  3-7 days       : 424,531 (10.2%)
  1-6 hours      : 69,335 (1.7%)
  6-24 hours     : 151,262 (3.6%)


In [60]:
# Category transitions
cat_trans = pairs_df.group_by(['src_category_main', 'dst_category_main']).agg(
    pl.len().alias('count')
).sort('count', descending=True)

print("\nTop 20 Category Transitions:")
print("=" * 60)
for i, row in enumerate(cat_trans.head(20).iter_rows(), 1):
    print(f"{i:2d}. {row[0]:15s} → {row[1]:15s}: {row[2]:,}")


Top 20 Category Transitions:
 1. burger          → burger         : 159,040
 2. burger          → american       : 102,698
 3. american        → american       : 102,406
 4. american        → burger         : 101,939
 5. burger          → fast_food      : 94,720
 6. fast_food       → burger         : 76,093
 7. fast_food       → fast_food      : 72,134
 8. burger          → mexican        : 63,428
 9. mexican         → burger         : 63,043
10. american        → fast_food      : 59,158
11. mexican         → american       : 55,285
12. american        → mexican        : 54,902
13. burger          → breakfast      : 50,912
14. fast_food       → american       : 49,923
15. american        → restaurant     : 45,145
16. breakfast       → burger         : 44,436
17. american        → breakfast      : 44,154
18. seafood         → burger         : 42,552
19. burger          → seafood        : 42,450
20. mexican         → mexican        : 42,015


In [49]:
# Create transition matrix for top categories
top_cats = cat_dist.head(10)['category_main'].to_list()

# Filter to top categories
top_trans = cat_trans.filter(
    pl.col('src_category_main').is_in(top_cats) &
    pl.col('dst_category_main').is_in(top_cats)
)

# Create matrix
matrix = np.zeros((len(top_cats), len(top_cats)))
for row in top_trans.iter_rows():
    src_idx = top_cats.index(row[0])
    dst_idx = top_cats.index(row[1])
    matrix[src_idx, dst_idx] = row[2]

# Plot heatmap
fig = go.Figure(data=go.Heatmap(
    z=matrix,
    x=top_cats,
    y=top_cats,
    colorscale='Blues',
    text=matrix.astype(int),
    texttemplate='%{text}',
    textfont={"size": 10}
))

fig.update_layout(
    title='Category Transition Heatmap (Top 10 Categories)',
    xaxis_title='Destination Category',
    yaxis_title='Source Category',
    height=600
)
fig.show()

## 5. Summary & Insights

In [50]:
print("=" * 80)
print("DATASET SUMMARY")
print("=" * 80)
print(f"\n📊 BUSINESSES: {len(biz_df):,}")
print(f"   - All food-related (0 'other' category)")
print(f"   - Top category: {cat_dist[0, 'category_main']} ({cat_dist[0, 'count']:,})")
print(f"   - Average rating: {biz_df['avg_rating'].mean():.2f}/5.0")

print(f"\n📝 REVIEWS: {len(reviews_df):,}")
print(f"   - From {reviews_df['user_id'].n_unique():,} unique users")
print(f"   - Average rating: {reviews_df['rating'].mean():.2f}/5.0")
print(f"   - Time span: 2001-2021 (20 years)")

print(f"\n👥 USER SEQUENCES: {len(sequences_df):,} visits")
print(f"   - {sequences_df['user_id'].n_unique():,} users")
print(f"   - Average {seq_lengths['seq_length'].mean():.1f} visits per user")
print(f"   - {(seq_lengths['seq_length'] > 1).sum():,} users with multiple visits")

print(f"\n🔗 CONSECUTIVE PAIRS: {len(pairs_df):,}")
print(f"   - All food-to-food transitions")
print(f"   - Average time gap: {pairs_df['delta_hours'].mean():.1f} hours")
print(f"   - Top transition: {cat_trans[0, 'src_category_main']} → {cat_trans[0, 'dst_category_main']} ({cat_trans[0, 'count']:,})")

print(f"\n✅ READY FOR:")
print(f"   - Phase A3: Feature Engineering")
print(f"   - Phase B1: XGBoost Training")
print(f"   - Phase B2: LSTM Training")
print("=" * 80)

DATASET SUMMARY

📊 BUSINESSES: 27,710
   - All food-related (0 'other' category)
   - Top category: restaurant (4,490)
   - Average rating: 4.13/5.0

📝 REVIEWS: 10,339,035
   - From 2,546,362 unique users
   - Average rating: 4.11/5.0
   - Time span: 2001-2021 (20 years)

👥 USER SEQUENCES: 10,339,035 visits
   - 2,546,362 users
   - Average 4.1 visits per user
   - 1,135,876 users with multiple visits

🔗 CONSECUTIVE PAIRS: 4,152,155
   - All food-to-food transitions
   - Average time gap: 16.5 hours
   - Top transition: burger → burger (159,040)

✅ READY FOR:
   - Phase A3: Feature Engineering
   - Phase B1: XGBoost Training
   - Phase B2: LSTM Training
