In [49]:
import json
import sqlite3
import pandas as pd

In [50]:
# Input / output locations used by the rest of the notebook (relative paths).
JSON_FILE = "data/review.json"        # raw reviews; read line by line, one JSON object per line
SQLITE_DB = "data/reviews_sample.db"  # SQLite database file the notebook writes to

In [51]:
# The input is read as JSON Lines: every line of the file is parsed as one
# standalone JSON document and becomes one row of the raw frame.
with open(JSON_FILE, "r", encoding="utf-8") as source:
    records = [json.loads(raw_line) for raw_line in source]

df = pd.DataFrame(records)
print(f"Loaded {len(df)} raw records")

Loaded 878561 raw records


In [53]:
# Preview the first rows of the raw frame (nested `ratings` / `author` dicts are still unflattened here).
df.head()

Unnamed: 0,ratings,title,text,author,date_stayed,offering_id,num_helpful_votes,date,id,via_mobile
0,"{'service': 5.0, 'cleanliness': 5.0, 'overall'...","“Truly is ""Jewel of the Upper Wets Side""”",Stayed in a king suite for 11 nights and yes i...,"{'username': 'Papa_Panda', 'num_cities': 22, '...",December 2012,93338,0,"December 17, 2012",147643103,False
1,"{'service': 5.0, 'cleanliness': 5.0, 'overall'...",“My home away from home!”,"On every visit to NYC, the Hotel Beacon is the...","{'username': 'Maureen V', 'num_reviews': 2, 'n...",December 2012,93338,0,"December 17, 2012",147639004,False
2,"{'service': 4.0, 'cleanliness': 5.0, 'overall'...",“Great Stay”,This is a great property in Midtown. We two di...,"{'username': 'vuguru', 'num_cities': 12, 'num_...",December 2012,1762573,0,"December 18, 2012",147697954,False
3,"{'service': 5.0, 'cleanliness': 5.0, 'overall'...",“Modern Convenience”,The Andaz is a nice hotel in a central locatio...,"{'username': 'Hotel-Designer', 'num_cities': 5...",August 2012,1762573,0,"December 17, 2012",147625723,False
4,"{'service': 4.0, 'cleanliness': 5.0, 'overall'...",“Its the best of the Andaz Brand in the US....”,I have stayed at each of the US Andaz properti...,"{'username': 'JamesE339', 'num_cities': 34, 'n...",December 2012,1762573,0,"December 17, 2012",147612823,False


In [54]:
# Parse the free-text `date` column; values that cannot be parsed become NaT
# (errors='coerce') and are dropped by the comparison below, since NaT >= x is False.
df['review_date'] = pd.to_datetime(df['date'], errors='coerce')

# Keep only the five years leading up to the most recent review in the data.
max_date = df['review_date'].max()
cutoff_date = max_date - pd.DateOffset(years=5)

# .copy() makes the filtered frame independent of the original, so the column
# assignments in the following cells operate on a real frame rather than on a
# slice (avoids SettingWithCopyWarning and ambiguous write-through semantics).
df = df[df["review_date"] >= cutoff_date].copy()

print(f"After date filtering: {len(df)} reviews")

After date filtering: 754798 reviews


In [55]:
# Flatten the nested `author` dict into top-level columns.
# Maps new column name -> key looked up in each author dict; keys that are
# absent from a given dict yield None (dict.get default).
author_fields = {
    'author_id': 'id',
    'author_name': 'username',
    'author_location': 'location',
    'author_num_reviews': 'num_reviews',
    'author_num_cities': 'num_cities',
    'author_num_helpful_votes': 'num_helpful_votes',
    'author_num_type_reviews': 'num_type_reviews',
}
for column_name, author_key in author_fields.items():
    # Bind the key as a default argument so each lambda uses its own key.
    df[column_name] = df['author'].apply(lambda x, k=author_key: x.get(k))

In [56]:
# Flatten the nested `ratings` dict into one column per rating dimension.
# Maps new column name -> key in the ratings dict; only `location` is renamed
# (to `location_rating`). Missing keys yield None (dict.get default).
rating_fields = {
    "overall": "overall",
    "service": "service",
    "cleanliness": "cleanliness",
    "value": "value",
    "location_rating": "location",
    "sleep_quality": "sleep_quality",
    "rooms": "rooms",
}
for column_name, rating_key in rating_fields.items():
    # Bind the key as a default argument so each lambda uses its own key.
    df[column_name] = df["ratings"].apply(lambda x, k=rating_key: x.get(k))

In [57]:
# Author dimension: the flattened author columns, de-duplicated on the full
# combination of all seven columns (not on author_id alone).
author_columns = [
    'author_id',
    'author_name',
    'author_location',
    'author_num_reviews',
    'author_num_cities',
    'author_num_helpful_votes',
    'author_num_type_reviews',
]
author_df = df[author_columns].drop_duplicates()

In [58]:
# Hotel dimension: one row per distinct offering_id (the only hotel attribute kept).
hotels_df = df[['offering_id']].drop_duplicates()

In [59]:
# Review fact table: identifiers, the flattened rating columns, the review
# text fields and dates, de-duplicated on the full row.
# NOTE(review): `author_num_helpful_votes` is the author-level counter taken
# from the `author` dict, not the review's own `num_helpful_votes` column —
# confirm this is the intended field for the reviews table.
review_columns = [
    'id',
    'author_id',
    'offering_id',
    'overall',
    'service',
    'cleanliness',
    'value',
    'location_rating',
    'sleep_quality',
    'rooms',
    'title',
    'text',
    'review_date',
    'date_stayed',
    'via_mobile',
    'author_num_helpful_votes',
]
reviews_df = df[review_columns].drop_duplicates()

In [62]:
# Open (or create) the SQLite database file; the connection stays open across
# the following cells and is closed in the cell that creates the indexes.
conn = sqlite3.connect(SQLITE_DB)
# Cursor used for the multi-statement DDL scripts (executescript).
cursor = conn.cursor()

In [71]:
# Preview the flattened review rows before they are written to SQLite.
reviews_df.head()

Unnamed: 0,id,author_id,offering_id,overall,service,cleanliness,value,location_rating,sleep_quality,rooms,title,text,review_date,date_stayed,via_mobile,author_num_helpful_votes
0,147643103,8C0B42FF3C0FA366A21CFD785302A032,93338,5.0,5.0,5.0,5.0,5.0,5.0,5.0,"“Truly is ""Jewel of the Upper Wets Side""”",Stayed in a king suite for 11 nights and yes i...,2012-12-17,December 2012,False,12.0
1,147639004,E3C85CA9DBBBC77E0DB534ABE93E4713,93338,5.0,5.0,5.0,5.0,5.0,5.0,5.0,“My home away from home!”,"On every visit to NYC, the Hotel Beacon is the...",2012-12-17,December 2012,False,
2,147697954,FB1032DECE1162CB3556D05F278AAFFD,1762573,4.0,4.0,5.0,4.0,5.0,4.0,4.0,“Great Stay”,This is a great property in Midtown. We two di...,2012-12-18,December 2012,False,17.0
3,147625723,EC3E275EE7590694889C8C7EE0D13961,1762573,4.0,5.0,5.0,5.0,5.0,5.0,5.0,“Modern Convenience”,The Andaz is a nice hotel in a central locatio...,2012-12-17,August 2012,False,26.0
4,147612823,BA524A238B1171206691A6CC3F28F266,1762573,4.0,4.0,5.0,3.0,5.0,5.0,5.0,“Its the best of the Andaz Brand in the US....”,I have stayed at each of the US Andaz properti...,2012-12-17,December 2012,False,65.0


In [84]:
# Schema for the three tables. The script is destructive: existing tables are
# dropped and recreated, so re-running this cell resets the database contents.
#
# - reviews is dropped first because it holds the foreign keys into the other
#   two tables (child before parents).
# - authors.author_no is a surrogate key assigned by SQLite; the column name
#   must match the one referenced by reviews' FOREIGN KEY clause.
sql = """
DROP TABLE IF EXISTS reviews;
DROP TABLE IF EXISTS authors;
DROP TABLE IF EXISTS hotels;

CREATE TABLE authors (
author_no INTEGER PRIMARY KEY AUTOINCREMENT,
author_id TEXT,
author_name TEXT,
author_location TEXT,
author_num_reviews INTEGER,
author_num_cities INTEGER,
author_num_helpful_votes INTEGER,
author_num_type_reviews INTEGER
);

CREATE TABLE hotels (
offering_id INTEGER PRIMARY KEY
);

CREATE TABLE reviews (
id INTEGER PRIMARY KEY,
author_no INTEGER,
author_id TEXT,
offering_id INTEGER,
overall REAL,
service REAL,
cleanliness REAL,
value REAL,
location_rating REAL,
sleep_quality REAL,
rooms REAL,
title TEXT,
text TEXT,
review_date DATE,
date_stayed TEXT,
via_mobile BOOLEAN,
author_num_helpful_votes INTEGER,

FOREIGN KEY(author_no) REFERENCES authors(author_no),
FOREIGN KEY(offering_id) REFERENCES hotels(offering_id)
);
"""

cursor.executescript(sql)
conn.commit()

In [85]:
# Parents first: authors and hotels are referenced by reviews.
author_df.to_sql("authors", conn, if_exists="append", index=False)
hotels_df.to_sql("hotels", conn, if_exists="append", index=False)

# reviews.author_no is the foreign key into authors, but reviews_df only carries
# author_id, so inserting it as-is leaves author_no NULL on every row. Read back
# the key SQLite assigned to each author and attach it before inserting.
# `rowid` is used because it aliases the table's INTEGER PRIMARY KEY column.
author_keys = (
    pd.read_sql_query(
        "SELECT rowid AS author_no, author_id FROM authors ORDER BY rowid", conn
    )
    .dropna(subset=["author_id"])          # a missing id cannot identify an author
    .drop_duplicates(subset="author_id")   # keep the first row if an id repeats
    .set_index("author_id")["author_no"]
)

# Reviews whose author_id has no match keep a NULL author_no. The mapped column
# may be float (NaN for no match); SQLite's INTEGER column affinity stores
# whole-number values as integers.
reviews_to_store = reviews_df.assign(
    author_no=reviews_df["author_id"].map(author_keys)
)
reviews_to_store.to_sql("reviews", conn, if_exists="append", index=False)

print("Data successfully stored in SQLite")

Data successfully stored in SQLite


In [94]:
# Secondary indexes on the columns reviews are filtered/joined by.
# IF NOT EXISTS keeps the script safe to run against a database that already
# has these indexes (plain CREATE INDEX raises "index ... already exists").
index_sql = """
CREATE INDEX IF NOT EXISTS idx_reviews_date ON reviews(review_date);
CREATE INDEX IF NOT EXISTS idx_reviews_user ON reviews(author_no);
CREATE INDEX IF NOT EXISTS idx_reviews_hotel ON reviews(offering_id);
"""

cursor.executescript(index_sql)
conn.commit()

# Last database step of the notebook: the connection is closed here, so any
# later cell that needs the database must reconnect.
conn.close()
print("Indexes created and database finalized")

Indexes created and database finalized


In [95]:
# Also export the three frames as CSV files next to the database.
# to_csv is called with its defaults, so each file's first (unnamed) column is
# the DataFrame index.
csv_exports = (
    ("data/authors.csv", author_df),
    ("data/reviews.csv", reviews_df),
    ("data/hotels.csv", hotels_df),
)
for csv_path, frame in csv_exports:
    frame.to_csv(csv_path)