In [73]:
import json
import sqlite3
import pandas as pd

In [93]:
# original
# JSON_FILE = "data/review.json"        
# SQLITE_DB = "data/reviews_sample.db"

# Root of the assignment checkout; every path below is derived from it.
ASSIGNMENT_ROOT = "../student-name-hotel-analytics"
# DATA_DIR keeps its trailing slash because file names are appended directly.
DATA_DIR = f"{ASSIGNMENT_ROOT}/data/"
JSON_FILE = f"{DATA_DIR}review.json"
SQLITE_DB = f"{DATA_DIR}reviews_sample.db"
SQL_SCHEMA = f"{DATA_DIR}data_schema.sql"
SQL_INDEXING = f"{DATA_DIR}db_indexing.sql"



In [94]:
# Read the newline-delimited JSON file: one JSON object per line, collected
# into a list and turned into a DataFrame in a single step.
records = []

with open(JSON_FILE, "r", encoding="utf-8") as f:
    for line in f:
        # A whitespace-only line (e.g. a trailing blank line at end of file)
        # is not valid JSON and would make json.loads raise, so skip it.
        if not line.strip():
            continue
        records.append(json.loads(line))

df = pd.DataFrame(records)
print(f"Loaded {len(df)} raw records")



Loaded 878561 raw records


In [95]:
# Quick look at the raw frame: its columns, the extremes of the raw `date`
# column, and the first value of a handful of columns.
df.head()  # not the last expression of the cell, so nothing is rendered for it
print(df.columns)

# NOTE(review): `date` is still an object (string) column at this point, so
# max()/min() compare the text of the dates, not calendar order.
raw_dates = df["date"]
print(raw_dates.max(), raw_dates.min())
print(raw_dates.dtype)

# max_colwidth=None stops pandas from truncating long cell text below.
for option_name in ("display.width", "display.max_colwidth"):
    pd.set_option(option_name, None)

for column_name in (
    "ratings",
    "title",
    "text",
    "author",
    "date_stayed",
    "offering_id",
    "num_helpful_votes",
    "id",
):
    print(df[column_name].head(1))

Index(['ratings', 'title', 'text', 'author', 'date_stayed', 'offering_id',
       'num_helpful_votes', 'date', 'id', 'via_mobile'],
      dtype='object')
September 9, 2012 April 1, 2002
object
0    {'service': 5.0, 'cleanliness': 5.0, 'overall': 5.0, 'value': 5.0, 'location': 5.0, 'sleep_quality': 5.0, 'rooms': 5.0}
Name: ratings, dtype: object
0    “Truly is "Jewel of the Upper Wets Side"”
Name: title, dtype: object
0    Stayed in a king suite for 11 nights and yes it cots us a bit but we were happy with the standard of room, the location and the friendliness of the staff. Our room was on the 20th floor overlooking Broadway and the madhouse of the Fairway Market. Room was quite with no noise evident from the hallway or adjoining rooms. It was great to be able to open windows when we craved fresh rather than heated air. The beds, including the fold out sofa bed, were comfortable and the rooms were cleaned well. Wi-fi access worked like a dream with only one connectivity issue on our fi

In [96]:
# Parse the raw date strings; errors='coerce' turns anything unparseable into
# NaT instead of raising.
df['review_date'] = pd.to_datetime(df['date'], errors='coerce')

# Keep the five years leading up to the most recent review. Rows whose date is
# NaT compare False against the cutoff and are therefore dropped as well.
max_date = df['review_date'].max()
cutoff_date = max_date - pd.DateOffset(years=5)
within_window = df["review_date"] >= cutoff_date
df = df[within_window]

earliest_review = df['review_date'].min()
latest_review = df['review_date'].max()
print(f"Earliest date: {earliest_review}")
print(f"Latest date: {latest_review}")
print(f"Window = {(latest_review - earliest_review).days / 365:.2f} years")
print(f"After date filtering: {len(df)} reviews")
print(df.shape)

Earliest date: 2007-12-20 00:00:00
Latest date: 2012-12-20 00:00:00
Window = 5.01 years
After date filtering: 754798 reviews
(754798, 11)


In [97]:
# Flatten the nested `author` dict into one column per attribute.
# Maps each new column name to the key it is read from; a key that is absent
# from a given author dict yields None for that row (dict.get semantics).
author_fields = {
    'author_id': 'id',
    'author_name': 'username',
    'author_location': 'location',
    'author_num_reviews': 'num_reviews',
    'author_num_cities': 'num_cities',
    'author_num_helpful_votes': 'num_helpful_votes',
    'author_num_type_reviews': 'num_type_reviews',
}

for column_name, author_key in author_fields.items():
    # Bind the key as a default argument so each lambda keeps its own key.
    df[column_name] = df['author'].apply(lambda author, key=author_key: author.get(key))

df.columns

Index(['ratings', 'title', 'text', 'author', 'date_stayed', 'offering_id',
       'num_helpful_votes', 'date', 'id', 'via_mobile', 'review_date',
       'author_id', 'author_name', 'author_location', 'author_num_reviews',
       'author_num_cities', 'author_num_helpful_votes',
       'author_num_type_reviews'],
      dtype='object')

In [98]:
# Flatten the nested `ratings` dict into one column per rating.
# Maps each new column name to the key it is read from; only `location` is
# stored under a different column name (`location_rating`). A rating that is
# absent from a given dict yields None for that row (dict.get semantics).
rating_fields = {
    "overall": "overall",
    "service": "service",
    "cleanliness": "cleanliness",
    "value": "value",
    "location_rating": "location",
    "sleep_quality": "sleep_quality",
    "rooms": "rooms",
}

for column_name, rating_key in rating_fields.items():
    # Bind the key as a default argument so each lambda keeps its own key.
    df[column_name] = df["ratings"].apply(lambda ratings, key=rating_key: ratings.get(key))

df.columns

Index(['ratings', 'title', 'text', 'author', 'date_stayed', 'offering_id',
       'num_helpful_votes', 'date', 'id', 'via_mobile', 'review_date',
       'author_id', 'author_name', 'author_location', 'author_num_reviews',
       'author_num_cities', 'author_num_helpful_votes',
       'author_num_type_reviews', 'overall', 'service', 'cleanliness', 'value',
       'location_rating', 'sleep_quality', 'rooms'],
      dtype='object')

In [99]:
### For the sample data we only need a subset of the records (the next cell
### draws 50,000), so every row with a missing value in any column is dropped.
df = df.dropna(axis=0, how="any")
df.shape
# To check how many values a single column is missing:
# df['author_num_type_reviews'].isna().sum()

(266164, 25)

In [100]:
### Sample 50k records for faster processing in the next steps
SAMPLE_SIZE = 50_000
# Cap the request at the number of rows available: DataFrame.sample raises
# ValueError when n exceeds the population and replace=False (the default).
# random_state is fixed so the same rows are drawn on every run.
df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=42).reset_index(drop=True)

In [108]:
# Author table: the author_* columns, one row per distinct combination.
# NOTE(review): de-duplication is over all listed columns, so an author_id
# would appear more than once if any of its other fields differ between
# rows; confirm whether the authors table expects author_id to be unique.
author_columns = [
    'author_id',
    'author_name',
    'author_location',
    'author_num_reviews',
    'author_num_cities',
    'author_num_helpful_votes',
    'author_num_type_reviews',
]
author_df = df[author_columns].drop_duplicates()

In [109]:
# Hotel table: one row per distinct offering_id.
hotels_df = df[['offering_id']].drop_duplicates()

In [110]:
# Review table: identifiers, the seven ratings, the review text and dates.
# NOTE(review): the last column is the author-level `author_num_helpful_votes`;
# the review-level `num_helpful_votes` column of `df` is not included.
# Confirm which of the two the reviews table is meant to carry.
review_columns = [
    'id',
    'author_id',
    'offering_id',
    'overall',
    'service',
    'cleanliness',
    'value',
    'location_rating',
    'sleep_quality',
    'rooms',
    'title',
    'text',
    'review_date',
    'date_stayed',
    'via_mobile',
    'author_num_helpful_votes',
]
reviews_df = df[review_columns].drop_duplicates()
# print(df.columns.tolist())

In [111]:
# Open a connection to the SQLite database file at SQLITE_DB and keep a cursor
# for the SQL scripts executed in the following cells. The connection stays
# open across cells and is closed after the indexing script has run.
conn = sqlite3.connect(SQLITE_DB)
cursor = conn.cursor()

In [112]:
# Show the first row of the reviews frame as a sanity check of its columns.
reviews_df.head(1)

Unnamed: 0,id,author_id,offering_id,overall,service,cleanliness,value,location_rating,sleep_quality,rooms,title,text,review_date,date_stayed,via_mobile,author_num_helpful_votes
0,118369083,815DCAB879EAC7EA69C264CA468AD791,123036,5.0,5.0,5.0,5.0,5.0,5.0,5.0,“Nice hotel/convienent location”,"The hotel is undergoing a lobby upgrade. Construction was present, but quiet. The rooms are freshly remodeled. Breakfast was good, above normal Hampton standard. Location is right great being right next to the Seattle Center and walking distance to downtown.",2011-09-20,September 2011,False,9.0


In [113]:
# Read the schema script from disk and run it against the open database.
with open(SQL_SCHEMA, encoding="utf-8") as schema_file:
    sql = schema_file.read()

# executescript runs every statement in the script in one call.
cursor.executescript(sql)
conn.commit()

In [114]:
# Write each frame to its own table, without the DataFrame index.
# NOTE(review): if_exists="replace" drops an existing table before writing, so
# if data_schema.sql created these tables, their column definitions and
# constraints are discarded here. Confirm that this is intended.
tables_to_write = (
    ("authors", author_df),
    ("hotels", hotels_df),
    ("reviews", reviews_df),
)
for table_name, frame in tables_to_write:
    frame.to_sql(table_name, conn, if_exists="replace", index=False)

print("Data successfully stored in SQLite")

Data successfully stored in SQLite


In [115]:
# Read the indexing script from disk and run it on the populated database.
with open(SQL_INDEXING, encoding="utf-8") as indexing_file:
    index_sql = indexing_file.read()

cursor.executescript(index_sql)
conn.commit()

# Last database step of the notebook: release the connection.
conn.close()
print("Indexes created and database finalized")
# df.columns.tolist()

Indexes created and database finalized


In [90]:
# df.drop(columns=["author", "ratings"], inplace=True)
# author_df.to_csv(f"{DATA_DIR}/authors.csv")
# reviews_df.to_csv(f"{DATA_DIR}/reviews.csv")
# hotels_df.to_csv(f"{DATA_DIR}/hotels.csv")
# df.to_csv(f"{DATA_DIR}/consolidated_data.csv", index=False)

In [116]:
# Drop the raw nested dict columns; their contents were already flattened into
# the author_* and rating columns in earlier cells.
# Rebinding (instead of inplace=True) together with errors="ignore" keeps the
# cell re-runnable: a second run no longer raises KeyError for columns that
# are already gone.
df = df.drop(columns=["author", "ratings"], errors="ignore")
print(df.columns.tolist())
df.head(1)

['title', 'text', 'date_stayed', 'offering_id', 'num_helpful_votes', 'date', 'id', 'via_mobile', 'review_date', 'author_id', 'author_name', 'author_location', 'author_num_reviews', 'author_num_cities', 'author_num_helpful_votes', 'author_num_type_reviews', 'overall', 'service', 'cleanliness', 'value', 'location_rating', 'sleep_quality', 'rooms']


Unnamed: 0,title,text,date_stayed,offering_id,num_helpful_votes,date,id,via_mobile,review_date,author_id,...,author_num_cities,author_num_helpful_votes,author_num_type_reviews,overall,service,cleanliness,value,location_rating,sleep_quality,rooms
0,“Nice hotel/convienent location”,"The hotel is undergoing a lobby upgrade. Construction was present, but quiet. The rooms are freshly remodeled. Breakfast was good, above normal Hampton standard. Location is right great being right next to the Seattle Center and walking distance to downtown.",September 2011,123036,0,"September 20, 2011",118369083,False,2011-09-20,815DCAB879EAC7EA69C264CA468AD791,...,11.0,9.0,10.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0


### Create Variable Dataframe For Subsequent EDA

In [117]:

# Metadata for every column of `df`: one (name, type, role, description) entry
# per column, listed in the frame's column order. Keeping the four attributes
# of a column on one line avoids the parallel positional lists drifting apart.
variable_specs = [
    ("title", "string", "feature", "Title of the review"),
    ("text", "string", "feature", "Text content of the review"),
    ("date_stayed", "datetime", "feature", "Date when the author stayed at the hotel"),
    ("offering_id", "Int64", "", "ID of the hotel"),
    ("num_helpful_votes", "Int64", "feature", "Number of helpful votes this review received"),
    ("date", "datetime", "feature", "Original review date string from the source data"),
    ("id", "Int64", "", "Unique identifier of the review"),
    ("via_mobile", "boolean", "feature", "Whether the review was posted via a mobile device"),
    ("review_date", "datetime", "feature", "Date when the author posted the review (datetime)"),
    ("author_id", "string", "", "Unique identifier of the author"),
    ("author_name", "string", "", "Username of the author"),
    ("author_location", "string", "feature", "Self-reported location of the author"),
    ("author_num_reviews", "Int64", "feature", "Total number of reviews written by the author"),
    ("author_num_cities", "Int64", "feature", "Number of different cities the author has reviewed"),
    ("author_num_helpful_votes", "Int64", "feature", "Total helpful votes received by the author"),
    ("author_num_type_reviews", "Int64", "feature", "Number of reviews of this property type by the author"),
    ("overall", "Int64", "target", "Overall rating given in the review"),
    ("service", "Int64", "feature", "Service rating given in the review"),
    ("cleanliness", "Int64", "feature", "Cleanliness rating given in the review"),
    ("value", "Int64", "feature", "Value rating given in the review"),
    ("location_rating", "Int64", "feature", "Location rating given in the review"),
    ("sleep_quality", "Int64", "feature", "Sleep quality rating given in the review"),
    ("rooms", "Int64", "feature", "Rooms rating given in the review"),
]

df_variables = pd.DataFrame(
    variable_specs, columns=["name", "type", "role", "description"]
)

# Fail loudly if the frame's columns no longer match the specs, rather than
# writing metadata that labels the wrong columns.
if df_variables["name"].tolist() != df.columns.tolist():
    raise ValueError(
        "variable_specs is out of sync with df.columns: "
        f"{df_variables['name'].tolist()} != {df.columns.tolist()}"
    )

# DATA_DIR already ends with "/", so the file name is appended directly, the
# same way the other data paths are built.
df_variables.to_csv(DATA_DIR + "consolidated_data_variables_info.csv", index=False)
df_variables


Unnamed: 0,name,type,role,description
0,title,string,feature,Title of the review
1,text,string,feature,Text content of the review
2,date_stayed,datetime,feature,Date when the author stayed at the hotel
3,offering_id,Int64,,ID of the hotel
4,num_helpful_votes,Int64,feature,Number of helpful votes this review received
5,date,datetime,feature,Original review date string from the source data
6,id,Int64,,Unique identifier of the review
7,via_mobile,boolean,feature,Whether the review was posted via a mobile device
8,review_date,datetime,feature,Date when the author posted the review (datetime)
9,author_id,string,,Unique identifier of the author
