In [36]:
import json
import sqlite3
import pandas as pd
pd.set_option('display.max_rows', 100)

In [37]:
JSON_FILE = "review.json"        
SQLITE_DB = "reviews_sample.db"

In [38]:
conn = sqlite3.connect(SQLITE_DB)
query_tables = "SELECT name FROM sqlite_master WHERE type='table';"
tables = pd.read_sql(query_tables, conn)
# number of tables in db = 3
tables

Unnamed: 0,name
0,sqlite_sequence
1,authors
2,hotels
3,reviews


In [39]:
table_name = 'authors'
authors_df = pd.read_sql(f"SELECT * FROM {table_name}", conn)

print(f"Data Shape: Rows: {authors_df.shape[0]}, Columns: {authors_df.shape[1]}")  # (rows, columns)
authors_df.describe()

Data Shape: Rows: 524023, Columns: 8


Unnamed: 0,author_no,author_num_reviews,author_num_cities,author_num_helpful_votes,author_num_type_reviews
count,524023.0,523969.0,357160.0,386588.0,273448.0
mean,262012.0,11.610378,9.543325,12.302878,10.183724
std,151272.554393,19.744705,10.475232,24.292725,10.837735
min,1.0,1.0,2.0,1.0,3.0
25%,131006.5,1.0,3.0,2.0,4.0
50%,262012.0,4.0,6.0,5.0,7.0
75%,393017.5,14.0,12.0,13.0,12.0
max,524023.0,935.0,284.0,964.0,330.0


In [40]:
# just to see how data looks like
print(f"pre-cleaning - Rows: {len(authors_df)}")
authors_df.tail(30)

pre-cleaning - Rows: 524023


Unnamed: 0,author_no,author_id,author_name,author_location,author_num_reviews,author_num_cities,author_num_helpful_votes,author_num_type_reviews
523993,523994,33A67B4E7BE6FCB424443A9B0BFF5092,emanuelelan,,6.0,5.0,,
523994,523995,3289BAB92143000EF7A7FD53F27F081D,Moefra,"Reggio Emilia, Italy",1.0,,,
523995,523996,AB4D79AEB76DB5ABF44C83FEA5725F52,Giovanna_JL,"Roma, Italia",15.0,8.0,2.0,3.0
523996,523997,579AE775B45DC0FA61DB8F50B44DE421,hqyap,"Osaka, Japan",1.0,,,
523997,523998,F27DC487CC30FEF8282757C7766EB894,Claudio71,Macerata,19.0,16.0,6.0,8.0
523998,523999,AEB751B93867C601C484E71D24949CE8,Alberto F,,1.0,,1.0,
523999,524000,BA6DDA3ACBA092798716CA95398238ED,Sissiundfranz,,11.0,10.0,2.0,8.0
524000,524001,F114FB8FD81F6EF8308F5D3E24A67A0D,kent991230,kobe,1.0,,,
524001,524002,9946721D7459D75DF238FE6E01A3DF51,absolute_girl,"Montreal, Canada",8.0,7.0,7.0,7.0
524002,524003,1E47A5870989B58AB2FFAF61A5B6B40E,Docdodo,"Besana in Brianza, Italy",87.0,33.0,26.0,25.0


In [None]:
sorted_dupe_authors = authors_df[authors_df.duplicated(subset=['author_id'], keep=False)].sort_values(by='author_id')
print(f"Number of duplicate rows:\t{len(sorted_dupe_authors)}")

full_names_before = set(authors_df['author_name'].unique())

duped_names_before = set(sorted_dupe_authors['author_name'].unique())
print(f"Unique names from authors before clean:{len(full_names_before)}")
print(f"Unique names from duplicates before clean:{len(duped_names_before)}")

sorted_dupe_authors[0:10]
# sorted_dupe_authors[0:100]
# sorted_dupe_authors[101:200]
# sorted_dupe_authors[201:300]
# sorted_dupe_authors[sorted_dupe_authors['author_name'] == 'Daniel B']
# sorted_dupe_authors[sorted_dupe_authors['author_name'] == 'niki16']
# sorted_dupe_authors[sorted_dupe_authors['author_name'] == 'Bellwmu']




Number of duplicate rows:	2062
Unique names from authors before clean:483346
Unique names from duplicates before clean:1120


'\nLOGIC\nThis is another unique case where two author id has different name, we will merge based off same \n'

In [72]:
'''
LOGIC
This is another unique case where two author id has different name, we will merge based off same 
'''
sorted_dupe_authors[sorted_dupe_authors['author_id'] == '1B27A5219983ACC4A0269FAF6F7E20C5']

Unnamed: 0,author_no,author_id,author_name,author_location,author_num_reviews,author_num_cities,author_num_helpful_votes,author_num_type_reviews,completeness_score
425516,425517,1B27A5219983ACC4A0269FAF6F7E20C5,Bellwmu,"Ann Arbor, MI",23.0,14.0,14.0,12.0,4
162807,162808,1B27A5219983ACC4A0269FAF6F7E20C5,niki16,"Ann Arbor, MI",23.0,14.0,14.0,12.0,4


In [65]:
'''
LOGIC
Removing duplicates for author's table, can see above code. Observed that author_name is duplicated but author_no is unique. The logic
is assumed that a higher author no (eg: 25107 vs 405914) has 405914 to be a more recent record. We can remove the duplicates 
following the below heuristic (earliest takes higher priority)

1. Completeness of author_num_reviews, author_num_cities, author_num_helpful_votes, author_num_type_reviews
    Rows with 4/4 valid columns come first
2. Higher author_no == newer data, come first

- if there are 2 duplicate rows, see "sorted_dupe_authors[sorted_dupe_authors['author_name'] == 'Daniel B']"
    we will prioritise the earlier record that has lesser NaN values
'''

cols_to_check = ['author_num_reviews', 'author_num_cities', 'author_num_helpful_votes', 'author_num_type_reviews']
authors_df['completeness_score'] = authors_df[cols_to_check].notna().sum(axis=1)

authors_df = authors_df.sort_values(by=['completeness_score', 'author_no'], ascending=[False, False])

authors_deduped = authors_df.drop_duplicates(subset=['author_id'], keep='first')
authors_deduped = authors_deduped.drop(columns=['completeness_score']).reset_index(drop=True)

names_after = set(authors_deduped['author_name'].unique())

print(f"Unique names after: {len(names_after)}")
print(f"Original count: {len(authors_df)}")
print(f"Cleaned count: {len(authors_deduped)}")
authors_deduped.head()

Unique names after: 483164
Original count: 524023
Cleaned count: 522889


Unnamed: 0,author_no,author_id,author_name,author_location,author_num_reviews,author_num_cities,author_num_helpful_votes,author_num_type_reviews
0,524018,372E83E5CC6D5C9D4B803C37AC7A32D1,sofia2000,Emilia Romagna,10.0,8.0,6.0,7.0
1,524017,8C0A5D7AAAC626F189C01058F464A47E,MIssBeaker,"Hamburg, Germany",6.0,5.0,3.0,5.0
2,524016,7B2BF2CBA618F5EF57ABD77B8BCB5DF7,JamesD1975,Germany,17.0,12.0,3.0,8.0
3,524015,707465F74FC1695FCB60DB75D3354948,Loukoume,"Paris, France",15.0,8.0,7.0,8.0
4,524014,F97F67FB925EA7BAA61EBE4FADFEA85B,thegarlicheads,,7.0,7.0,2.0,6.0


In [47]:
# sanity check to ensure no duplicated author_id
sorted_dupe_author_id = authors_deduped[authors_deduped.duplicated(subset=['author_id'], keep=False)].sort_values(by='author_id')
print(len(sorted_dupe_author_id))

0


In [54]:
# sanity check to ensure no duplicated author_id
sorted_dupe_author_no = authors_deduped[authors_deduped.duplicated(subset=['author_no'], keep=False)].sort_values(by='author_no')
print(len(sorted_dupe_author_no))

0


In [None]:
# ISSUE: duplicated author_names
sorted_dupe_author_name = authors_deduped[authors_deduped.duplicated(subset=['author_name'], keep=False)].sort_values(by='author_name')
print(len(sorted_dupe_author_name))

49899


In [None]:
# CHECK: same name but location differs

from IPython.display import display

name_counts = authors_deduped['author_name'].value_counts()
duplicate_names = name_counts[name_counts > 1].index

print(len(duplicate_names))

# adjust the split range duplicated_names[x] to see specific entry and duplicated_names[1:10] to see a few examples
common_name = duplicate_names[1:2]
for c in common_name:
    print(f"Duplicates for name: {c}") 
    
    entries = authors_deduped[authors_deduped['author_name'] == c]
    display(entries[['author_id', 'author_name', 'author_location', 'author_num_reviews', 'author_num_cities', 'author_num_helpful_votes', 'author_num_type_reviews']].head(20))

# OBSERVATION: we assume that the name is duplicated but author_location is unique. That means they are actually different customers
# we can also observe the author_num_reviews, author_num_cities, author_num_helpful_votes, author_num_type_reviews are not very similar
# TODO: should we clean location to countries only?


10174
Duplicates for name: David B


Unnamed: 0,author_id,author_name,author_location,author_num_reviews,author_num_cities,author_num_helpful_votes,author_num_type_reviews
15911,CC08AFD28033FB780AD1D32EBA0E4980,David B,"Columbus, Ohio",26.0,10.0,15.0,6.0
17684,62630A1D0714FEF90C4CD0F38368D657,David B,"Huddersfield, United Kingdom",25.0,13.0,8.0,9.0
21692,833AE3396D8D0B2B114186DF8DD9F103,David B,"New York City, New York",14.0,5.0,7.0,3.0
48108,EAD2FB887C71DD2C62B18185CE12A20D,David B,"Fort Myers, Florida",6.0,6.0,5.0,6.0
49123,8729A8290B01E3767637A0505936101B,David B,Indianapo,3.0,3.0,1.0,3.0
66796,37720149867226EE0A300A3F6A45B937,David B,"Montgomery, Alabama",10.0,5.0,4.0,4.0
70769,12565BD91DEB2B0FC0AF15A14028AD1C,David B,"Atlanta, Georgia",4.0,4.0,5.0,4.0
95050,A6183369278DAEF37748B1F0E5ACB52A,David B,"Fort Worth, Texas",12.0,3.0,3.0,3.0
111050,70CAB47D53DCA2BBAC69E9F99812044C,David B,"Blacksburg, VA",18.0,17.0,18.0,18.0
115504,A54C244741654674EAE6890FA5CA9A25,David B,Chicago,41.0,21.0,12.0,13.0


In [79]:
print(f"Original Data Shape: \t\t Rows: {authors_df.shape[0]}, Columns: {authors_df.shape[1]}")  # (rows, columns)
authors_df.describe()

print(f"Data Shape after Deduped: \t Rows: {authors_deduped.shape[0]}, Columns: {authors_deduped.shape[1]}")  # (rows, columns)
authors_deduped.describe()
authors_df.head()

Original Data Shape: 		 Rows: 524023, Columns: 9
Data Shape after Deduped: 	 Rows: 522889, Columns: 8


Unnamed: 0,author_no,author_id,author_name,author_location,author_num_reviews,author_num_cities,author_num_helpful_votes,author_num_type_reviews,completeness_score
524017,524018,372E83E5CC6D5C9D4B803C37AC7A32D1,sofia2000,Emilia Romagna,10.0,8.0,6.0,7.0,4
524016,524017,8C0A5D7AAAC626F189C01058F464A47E,MIssBeaker,"Hamburg, Germany",6.0,5.0,3.0,5.0,4
524015,524016,7B2BF2CBA618F5EF57ABD77B8BCB5DF7,JamesD1975,Germany,17.0,12.0,3.0,8.0,4
524014,524015,707465F74FC1695FCB60DB75D3354948,Loukoume,"Paris, France",15.0,8.0,7.0,8.0,4
524013,524014,F97F67FB925EA7BAA61EBE4FADFEA85B,thegarlicheads,,7.0,7.0,2.0,6.0,4


In [80]:
authors_deduped.head()

Unnamed: 0,author_no,author_id,author_name,author_location,author_num_reviews,author_num_cities,author_num_helpful_votes,author_num_type_reviews
0,524018,372E83E5CC6D5C9D4B803C37AC7A32D1,sofia2000,Emilia Romagna,10.0,8.0,6.0,7.0
1,524017,8C0A5D7AAAC626F189C01058F464A47E,MIssBeaker,"Hamburg, Germany",6.0,5.0,3.0,5.0
2,524016,7B2BF2CBA618F5EF57ABD77B8BCB5DF7,JamesD1975,Germany,17.0,12.0,3.0,8.0
3,524015,707465F74FC1695FCB60DB75D3354948,Loukoume,"Paris, France",15.0,8.0,7.0,8.0
4,524014,F97F67FB925EA7BAA61EBE4FADFEA85B,thegarlicheads,,7.0,7.0,2.0,6.0


In [82]:
# CLEAN: replace NaN with 0 
cols_to_fix = ['author_num_reviews', 'author_num_cities', 'author_num_helpful_votes', 'author_num_type_reviews']
authors_deduped[cols_to_fix] = authors_deduped[cols_to_fix].fillna(0)

# CLEAN: replace floats with integers
authors_deduped[cols_to_fix] = authors_deduped[cols_to_fix].astype(int)

print("Verification check for null")
print(authors_deduped[cols_to_fix].isnull().sum())
print()
print("Verification check for datatype")
print(authors_deduped[cols_to_fix].dtypes)
authors_deduped.head()

Verification check for null
author_num_reviews          0
author_num_cities           0
author_num_helpful_votes    0
author_num_type_reviews     0
dtype: int64

Verification check for datatype
author_num_reviews          int64
author_num_cities           int64
author_num_helpful_votes    int64
author_num_type_reviews     int64
dtype: object


Unnamed: 0,author_no,author_id,author_name,author_location,author_num_reviews,author_num_cities,author_num_helpful_votes,author_num_type_reviews
0,524018,372E83E5CC6D5C9D4B803C37AC7A32D1,sofia2000,Emilia Romagna,10,8,6,7
1,524017,8C0A5D7AAAC626F189C01058F464A47E,MIssBeaker,"Hamburg, Germany",6,5,3,5
2,524016,7B2BF2CBA618F5EF57ABD77B8BCB5DF7,JamesD1975,Germany,17,12,3,8
3,524015,707465F74FC1695FCB60DB75D3354948,Loukoume,"Paris, France",15,8,7,8
4,524014,F97F67FB925EA7BAA61EBE4FADFEA85B,thegarlicheads,,7,7,2,6


In [None]:
# TODO: are we happy with this state? i think author_location can be improved, separate them to two columns, state and country

Original Data Shape: 		 Rows: 524023, Columns: 9
Data Shape after Deduped: 	 Rows: 522889, Columns: 8


Unnamed: 0,author_no,author_id,author_name,author_location,author_num_reviews,author_num_cities,author_num_helpful_votes,author_num_type_reviews,completeness_score
524017,524018,372E83E5CC6D5C9D4B803C37AC7A32D1,sofia2000,Emilia Romagna,10.0,8.0,6.0,7.0,4
524016,524017,8C0A5D7AAAC626F189C01058F464A47E,MIssBeaker,"Hamburg, Germany",6.0,5.0,3.0,5.0,4
524015,524016,7B2BF2CBA618F5EF57ABD77B8BCB5DF7,JamesD1975,Germany,17.0,12.0,3.0,8.0,4
524014,524015,707465F74FC1695FCB60DB75D3354948,Loukoume,"Paris, France",15.0,8.0,7.0,8.0,4
524013,524014,F97F67FB925EA7BAA61EBE4FADFEA85B,thegarlicheads,,7.0,7.0,2.0,6.0,4
