In [9]:
import pandas as pd
import time

# Time the CSV load with perf_counter(): it is a monotonic, high-resolution
# clock meant for measuring intervals, whereas time.time() follows the wall
# clock and can jump if the system time is adjusted mid-measurement.
start = time.perf_counter()
df = pd.read_csv('Hotel_Reviews.csv')
end = time.perf_counter()
# Report the elapsed load time rounded to two decimals.
print(f"Data loaded in {round(end - start, 2)} seconds")

Data loaded in 3.22 seconds


In [10]:
# Overall size of the review table as (rows, columns).
print(f"DataFrame shape: {df.shape}")

# Number of reviews contributed by each reviewer nationality.
nationality_col = df['Reviewer_Nationality']
nationality_counts = nationality_col.value_counts()
top_ten = nationality_counts.head(10)
print(f"\nTop 10 reviewer nationalities:\n{top_ten}")

# How many different nationality labels exist, plus a sample of ten of them.
distinct_total = nationality_col.nunique()
sample_labels = nationality_col.unique()[:10]
print(f"\nNumber of distinct nationalities: {distinct_total}")
print(f"First 10 nationalities:\n{sample_labels}")

DataFrame shape: (515738, 17)

Top 10 reviewer nationalities:
Reviewer_Nationality
United Kingdom               245246
United States of America      35437
Australia                     21686
Ireland                       14827
United Arab Emirates          10235
Saudi Arabia                   8951
Netherlands                    8772
Switzerland                    8678
Germany                        7941
Canada                         7894
Name: count, dtype: int64

Number of distinct nationalities: 227
First 10 nationalities:
[' Russia ' ' Ireland ' ' Australia ' ' United Kingdom ' ' New Zealand '
 ' Poland ' ' Belgium ' ' Norway ' ' France ' ' Italy ']


In [11]:
# The most common nationality is the label with the largest review count.
most_common_nationality = nationality_counts.idxmax()
most_common_nationality_count = nationality_counts.max()
# The stored labels are padded with spaces (e.g. ' United Kingdom '), which
# printed as doubled spaces around the name. Strip for display only, so the
# variable keeps the exact label used in the DataFrame.
print(f"Most common nationality: {str(most_common_nationality).strip()} ({most_common_nationality_count} reviews)")

# Show the ten nationalities with the most reviews and their counts.
print("\nTop 10 nationalities and review counts:")
print(nationality_counts.head(10))


Most common nationality:  United Kingdom  (245246 reviews)

Top 10 nationalities and review counts:
Reviewer_Nationality
United Kingdom               245246
United States of America      35437
Australia                     21686
Ireland                       14827
United Arab Emirates          10235
Saudi Arabia                   8951
Netherlands                    8772
Switzerland                    8678
Germany                        7941
Canada                         7894
Name: count, dtype: int64


In [12]:
print("Most reviewed hotel for each of the top 10 nationalities:")
for nationality in nationality_counts.head(10).index:
    # Pick this nationality's hotel names in a single .loc selection (no
    # chained indexing), then take the name that occurs most often.
    top_hotel = df.loc[df['Reviewer_Nationality'] == nationality, 'Hotel_Name'].value_counts().idxmax()
    # Labels are space-padded in the data; strip for display only so the
    # comparison above still uses the exact stored value.
    print(f"{str(nationality).strip()}: {top_hotel}")


Most reviewed hotel for each of the top 10 nationalities:
 United Kingdom : Britannia International Hotel Canary Wharf
 United States of America : Hotel Esther a
 Australia : Park Plaza Westminster Bridge London
 Ireland : Copthorne Tara Hotel London Kensington
 United Arab Emirates : Millennium Hotel London Knightsbridge
 Saudi Arabia : The Cumberland A Guoman Hotel
 Netherlands : Jaz Amsterdam
 Switzerland : Hotel Da Vinci
 Germany : Hotel Da Vinci
 Canada : St James Court A Taj Hotel London


In [13]:
# Tally how many reviews each hotel received, then show the ten largest tallies.
hotel_review_counts = df['Hotel_Name'].value_counts()
print(
    "Top 10 hotels by number of reviews:",
    hotel_review_counts.head(10),
    sep="\n",
)


Top 10 hotels by number of reviews:
Hotel_Name
Britannia International Hotel Canary Wharf           4789
Strand Palace Hotel                                  4256
Park Plaza Westminster Bridge London                 4169
Copthorne Tara Hotel London Kensington               3578
DoubleTree by Hilton Hotel London Tower of London    3212
Grand Royale London Hyde Park                        2958
Holiday Inn London Kensington                        2768
Hilton London Metropole                              2628
Millennium Gloucester Hotel London                   2565
Intercontinental London The O2                       2551
Name: count, dtype: int64


In [14]:
# Per-hotel mean of the individual Reviewer_Score values, kept to two decimals.
calc_avg_scores = (
    df.groupby('Hotel_Name')['Reviewer_Score']
    .agg('mean')
    .round(2)
)
# Attach each hotel's computed mean to every one of its review rows.
df['Calc_Average_Score'] = df['Hotel_Name'].map(calc_avg_scores)


In [21]:
# Select the rows whose dataset Average_Score and recomputed hotel mean agree
# once both are rounded to one decimal place, and report how many there are.
scores_agree = df['Average_Score'].round(1).eq(df['Calc_Average_Score'].round(1))
matched = df.loc[scores_agree]
print(f"Rows where rounded Average_Score equals calculated: {matched.shape[0]}")


Rows where rounded Average_Score equals calculated: 193754


In [23]:
# Keep the rows whose one-decimal Average_Score differs from the one-decimal
# recomputed hotel mean.
scores_differ = df['Average_Score'].round(1).ne(df['Calc_Average_Score'].round(1))
mismatches = df.loc[scores_differ]
print(f"Total mismatches: {mismatches.shape[0]}")
print("First 10 mismatches:")
# Final expression of the cell: the notebook displays it as the cell's output.
mismatches.loc[:, ['Hotel_Name', 'Average_Score', 'Calc_Average_Score']].head(10)

Total mismatches: 321984
First 10 mismatches:


Unnamed: 0,Hotel_Name,Average_Score,Calc_Average_Score
0,Hotel Arena,7.7,7.84
1,Hotel Arena,7.7,7.84
2,Hotel Arena,7.7,7.84
3,Hotel Arena,7.7,7.84
4,Hotel Arena,7.7,7.84
5,Hotel Arena,7.7,7.84
6,Hotel Arena,7.7,7.84
7,Hotel Arena,7.7,7.84
8,Hotel Arena,7.7,7.84
9,Hotel Arena,7.7,7.84


In [25]:
# Boolean masks for reviews whose text is exactly the placeholder string.
is_no_negative = df['Negative_Review'].eq("No Negative")
is_no_positive = df['Positive_Review'].eq("No Positive")

# Count each placeholder on its own, then the rows where both occur together.
no_neg = is_no_negative.sum()
no_pos = is_no_positive.sum()
both = (is_no_negative & is_no_positive).sum()

print(f"Rows with 'No Negative': {no_neg}")
print(f"Rows with 'No Positive': {no_pos}")
print(f"Rows with both: {both}")

Rows with 'No Negative': 127890
Rows with 'No Positive': 35946
Rows with both: 127
