In [4]:
import kagglehub
import shutil
import os

# Define target directory
download_dir = "data"

# Ensure the directory exists
os.makedirs(download_dir, exist_ok=True)

# Download the dataset (default location)
path = kagglehub.dataset_download("damienbeneschi/krakow-ta-restaurans-data-raw")

# Move downloaded files to the target directory
for file in os.listdir(path):
    shutil.move(os.path.join(path, file), os.path.join(download_dir, file))

print(f"Dataset moved to: {download_dir}")


Downloading from https://www.kaggle.com/api/v1/datasets/download/damienbeneschi/krakow-ta-restaurans-data-raw?dataset_version_number=5...


100%|██████████| 7.37M/7.37M [00:03<00:00, 2.54MB/s]

Extracting files...





Dataset moved to: data


In [21]:
import pandas as pd
import numpy as np

# Read the CSV file from the data directory
df = pd.read_csv('data/TA_restaurants_curated.csv')

# Display basic information about the dataset
print("Dataset Info:")
print(df.info())

# Display first few rows and basic statistics
print("\nFirst 5 rows:")
print(df.head())

print("\nBasic statistics:")
print(df.describe())

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125527 entries, 0 to 125526
Data columns (total 11 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Unnamed: 0         125527 non-null  int64  
 1   Name               125527 non-null  object 
 2   City               125527 non-null  object 
 3   Cuisine Style      94176 non-null   object 
 4   Ranking            115876 non-null  float64
 5   Rating             115897 non-null  float64
 6   Price Range        77672 non-null   object 
 7   Number of Reviews  108183 non-null  float64
 8   Reviews            115911 non-null  object 
 9   URL_TA             125527 non-null  object 
 10  ID_TA              125527 non-null  object 
dtypes: float64(3), int64(1), object(7)
memory usage: 10.5+ MB
None

First 5 rows:
   Unnamed: 0                        Name       City  \
0           0  Martine of Martine's Table  Amsterdam   
1           1         De Silveren Spiegel  

In [22]:
# Check for missing values
print("\nMissing values:")
print(df.isnull().sum())

# Initial data cleaning steps
# 1. Remove duplicate entries if any
df_clean = df.drop_duplicates()

# 2. Reset index after removing duplicates
df_clean.reset_index(drop=True, inplace=True)

# Show how many duplicates were removed
print(f"\nRows removed: {len(df) - len(df_clean)}")


Missing values:
Unnamed: 0               0
Name                     0
City                     0
Cuisine Style        31351
Ranking               9651
Rating                9630
Price Range          47855
Number of Reviews    17344
Reviews               9616
URL_TA                   0
ID_TA                    0
dtype: int64

Rows removed: 0


In [23]:
import ast

# Drop the unnamed index column and create a clean copy
df_clean = df.drop('Unnamed: 0', axis=1, errors='ignore')

# Handle missing values and data types
df_clean['Price Range'] = df_clean['Price Range'].fillna('Unknown')
df_clean['Number of Reviews'] = df_clean['Number of Reviews'].fillna(0).astype(int)
df_clean['Ranking'] = df_clean['Ranking'].fillna(df_clean['Ranking'].median())  # Filling with median as ranking is continuous
df_clean['Rating'] = df_clean['Rating'].fillna(df_clean['Rating'].median())  # Filling with median as rating is continuous

# Function to safely convert strings to lists
def safe_literal_eval(val):
    if isinstance(val, str):  # Ensure it's a string before parsing
        try:
            return ast.literal_eval(val)
        except (ValueError, SyntaxError):  # Catch malformed cases
            return [[], []]  # Default empty reviews format
    return val  # Return original if it's already a list

# Convert string representations to actual lists
df_clean['Cuisine Style'] = df_clean['Cuisine Style'].fillna('["Unspecified"]').apply(safe_literal_eval)
df_clean['Reviews'] = df_clean['Reviews'].fillna('[[],[]]').apply(safe_literal_eval)

# Standardize Price Range categories
def standardize_price(price):
    price_map = {
        '$': 'Budget',
        '$$ - $$$': 'Moderate',
        '$$$$': 'Expensive'
    }
    return price_map.get(price, 'Unknown')

df_clean['Price Range'] = df_clean['Price Range'].apply(standardize_price)

# Standardize text fields (strip whitespace, lowercase where applicable)
df_clean['City'] = df_clean['City'].str.strip().str.title()  # Capitalizing city names
df_clean['Name'] = df_clean['Name'].str.strip()  # Keeping restaurant names as-is

In [24]:
# Convert data types
df_clean['Ranking'] = df_clean['Ranking'].astype(int)
df_clean['City'] = df_clean['City'].astype('category')  
df_clean['Price Range'] = df_clean['Price Range'].astype('category')
df_clean['ID_TA'] = df_clean['ID_TA'].astype('category')  # Saves memory

In [25]:
import re

# Identify string and categorical columns
string_cols = df_clean.select_dtypes(include=['object', 'string', 'category']).columns

# Detect Strange Characters in String Columns
def find_strange_chars(column):
    unique_values = df_clean[column].astype(str).unique()  # Convert lists to strings
    for value in unique_values:
        if re.search(r'[^\x00-\x7F]', value):  # Non-ASCII character search
            print(f"🔍 Strange character found in {column}: {value}")

# Apply to all string columns
for col in string_cols:
    find_strange_chars(col)

# Check for Encoding Issues
def detect_encoding_issues(column):
    try:
        df_clean[column].apply(lambda x: str(x).encode('utf-8').decode('utf-8'))  # Ensure conversion to string
    except UnicodeDecodeError:
        print(f"⚠️ Encoding issue detected in column: {column}")

# Apply to all string columns
for col in string_cols:
    detect_encoding_issues(col)

print("✅ Strange character and encoding check complete.")


🔍 Strange character found in Name: Benny’s Chicken
🔍 Strange character found in Name: Fashion’s Restaurant & Bar
🔍 Strange character found in Name: Lotti’s Cafe, Bar & Grill
🔍 Strange character found in Name: Noah’s Arq
🔍 Strange character found in Name: Dunkin’ Donuts
🔍 Strange character found in Name: Mama’s Restaurant & Cafe
🔍 Strange character found in Name: Lucy’S Cheesecake and Coffee
🔍 Strange character found in Name: Ferilli’s
🔍 Strange character found in Name: Shakie’s
🔍 Strange character found in Name: Cafe’ De Pianist
🔍 Strange character found in Name: Chicano’s
🔍 Strange character found in Name: Papa John’s
🔍 Strange character found in Name: Kokkinos Krίnos
🔍 Strange character found in Name: Τaverna Spyros - Antonis
🔍 Strange character found in Name: Εxou
🔍 Strange character found in Name: Η Παληα Αθηνα
🔍 Strange character found in Name: Τhe Greco’s Sea Prj Monastiraki
🔍 Strange character found in Name: 8am – 11pm Cafe
🔍 Strange character found in Name: Τα Φιλαρακια
🔍 Stran

In [26]:
import ftfy

# Function to fix encoding errors automatically
def fix_text_encoding(text):
    return ftfy.fix_text(text) if isinstance(text, str) else text

# Apply to all text columns
for col in df_clean.select_dtypes(include=['object', 'string', 'category']).columns:
    df_clean[col] = df_clean[col].apply(fix_text_encoding)

print("✅ Encoding issues fixed using ftfy!")


✅ Encoding issues fixed using ftfy!


In [28]:
# Replace -1 ratings with 0
df_clean.loc[df_clean['Rating'] == -1, 'Rating'] = 0

print("✅ Fixed: Replaced -1 ratings with 0.")

✅ Fixed: Replaced -1 ratings with 0.


In [29]:
# Save the cleaned dataset
df_clean.to_csv("data/tripadvisor_cleaned.csv", index=False)

print("Data cleaning complete. Cleaned file saved as 'tripadvisor_cleaned.csv'.")

Data cleaning complete. Cleaned file saved as 'tripadvisor_cleaned.csv'.


In [30]:
# Check unique values in Price Range
print("Unique Price Range values:")
print(df_clean['Price Range'].unique())

# Check a few examples of Cuisine Style values to understand their format
print("\nSample Cuisine Style values:")
print(df_clean['Cuisine Style'].head())

# Check a few examples of Reviews values to understand their format
print("\nSample Reviews values:")
print(df_clean['Reviews'].head())

Unique Price Range values:
['Moderate', 'Expensive', 'Budget', 'Unknown']
Categories (4, object): ['Budget', 'Expensive', 'Moderate', 'Unknown']

Sample Cuisine Style values:
0                            [French, Dutch, European]
1    [Dutch, European, Vegetarian Friendly, Gluten ...
2    [Mediterranean, French, International, Europea...
3    [French, European, International, Contemporary...
4    [Dutch, European, International, Vegetarian Fr...
Name: Cuisine Style, dtype: object

Sample Reviews values:
0    [[Just like home, A Warm Welcome to Wintry Ams...
1    [[Great food and staff, just perfect], [01/06/...
2    [[Satisfaction, Delicious old school restauran...
3    [[True five star dinner, A superb evening of f...
4    [[Best meal.... EVER, super food experience], ...
Name: Reviews, dtype: object


In [31]:
# Display updated info
print("Updated Dataset Info:")
print(df_clean.info())

# Show missing values after cleaning
print("\nRemaining missing values:")
print(df_clean.isnull().sum())

Updated Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125527 entries, 0 to 125526
Data columns (total 10 columns):
 #   Column             Non-Null Count   Dtype   
---  ------             --------------   -----   
 0   Name               125527 non-null  object  
 1   City               125527 non-null  category
 2   Cuisine Style      125527 non-null  object  
 3   Ranking            125527 non-null  int32   
 4   Rating             125527 non-null  float64 
 5   Price Range        125527 non-null  category
 6   Number of Reviews  125527 non-null  int32   
 7   Reviews            125527 non-null  object  
 8   URL_TA             125527 non-null  object  
 9   ID_TA              125527 non-null  category
dtypes: category(3), float64(1), int32(2), object(4)
memory usage: 11.5+ MB
None

Remaining missing values:
Name                 0
City                 0
Cuisine Style        0
Ranking              0
Rating               0
Price Range          0
Number of Reviews    

In [32]:
df_clean.describe()

Unnamed: 0,Ranking,Rating,Number of Reviews
count,125527.0,125527.0,125527.0
mean,3549.714022,3.988732,107.888239
std,3580.449242,0.650013,291.776974
min,1.0,0.0,0.0
25%,1051.0,3.5,4.0
50%,2256.0,4.0,22.0
75%,4892.5,4.5,92.0
max,16444.0,5.0,16478.0
