In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
 
 
spark = SparkSession.builder.getOrCreate()
 
 
spark.conf.set(
    "fs.azure.account.key.airbnbpro.dfs.core.windows.net",
    "lVruAoAQKhAWULpLEB3Av5hkYRO/t8huLF/Dw3ADus1RwER1V1YBnfRHST9FYKkcWTtcoecSeQ+p+AStOwPu+g=="
)
 

In [0]:
import requests
import gzip
import shutil
from pathlib import Path
import time
import random
import pandas as pd
from bs4 import BeautifulSoup
import re


class AirbnbScraper:
    def __init__(self):
        self.local_tmp_dir = Path("/tmp/airbnb_downloads")
        self.local_tmp_dir.mkdir(parents=True, exist_ok=True)
        self.output_dir = "abfss://airbnbdata@airbnbpro.dfs.core.windows.net/raw_data"
        self.session = requests.Session()
        self.session.headers.update(
            {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                "Accept-Language": "en-US,en;q=0.5",
            }
        )


    def scrape_city_urls(self):
        print("🌐 Scraping Inside Airbnb website...")
        try:
            response = self.session.get(
                "https://insideairbnb.com/get-the-data/", timeout=30
            )
            response.raise_for_status()
            soup = BeautifulSoup(response.content, "html.parser")
            cities_data = {}
            print("   🔍 Searching for data links...")
            all_links = soup.find_all("a", href=True)
            for link in all_links:
                href = link["href"]
                if "data.insideairbnb.com" in href and "listings.csv.gz" in href:
                    city_data = self.extract_city_from_url(href, link)
                    if city_data and city_data["city_name"] not in cities_data:
                        cities_data[city_data["city_name"]] = city_data
                        print(f"   ✅ Found: {city_data['display_name']}")
            print("   🔍 Searching in page content...")
            text_content = soup.get_text()
            url_pattern = r'https://data\.insideairbnb\.com/[^\s<>"]+?listings\.csv\.gz'
            url_matches = re.findall(url_pattern, text_content)
            for url in url_matches:
                city_data = self.extract_city_from_url(url, None)
                if city_data and city_data["city_name"] not in cities_data:
                    cities_data[city_data["city_name"]] = city_data
                    print(f"   ✅ Found: {city_data['display_name']}")
            print(f"🎯 Found {len(cities_data)} cities with listings data")
            return cities_data
        except Exception as e:
            print(f"❌ Error scraping: {e}")
            return self.get_fallback_urls()


    def extract_city_from_url(self, url, link_element):
        try:
            parts = url.split("/")
            if len(parts) >= 8:
                country = parts[3]
                city = parts[5]
                date = parts[6]
                city_clean = self.clean_filename(city)
                if link_element and link_element.get_text().strip():
                    display_name = link_element.get_text().strip()
                else:
                    display_name = f"{city.replace('-', ' ').title()}, {country.replace('-', ' ').title()}"
                return {
                    "city_name": city_clean,
                    "country_name": country,
                    "display_name": display_name,
                    "listings_url": url,
                    "date": date,
                }
            return None
        except Exception as e:
            print(f"   Error parsing URL {url}: {e}")
            return None


    def clean_filename(self, name):
        clean = re.sub(r"[^a-zA-Z0-9_]", "_", name)
        clean = re.sub(r"_+", "_", clean)
        clean = clean.strip("_")
        return clean.lower()


    def get_fallback_urls(self):
        print("🔄 Using fallback URLs...")
        fallback_cities = {
            "berlin": {
                "city_name": "berlin",
                "country_name": "germany",
                "display_name": "Berlin, Germany",
                "listings_url": "https://data.insideairbnb.com/germany/be/berlin/2025-09-23/data/listings.csv.gz",
                "date": "2025-09-23",
            },
            "london": {
                "city_name": "london",
                "country_name": "uk",
                "display_name": "London, UK",
                "listings_url": "https://data.insideairbnb.com/united-kingdom/england/london/2025-09-23/data/listings.csv.gz",
                "date": "2025-09-23",
            },
            "paris": {
                "city_name": "paris",
                "country_name": "france",
                "display_name": "Paris, France",
                "listings_url": "https://data.insideairbnb.com/france/ile-de-france/paris/2025-09-23/data/listings.csv.gz",
                "date": "2025-09-23",
            },
            "new_york": {
                "city_name": "new_york",
                "country_name": "usa",
                "display_name": "New York, USA",
                "listings_url": "https://data.insideairbnb.com/united-states/ny/new-york-city/2025-09-23/data/listings.csv.gz",
                "date": "2025-09-23",
            },
            "los_angeles": {
                "city_name": "los_angeles",
                "country_name": "usa",
                "display_name": "Los Angeles, USA",
                "listings_url": "https://data.insideairbnb.com/united-states/ca/los-angeles/2025-09-23/data/listings.csv.gz",
                "date": "2025-09-23",
            },
            "tokyo": {
                "city_name": "tokyo",
                "country_name": "japan",
                "display_name": "Tokyo, Japan",
                "listings_url": "https://data.insideairbnb.com/japan/kant%C5%8D/tokyo/2025-09-23/data/listings.csv.gz",
                "date": "2025-09-23",
            },
        }
        print(f"🎯 Loaded {len(fallback_cities)} fallback cities")
        return fallback_cities


    def download_city_listings(self, city_data):
        try:
            city_name = city_data["city_name"]
            listings_url = city_data["listings_url"]
            display_name = city_data["display_name"]
            date = city_data["date"]

            filename = f"{city_name}.csv"
            local_output_path = self.local_tmp_dir / filename

            print(f"\n📥 Downloading: {display_name}")
            print(f"    Date: {date}")
            print(f"    URL: {listings_url}")

            if self.download_and_extract_gz(listings_url, local_output_path):
                print(f"✅ Success: {filename}")

                # Read with pandas and add city column
                print("    📊 Reading CSV with pandas...")
                df_pd = pd.read_csv(local_output_path)
                df_pd["city"] = city_name
                print(f"    📈 Loaded {len(df_pd)} rows")

                # Convert to Spark DataFrame
                print("    ⚡ Converting to Spark DataFrame...")
                df_spark = spark.createDataFrame(df_pd)

                # Write to temporary directory first
                temp_dir = f"{self.output_dir}/temp_{city_name}"
                final_path = f"{self.output_dir}/{city_name}.csv"
                
                print("    💾 Writing to temporary directory...")
                # Write as single CSV with proper options
                df_spark.coalesce(1).write.mode("overwrite") \
                    .option("header", True) \
                    .option("quote", '"') \
                    .option("escape", '"') \
                    .option("quoteAll", True) \
                    .csv(temp_dir)
                
                # Find the part file and move it to final location
                print(f"    📦 Finding part file in temp directory...")
                part_files = dbutils.fs.ls(temp_dir)
                csv_part_file = [f for f in part_files if f.name.startswith("part-") and f.name.endswith(".csv")]
                
                if not csv_part_file:
                    print(f"    ❌ No part file found in {temp_dir}")
                    # List what's actually there for debugging
                    print("    Files found:")
                    for f in part_files:
                        print(f"      - {f.name}")
                    return False
                
                part_file = csv_part_file[0]
                print(f"    ✅ Found: {part_file.name}")
                
                # Copy the part file to final destination with clean name
                print(f"    📋 Copying to final location: {final_path}")
                dbutils.fs.cp(part_file.path, final_path)
                
                # Clean up temporary directory
                print(f"    🗑️  Cleaning up temp directory...")
                dbutils.fs.rm(temp_dir, recurse=True)

                print(f"    ✅ Saved to: {final_path}")
                
                # Verify the file exists
                try:
                    file_info = dbutils.fs.ls(final_path)
                    print(f"    ✓ Verified: File exists ({file_info[0].size / (1024*1024):.2f} MB)")
                except:
                    print(f"    ⚠️  Warning: Could not verify file")
                
                return True
            else:
                print(f"❌ Failed: {city_name}")
                return False

        except Exception as e:
            print(f"❌ Error with {city_name}: {e}")
            import traceback
            traceback.print_exc()
            
            # Cleanup temp directory if it exists
            try:
                temp_dir = f"{self.output_dir}/temp_{city_name}"
                dbutils.fs.rm(temp_dir, recurse=True)
                print(f"    🗑️  Cleaned up temp directory after error")
            except:
                pass
            
            return False


    def download_and_extract_gz(self, url, output_path):
        try:
            delay = random.uniform(3, 6)
            print(f"    ⏳ Waiting {delay:.1f} seconds...")
            time.sleep(delay)
            print("    🌐 Sending request...")
            with requests.Session() as download_session:
                download_session.headers.update(
                    {
                        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                        "Referer": "https://insideairbnb.com/",
                        "Accept": "*/*",
                    }
                )
                response = download_session.get(url, stream=True, timeout=60)
                if response.status_code != 200:
                    print(f"    ❌ HTTP {response.status_code}: {response.reason}")
                    return False
                total_size = int(response.headers.get("content-length", 0))
                downloaded_size = 0
                file_size_mb = (
                    total_size / (1024 * 1024) if total_size > 0 else "unknown"
                )
                print(f"    📦 Downloading file ({file_size_mb:.1f} MB)...")
                temp_gz_path = self.local_tmp_dir / "temp_listings.gz"
                with open(temp_gz_path, "wb") as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
                            downloaded_size += len(chunk)
                            if total_size > 0:
                                percent = (downloaded_size / total_size) * 100
                                print(f"    Progress: {percent:.1f}%", end="\r")
                print("\n    ✅ Download completed!")
                print("    📦 Extracting file...")
                with gzip.open(temp_gz_path, "rb") as f_in:
                    with open(output_path, "wb") as f_out:
                        shutil.copyfileobj(f_in, f_out)
                temp_gz_path.unlink()
                if output_path.exists():
                    file_size = output_path.stat().st_size / (1024 * 1024)
                    print(f"    💾 Saved to: {output_path}")
                    print(f"    📊 File size: {file_size:.2f} MB")
                    return True
                return False
        except Exception as e:
            print(f"    ❌ Download error: {e}")
            temp_gz_path = self.local_tmp_dir / "temp_listings.gz"
            if temp_gz_path.exists():
                temp_gz_path.unlink()
            return False


def main():
    scraper = AirbnbScraper()
    print("🏠 Airbnb Data Scraper")
    print(f"📂 Output: {scraper.output_dir}")
    print("🔍 Scanning website for city data...\n")
    
    cities_data = scraper.scrape_city_urls()
    
    if not cities_data:
        print("❌ No cities found.")
        return
    
    print("\n📋 Available cities:")
    city_list = list(cities_data.items())
    for i, (city_name, city_info) in enumerate(city_list, 1):
        print(f"  {i}. {city_info['display_name']} ({city_info['date']})")
    print(f"  {len(city_list) + 1}. 🚀 DOWNLOAD ALL CITIES")
    
    try:
        choice = input("\n👉 Choose cities (e.g., 1 or 1,2,3 or all): ").strip()
        successful_downloads = 0
        total_attempts = 0
        downloaded_files = []
        
        if choice == str(len(city_list) + 1) or choice.lower() == "all":  # DOWNLOAD ALL
            print(f"\n🚀 Downloading ALL {len(city_list)} cities...")
            for city_name, city_data in city_list:
                total_attempts += 1
                if scraper.download_city_listings(city_data):
                    successful_downloads += 1
                    downloaded_files.append(f"{city_name}.csv")
                time.sleep(random.uniform(5, 8))
        else:  # Specific cities
            choices = [int(x.strip()) for x in choice.split(",")]
            for choice_num in choices:
                if 1 <= choice_num <= len(city_list):
                    city_name, city_data = city_list[choice_num - 1]
                    total_attempts += 1
                    if scraper.download_city_listings(city_data):
                        successful_downloads += 1
                        downloaded_files.append(f"{city_name}.csv")
                    time.sleep(random.uniform(5, 8))
                else:
                    print(f"❌ Invalid choice: {choice_num}")
        
        print(f"\n{'=' * 50}")
        print("📊 DOWNLOAD SUMMARY")
        print(f"{'=' * 50}")
        print(f"✅ Successful: {successful_downloads}/{total_attempts}")
        print(f"📂 Location: {scraper.output_dir}")
        if downloaded_files:
            print("📄 Files created:")
            for file in downloaded_files:
                print(f"   • {file}")
        print(f"{'=' * 50}")
        
    except KeyboardInterrupt:
        print("\n\n⚠️  Download interrupted by user")
    except Exception as e:
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()


🏠 Airbnb Data Scraper
📂 Output: abfss://airbnbdata@airbnbpro.dfs.core.windows.net/raw_data
🔍 Scanning website for city data...

🌐 Scraping Inside Airbnb website...
   🔍 Searching for data links...
   ✅ Found: listings.csv.gz
   ✅ Found: listings.csv.gz
   ✅ Found: listings.csv.gz
   ✅ Found: listings.csv.gz
   ✅ Found: listings.csv.gz
   ✅ Found: listings.csv.gz
   ✅ Found: listings.csv.gz
   ✅ Found: listings.csv.gz
   ✅ Found: listings.csv.gz
   ✅ Found: listings.csv.gz
   ✅ Found: listings.csv.gz
   ✅ Found: listings.csv.gz
   ✅ Found: listings.csv.gz
   ✅ Found: listings.csv.gz
   ✅ Found: listings.csv.gz
   ✅ Found: listings.csv.gz
   ✅ Found: listings.csv.gz
   ✅ Found: listings.csv.gz
   ✅ Found: listings.csv.gz
   ✅ Found: listings.csv.gz
   ✅ Found: listings.csv.gz
   ✅ Found: listings.csv.gz
   ✅ Found: listings.csv.gz
   ✅ Found: listings.csv.gz
   ✅ Found: listings.csv.gz
   ✅ Found: listings.csv.gz
   ✅ Found: listings.csv.gz
   ✅ Found: listings.csv.gz
   ✅ Found: listing


👉 Choose cities (e.g., 1 or 1,2,3 or all):  119


🚀 Downloading ALL 118 cities...

📥 Downloading: listings.csv.gz
    Date: 2025-10-05
    URL: https://data.insideairbnb.com/united-states/ny/albany/2025-10-05/data/listings.csv.gz
    ⏳ Waiting 3.9 seconds...
    🌐 Sending request...
    📦 Downloading file (0.2 MB)...
    Progress: 3.6%    Progress: 7.2%    Progress: 10.8%    Progress: 14.3%    Progress: 17.9%    Progress: 21.5%    Progress: 25.1%    Progress: 28.7%    Progress: 32.3%    Progress: 35.9%    Progress: 39.4%    Progress: 43.0%    Progress: 46.6%    Progress: 50.2%    Progress: 53.8%    Progress: 57.4%    Progress: 61.0%    Progress: 64.5%    Progress: 68.1%    Progress: 71.7%    Progress: 75.3%    Progress: 78.9%    Progress: 82.5%    Progress: 86.1%    Progress: 89.6%    Progress: 93.2%    Progress: 96.8%    Progress: 100.0%
    ✅ Download completed!
    📦 Extracting file...
    💾 Saved to: /tmp/airbnb_downloads/albany.csv
    📊 File size: 1.03 MB
✅ Success: albany.csv
    📊 Reading CSV with 

Traceback (most recent call last):
  File "/home/spark-a2a45b4a-1ba9-462d-90c6-22/.ipykernel/2534/command-6159821325050266-1257879295", line 168, in download_city_listings
    df_spark = spark.createDataFrame(df_pd)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/databricks/spark/python/pyspark/sql/connect/session.py", line 601, in createDataFrame
    raise PySparkValueError(
pyspark.errors.exceptions.base.PySparkValueError: [CANNOT_INFER_EMPTY_SCHEMA] Can not infer schema from an empty dataset.


    🗑️  Cleaned up temp directory after error

📥 Downloading: listings.csv.gz
    Date: 2025-09-28
    URL: https://data.insideairbnb.com/belize/bz/belize/2025-09-28/data/listings.csv.gz
    ⏳ Waiting 4.8 seconds...
    🌐 Sending request...
    📦 Downloading file (1.6 MB)...
    Progress: 0.5%    Progress: 0.9%    Progress: 1.4%    Progress: 1.9%    Progress: 2.4%    Progress: 2.8%    Progress: 3.3%    Progress: 3.8%    Progress: 4.3%    Progress: 4.7%    Progress: 5.2%    Progress: 5.7%    Progress: 6.2%    Progress: 6.6%    Progress: 7.1%    Progress: 7.6%    Progress: 8.1%    Progress: 8.5%    Progress: 9.0%    Progress: 9.5%    Progress: 10.0%    Progress: 10.4%    Progress: 10.9%    Progress: 11.4%    Progress: 11.8%    Progress: 12.3%    Progress: 12.8%    Progress: 13.3%    Progress: 13.7%    Progress: 14.2%    Progress: 14.7%    Progress: 15.2%    Progress: 15.6%    Progress: 16.1%    Progress: 16.6%    Progress: 17.1%    Progress: 17.5%    

  df_pd = pd.read_csv(local_output_path)


    📈 Loaded 16822 rows
    ⚡ Converting to Spark DataFrame...
    💾 Writing to temporary directory...
    📦 Finding part file in temp directory...
    ✅ Found: part-00000-tid-1352542649336607485-b732f505-955f-4d2a-bed0-ce6d72dcf718-153-1-c000.csv
    📋 Copying to final location: abfss://airbnbdata@airbnbpro.dfs.core.windows.net/raw_data/broward_county.csv
    🗑️  Cleaning up temp directory...
    ✅ Saved to: abfss://airbnbdata@airbnbpro.dfs.core.windows.net/raw_data/broward_county.csv
    ✓ Verified: File exists (44.76 MB)

📥 Downloading: listings.csv.gz
    Date: 2025-06-21
    URL: https://data.insideairbnb.com/belgium/bru/brussels/2025-06-21/data/listings.csv.gz
    ⏳ Waiting 3.4 seconds...
    🌐 Sending request...
    📦 Downloading file (3.0 MB)...
    Progress: 0.3%    Progress: 0.5%    Progress: 0.8%    Progress: 1.0%    Progress: 1.3%    Progress: 1.5%    Progress: 1.8%    Progress: 2.1%    Progress: 2.3%    Progress: 2.6%    Progress: 2.8%    Progress: 3.1%    Prog

  df_pd = pd.read_csv(local_output_path)


    📈 Loaded 8274 rows
    ⚡ Converting to Spark DataFrame...
    💾 Writing to temporary directory...
    📦 Finding part file in temp directory...
    ✅ Found: part-00000-tid-1182526601415564979-7685fe6e-1b66-43ce-8092-09a8214411e3-473-1-c000.csv
    📋 Copying to final location: abfss://airbnbdata@airbnbpro.dfs.core.windows.net/raw_data/munich.csv
    🗑️  Cleaning up temp directory...
    ✅ Saved to: abfss://airbnbdata@airbnbpro.dfs.core.windows.net/raw_data/munich.csv
    ✓ Verified: File exists (16.20 MB)

📥 Downloading: listings.csv.gz
    Date: 2025-06-19
    URL: https://data.insideairbnb.com/italy/campania/naples/2025-06-19/data/listings.csv.gz
    ⏳ Waiting 6.0 seconds...
    🌐 Sending request...
    📦 Downloading file (4.9 MB)...
    Progress: 0.2%    Progress: 0.3%    Progress: 0.5%    Progress: 0.6%    Progress: 0.8%    Progress: 1.0%    Progress: 1.1%    Progress: 1.3%    Progress: 1.4%    Progress: 1.6%    Progress: 1.8%    Progress: 1.9%    Progress: 2.1%    P

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col, udf, when
from pyspark.sql.types import StringType, DoubleType
import gender_guesser.detector as gender_detector

# Initialize gender detector
gd = gender_detector.Detector()
def predict_gender(name):
    if name is None or name == "NULL" or name == "" or name == "null":
        return "unknown"
    try:
        first_name = str(name).split()[0] if " " in str(name) else str(name)
        result = gd.get_gender(first_name)
        if result in ["male", "mostly_male"]:
            return "male"
        elif result in ["female", "mostly_female"]:
            return "female"
        else:
            return "unknown"
    except:
        return "unknown"
gender_udf = udf(predict_gender, StringType())

def safe_cast_to_double(value):
    if value is None or value == "NULL" or value == "null" or str(value).strip() == "":
        return 0.0
    try:
        cleaned = str(value).replace("$", "").replace(",", "")
        return float(cleaned)
    except:
        return 0.0
safe_cast_udf = udf(safe_cast_to_double, DoubleType())

INPUT_DIR = "abfss://airbnbdata@airbnbpro.dfs.core.windows.net/raw_data"
OUTPUT_DIR = "abfss://airbnbdata@airbnbpro.dfs.core.windows.net/cleaned_data_New"
OUTPUT_FILE = f"{OUTPUT_DIR}/airbnb_data.csv"

spark = SparkSession.builder.getOrCreate()
print("🏠 Airbnb Data Cleaning Pipeline")
print("=" * 60)

try:
    print("📥 Reading CSV files...")
    df_raw = (
        spark.read
        .option("header", "true")
        .option("escape", '"')
        .option("multiline", "true")
        .option("mode", "DROPMALFORMED")
        .csv(f"{INPUT_DIR}/*.csv")
    )
    print(f"📊 Found {len(df_raw.columns)} columns. Dropping text columns...")

    # === DROP PROBLEMATIC COLUMNS FIRST ===
    cols_to_drop = [
        "id",
        "scrape_id",
        "last_scraped",
        "source",
        "host_thumbnail_url",
        "host_picture_url",
        "host_verifications",
        "host_has_profile_pic",
        "neighbourhood",
        "neighbourhood_group_cleansed",
        "calendar_last_scraped",
        "calendar_updated",
        "license",
        "amenities"
    ]
    cols_to_drop_existing = [c for c in cols_to_drop if c in df_raw.columns]
    print(f"   Dropping columns: {cols_to_drop_existing}")
    df_cleaned = df_raw.drop(*cols_to_drop_existing)
    print(f"✅ Remaining columns: {len(df_cleaned.columns)}")

    # Now, continue with your cleaning pipeline
    if "host_name" in df_cleaned.columns:
        df_cleaned = df_cleaned.withColumn("gender", gender_udf(col("host_name")))
    else:
        df_cleaned = df_cleaned.withColumn("gender", F.lit("unknown"))

    if "name" in df_cleaned.columns:
        df_cleaned = df_cleaned.withColumn("name", when(
            (col("name").isNull()) | (col("name").isin("NULL", "null", "")),
            "Unknown Listing"
        ).otherwise(col("name")))

    if "price" in df_cleaned.columns:
        df_cleaned = df_cleaned.withColumn(
            "price_clean", F.regexp_replace(col("price"), "[$,]", "").cast(DoubleType())
        )
        df_cleaned = df_cleaned.withColumn(
            "price", when(col("price_clean").isNull(), 0.0).otherwise(col("price_clean"))
        ).drop("price_clean")

    if "has_availability" in df_cleaned.columns:
        df_cleaned = df_cleaned.withColumn(
            "has_availability",
            when(
                (col("has_availability").isNull()) | (col("has_availability").isin("NULL", "null")),
                "t"
            ).otherwise(col("has_availability"))
        )

    numeric_columns = [
        "availability_eoy", "estimated_occupancy_l365d", "number_of_reviews_ly",
        "review_scores_rating", "review_scores_accuracy", "review_scores_cleanliness",
        "review_scores_checkin", "review_scores_communication", "review_scores_location",
        "review_scores_value"
    ]
    for column in numeric_columns:
        if column in df_cleaned.columns:
            df_cleaned = df_cleaned.withColumn(column, safe_cast_udf(col(column)))

    date_columns = ["first_review", "last_review"]
    for column in date_columns:
        if column in df_cleaned.columns:
            df_cleaned = df_cleaned.withColumn(column, when(
                (col(column).isNull()) | (col(column) == "NULL"),
                "2010-01-01"
            ).otherwise(col(column)))

    print("\n🧹 Step 5: Data cleaning summary...")
    final_count = df_cleaned.count()
    print(f"   Total rows: {final_count:,}")
    print(f"   Total columns: {len(df_cleaned.columns)}")

    df_cleaned.show(10, truncate=True)
    print(f"\n💾 Saving cleaned data to {OUTPUT_FILE}...")

    temp_dir = f"{OUTPUT_DIR}/temp_airbnb_data"
    df_cleaned.coalesce(1).write.mode("overwrite") \
        .option("header", True) \
        .option("quote", '"') \
        .option("escape", '"') \
        .option("quoteAll", True) \
        .csv(temp_dir)

    part_files = dbutils.fs.ls(temp_dir)
    csv_part_file = [f for f in part_files if f.name.startswith("part-") and f.name.endswith(".csv")]
    if csv_part_file:
        part_file = csv_part_file[0]
        dbutils.fs.cp(part_file.path, OUTPUT_FILE)
        dbutils.fs.rm(temp_dir, recurse=True)
        print(f"   ✓ Saved file: {OUTPUT_FILE}")
    else:
        print("❌ Error: No part file found after write.")

    print("\n✅ SUCCESS! Cleaned file saved, columns will not be shifted.")
    print("👉 Load with .option('multiline', 'true') when reading.")

except Exception as e:
    print(f"\n❌ Error: {str(e)}")
    import traceback
    traceback.print_exc()


🏠 Airbnb Data Cleaning Pipeline
📥 Reading CSV files...
📊 Found 80 columns. Dropping text columns...
   Dropping columns: ['id', 'scrape_id', 'last_scraped', 'source', 'host_thumbnail_url', 'host_picture_url', 'host_verifications', 'host_has_profile_pic', 'neighbourhood', 'neighbourhood_group_cleansed', 'calendar_last_scraped', 'calendar_updated', 'license', 'amenities']
✅ Remaining columns: 66

🧹 Step 5: Data cleaning summary...
   Total rows: 1,502,741
   Total columns: 67
+--------------------+--------------------+--------------------+---------------------+--------------------+-------+--------------------+------------+----------+--------------------+--------------------+------------------+------------------+--------------------+-----------------+--------------------+-------------------+-------------------------+----------------------+----------------------+--------+---------+--------------------+---------------+------------+---------+--------------+--------+----+-----+--------------+

In [0]:
from pyspark.sql import SparkSession
import pandas as pd

spark = SparkSession.builder.getOrCreate()

# Read data
df = spark.read.option("header", "true").option("multiline", "true").csv(
    "abfss://airbnbdata@airbnbpro.dfs.core.windows.net/cleaned_data_New/*.csv"
)

# Convert to Pandas and display ALL columns
pdf = df.limit(50).toPandas()
display(pdf)


listing_url,name,description,neighborhood_overview,picture_url,host_id,host_url,host_name,host_since,host_location,host_about,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_neighbourhood,host_listings_count,host_total_listings_count,host_identity_verified,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bathrooms_text,bedrooms,beds,price,minimum_nights,maximum_nights,minimum_minimum_nights,maximum_minimum_nights,minimum_maximum_nights,maximum_maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm,has_availability,availability_30,availability_60,availability_90,availability_365,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,availability_eoy,number_of_reviews_ly,estimated_occupancy_l365d,estimated_revenue_l365d,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,city,gender
https://www.airbnb.com/rooms/13913,Holiday London DB Room Let-on going,"My bright double bedroom with a large window has a relaxed feeling! It comfortably fits one or two and is centrally located just two blocks from Finsbury Park. Enjoy great restaurants in the area and easy access to easy transport tubes, trains and buses. Babies and children of all ages are welcome.","Finsbury Park is a friendly melting pot community composed of Turkish, French, Spanish, Middle Eastern, Irish and English families. We have a wonderful variety of international restaurants directly under us on Stroud Green Road. And there are many shops and large Tescos supermarket right next door. But you can also venture up to Crouch End and along Greens Lanes where there will endless choice of Turkish and Middle Eastern cuisines.s",https://a0.muscache.com/pictures/miso/Hosting-13913/original/d755aa6d-cebb-4464-80be-2722c921e8d5.jpeg,54730,https://www.airbnb.com/users/show/54730,Alina,2009-11-16,"London, United Kingdom",I am a Multi-Media Visual Artist and Creative Practitioner in Education. I live in London England with a Greek/Canadian origins and work internationally. I love everything there is to be enjoyed in life and travel is on top of my list!,within a few hours,100%,96%,t,LB of Islington,2.0,5.0,t,Islington,51.56861,-0.1127,Private room in rental unit,Private room,1,1.0,1 shared bath,1.0,1.0,70.0,1,29,1.0,1.0,29.0,29.0,1.0,29.0,t,21,39,69,331,55,10,1,74.0,8.0,92.0,6440.0,2010-08-18,2025-08-21,4.85,4.8,4.8,4.81,4.87,4.78,4.78,f,2,1,1,0,0.3,london,female
https://www.airbnb.com/rooms/43129,Quiet Comfortable Room in Fulham,"Munster Village is a colourful collection of shops & restaurants plus all amnities at the end of our quiet, safe street. There are Thai, Italian, French & English restaurants to choose from plus antique shops to browse in & easy access to the centre","The area is called Munster village. It has a collection of interesting shops, vintage clothes, antique shops plus amenities such as chemist, launderette, supermarket, hairdresser, dentist, walk in medical centre, park, swimming pool, cinemas and a wealth of lovely restaurants,",https://a0.muscache.com/pictures/d90415e3-9d9f-465c-adbb-ee440e61eb71.jpg,188138,https://www.airbnb.com/users/show/188138,Sylvan,2010-08-01,"London, United Kingdom",,within an hour,100%,97%,t,Fulham,3.0,3.0,t,Hammersmith and Fulham,51.48164,-0.21082,Private room in townhouse,Private room,2,1.0,1 shared bath,1.0,3.0,48.0,3,1125,3.0,3.0,1125.0,1125.0,3.0,1125.0,t,10,40,64,244,266,22,2,81.0,10.0,202.0,9696.0,2013-07-13,2025-08-31,4.74,4.83,4.74,4.89,4.81,4.66,4.73,f,3,0,3,0,1.79,london,unknown
https://www.airbnb.com/rooms/73431,2 bed/2 bath luxury flat with city view!,The apartment is on the top floor of a portered building. From the apartment there are amazing views of both the city of London and the hills of the south.,"Brixton is a world unto itself, a volatile slice of London’s multiculturalism and a vibrant hotbed of food, drink and music. Supermarkets Sainsbury’s Local A small version of a Sainsbury’s supermarket is located beside the tube station on Brixton Road, exit the tube and turn right. There is another, generally with fewer customers, further down the same road, also on the right. Tesco Express If you walk to and from Loughborough Junction station there is a small Tesco on Coldharbour Lane just 200m from the station. Tesco Metro A full size supermarket is on Acre Lane, turn right out of the main gates, continue across the main intersection with the town hall and tower, Tesco is on the left with a large carpark out the front, approximately 10 minute walk. Pharmacy 449 Brixton Road, Brixton SW9 8HH There is a large Boots chemist at Brixton tube selling all the personal essentials. They also have a pharmacy and trained medi",https://a0.muscache.com/pictures/miso/Hosting-73431/original/cbc4f342-33ec-4230-94ca-4095dc23e574.jpeg,383525,https://www.airbnb.com/users/show/383525,Emi,2011-02-09,"London, United Kingdom","Hello, my name is Emi and I am a 37 year old professional guy . I have been living in London for over 10 years and I am a senior property manager. The apartment advertised is my home, which I share with my partner, who is a senior consultant for a well known international consulting firm. In the past years we have been furnishing it to the highest standards with most of the modern amenities. We look forward to welcoming you in our house and show you one of the most vibrant central area of London.",within an hour,100%,100%,f,Stockwell,1.0,1.0,t,Lambeth,51.46161,-0.11023,Entire condo,Entire home/apt,6,2.0,2 baths,2.0,3.0,117.0,3,120,3.0,6.0,120.0,120.0,3.1,120.0,t,0,0,0,17,110,5,1,0.0,17.0,46.0,5382.0,2013-11-07,2025-08-25,4.9,4.88,4.92,4.9,4.87,4.72,4.78,t,1,1,0,0,0.76,london,female
https://www.airbnb.com/rooms/15400,Bright Chelsea Apartment. Chelsea!,"Lots of windows and light. St Luke's Gardens are at the end of the block, and the river not too far the other way. Ten minutes walk if you go slowly. Buses to everywhere round the corner and shops, restaurants, pubs, the cinema and once again Waitrose .",It is Chelsea.,https://a0.muscache.com/pictures/428392/462d262a_original.jpg,60302,https://www.airbnb.com/users/show/60302,Philippa,2009-12-05,"Royal Borough of Kensington and Chelsea, United Kingdom","English, grandmother, I have travelled quite a lot. I love being in different countries, as long as they are warm!.  I enjoy preparing a clean comfortable flat with plenty of empty cupboards, as I would like to find things myself.  Nowadays I like to travel by sea so it is a challenge getting to where I would like to go. But that is the fun of it!",,,50%,f,Chelsea,1.0,1.0,t,Kensington and Chelsea,51.4878,-0.16813,Entire rental unit,Entire home/apt,2,1.0,1 bath,1.0,1.0,149.0,4,30,4.0,4.0,30.0,30.0,4.0,30.0,t,0,0,0,199,97,1,0,0.0,2.0,9.0,1341.0,2009-12-21,2025-04-05,4.8,4.86,4.87,4.88,4.84,4.93,4.74,f,1,1,0,0,0.51,london,female
https://www.airbnb.com/rooms/43202,Beautiful 1 bed apt in Queens Park,"A beautiful, light-filled, mid-century design inspired apartment in an attractive modern apartment block with great views across London. Perfect for couples or for a business traveller wanting a place to stay within easy reach of town and the world famous Portobello market. No parking permit available. Only on-street parking. See photo of parking restrictions in the gallery.","It's a really safe and friendly neighbourhood. The apartment is located close to many shops, great pubs, and restaurants. There's a small supermarket conveniently just one minute away. It's also a five-minute walk from the beautiful Queen's Park and 10 minutes from world-famous Portobello Road.",https://a0.muscache.com/pictures/miso/Hosting-43202/original/bafc7a6b-1518-44b3-955d-ea7278240d8c.jpeg,188559,https://www.airbnb.com/users/show/188559,Calypso,2010-08-02,"London, United Kingdom","From New Zealand originally, I'm a friendly and easy-going person. Like most kiwis I love to travel and enjoy nothing better than seeing new corners of the world and meeting new people. I also love my time at home though and have created a modern and comfortable space that reflects my interest in mid-century design and art. I love to cook and often have friends over for dinner. I get a lot of pleasure from gardening and have a little garden on the balcony I like to take care of.",within an hour,100%,100%,t,LB of Brent,1.0,1.0,t,Brent,51.53031,-0.21713,Entire rental unit,Entire home/apt,2,,1 bath,1.0,,0.0,5,30,5.0,5.0,1125.0,1125.0,5.0,1125.0,t,12,18,18,18,145,11,1,18.0,10.0,110.0,,2012-08-07,2025-08-20,4.88,4.9,4.82,4.92,4.94,4.67,4.79,f,1,1,0,0,0.91,london,unknown
https://www.airbnb.com/rooms/78606,Peaceful Room Near Epping Forest,"Comfortable, modern home with a friendly host to offer handful hints if needed.",Village feel with a high st with 24hr Tescos not far from the 24hr Leytonstone tube station,https://a0.muscache.com/pictures/4365248/9ccaa0c0_original.jpg,422362,https://www.airbnb.com/users/show/422362,Nicola,2011-03-04,"London, United Kingdom","- Loves to travel whenever I get the chance, enjoys listening to great music, socialising and cooking.",,,,f,LB of Waltham Forest,1.0,3.0,t,Waltham Forest,51.57322,0.01045,Private room in rental unit,Private room,2,1.0,1 private bath,1.0,0.0,106.0,2,90,2.0,2.0,90.0,90.0,2.0,90.0,t,30,60,90,365,2,0,0,109.0,0.0,0.0,0.0,2019-09-18,2019-09-26,5.0,5.0,5.0,5.0,5.0,5.0,5.0,f,1,0,1,0,0.03,london,female
https://www.airbnb.com/rooms/17402,Very Central Modern 3-Bed/2 Bath By Oxford St W1,"You'll have a great time in this beautiful, clean, modern, well-equipped contemporary and recently refurbished Fitzroy Street 3-bed / 2-bath, right in the heart of Central London (W1T 4BL). It is fully furnished with everything ready to provide an enjoyable and comfortable stay. There is free unlimited wifi access and unlimited Netflix. There is woodstrip flooring throughout.","Fitzrovia is a very desirable trendy, arty and foodie top location, in the heart of the West End in Central London. It's a wonderful and safe area with lots going on. Walk to tourist London or use the close by tubes. Head in any direction for a great day out. Amazing choice of restaurants, pubs and supermarkets. The neighbourhood has a villagey feel and some beautiful Georgian architecture. It was was once home to such writers as Virginia Woolf, George Bernard Shaw and Arthur Rimbaud. Today Fitzrovia is a popular and lively neighbourhood with a residential and business mix. The numerous media companies based in the area feed into a vibrant buzz.",https://a0.muscache.com/pictures/39d5309d-fba7-4ecb-8cae-383dcb3b757c.jpg,67564,https://www.airbnb.com/users/show/67564,Liz,2010-01-04,"London, United Kingdom","We are Liz and Jack. We manage a number of holiday let properties in Fitzrovia, central London and Lymington, as well as many long let properties. We look forward to welcoming you one of our super home from homes. We know Fitzrovia well and can thoroughly recommend it for its central London location, great transport links (although you'll probably be mostly on foot as tourist London is very walkable!) and wide choice of bars and restaurants. It's a highly desirable neighbourhood with beautiful architecture and interesting streets. People come from all over London to enjoy some Fitzrovia life. We're lucky enough to be based here and hope to see you here too! When we want to head out of the city, Lymington is our favourite getaway location. You'll love this quaint seaside town on the edge of the beautiful New Forest. We have been letting our properties for over 15 years. We hope we've perfected a warm and professional welcome. Really importantly, you can book with us worry-free. As you'll see from our vast number of reviews, we offer a friendly, guest-focused, helpful and absolutely reliable service. We're very happy to help with questions (before or during your stay) or to tailor any trip to suit your needs so just let us know if there's anything we can do. We look forward to seeing you in London or Lymington!",within an hour,88%,88%,t,Fitzrovia,3.0,16.0,t,Westminster,51.52195,-0.14094,Entire rental unit,Entire home/apt,6,2.0,2 baths,3.0,3.0,411.0,3,365,3.0,3.0,365.0,365.0,3.0,365.0,t,24,54,64,80,56,0,0,78.0,1.0,0.0,0.0,2011-03-21,2024-02-19,4.77,4.83,4.72,4.72,4.72,4.89,4.61,f,2,2,0,0,0.32,london,female
https://www.airbnb.com/rooms/45163,Room with a garden,"Nice room in house in the West Hampstead area - Camden. Very well connected to the town centre (the room is in zone 2, tube jubilee, overground and Thames Link) and the airports (Luton and Gatwick by train, Heathrow by tube and Heathrow express, Stansted by bus to Finchley Road). Very nice high road with easy access to local parks and Hampstead Heath. Nice coffe houses and bakeries as well as restaurants. Kitchen use limited to coffee/tea.","""West Hampstead is, to all intents and purposes, just one street - West End Lane. It is located in the London Borough of Camden, in zone two of the London Underground. It borders Belsize Park/Finchley Road to the east, and Kilburn/Willesden to the west. In the north are Cricklewood and Golders Green, whilst to the south lie uber-refined St. John's Wood and Maida Vale. The area is home to three train stations (all, confusingly, called West Hampstead!) and is a bona fide yuppie stronghold. Though West Hampstead is hardly a place that springs to mind when you think nightlife, it has a smattering of good restaurants, pubs and bars. On the whole, the area's watering holes are towards the swankier (and pricier) end of the spectrum, and places tend to feel more mature feel than, say, Camden Town. In terms of nightclubs, Lately on West End Lane is a charming old-school dancing spot, whilst the Lower Ground Bar (also on West End Lane) flaunts its status as """"West Hampstead's Premi""",https://a0.muscache.com/pictures/ec999674-6b16-4281-931e-8a3378c30a8e.jpg,199530,https://www.airbnb.com/users/show/199530,Deb,2010-08-12,"London, United Kingdom",Love travelling and meet people. Will leave property as found.,,,,f,Hampstead,2.0,2.0,t,Camden,51.55312,-0.1975,Private room in condo,Private room,1,1.0,1 private bath,1.0,1.0,76.0,2,7,3.0,5.0,7.0,7.0,3.9,7.0,t,27,57,87,267,1,0,0,105.0,0.0,0.0,0.0,2021-11-01,2021-11-01,5.0,5.0,5.0,5.0,5.0,5.0,4.0,f,1,0,1,0,0.02,london,female
https://www.airbnb.com/rooms/78892,Bright single room in family home.,"Small, bright, single room in family home, overlooking garden. In quiet road, conveniently close to buses & trains & to East Dulwich, Peckham & Camberwell shops, pubs and restaurants. Close to Kings College Hospital for those working there. Toast, jam, muesli, fruit, milk, tea & real coffee provided. Please request gluten-free or dairy-free in advance.","The house is situated in a quiet road, within easy walking distance of great shops, cafes, bars and pubs in East Dulwich, Camberwell and Peckham. We are 10-15 minutes walk away from Denmark Hill, East Dulwich and Peckham Rye stations. Three swimming pools and gyms are 20 minutes walk away. We are 5 minutes walk from 'Bellenden Village' and Dulwich Hamlet Football Club and 20 minutes walk from Camberwell Art College, South London Gallery and Kings College Hospital.",https://a0.muscache.com/pictures/048db5af-1bce-4016-83d9-4cf14875c34c.jpg,424230,https://www.airbnb.com/users/show/424230,Julia,2011-03-05,"England, United Kingdom",,,,0%,f,Camberwell,1.0,1.0,t,Southwark,51.46478,-0.08007,Private room in home,Private room,1,,1 shared bath,,,0.0,7,35,7.0,7.0,35.0,35.0,7.0,35.0,t,0,0,0,0,59,0,0,0.0,0.0,0.0,,2015-03-20,2023-06-04,4.88,4.93,4.83,4.93,4.95,4.81,4.85,f,1,0,1,0,0.46,london,female
https://www.airbnb.com/rooms/24328,Battersea live/work artist house,"Artist house by SW Battersea Park, bright high ceiling bedroom and separate studio/office, a communal garden, a secure gated car parking space (pls ask as not always available). Bedroom with Superking 180x200cm bed. The other bedroom is used as a studio with desk, chair, and a separate exercise area. Studio could be converted into a bedroom for longer stays when requested using parts of sofa. Please note the bedroom and studio have been swapped around recently, we will update new photos asap.","- Battersea is a quiet family area, easy access to Battersea Park and Clapham Junction/Lavender Hill for restaurants and shops. Walk across the bridge and you will be in Chelsea or Fulham. There are plenty of direct bus routes to central London. - Ideally located if you are in town for events in the Battersea Park (Affordable Art Fair) or the Chelsea Flower Show - Clapham Junction is perfect if you are arriving from Gatwick Airport. Direct bus from South Kensington tube station if arriving from Heathrow Airport or 30min journey by car.",https://a0.muscache.com/pictures/9194b40f-c627-4b57-931a-0b8e1ba58eb7.jpg,41759,https://www.airbnb.com/users/show/41759,Joe,2009-09-28,"London, United Kingdom","I've been using Airbnb for a while now, both as a user and as a host. My partner and I find hotels are often boring and soul-less. What a wonderful way to explore a new place by living like a local. Paolo is an artist and most of the works in our homes are his. I travel often for work in the field of brand marketing. When making a booking request, please share with us a little bit more about yourselves and the purpose of your trip/stay. We rent out our private home so I hope you can understand. Thank you.",within a few hours,100%,11%,f,Battersea,1.0,3.0,t,Wandsworth,51.47072,-0.16266,Entire townhouse,Entire home/apt,2,,1.5 baths,1.0,,0.0,7,1125,7.0,30.0,1125.0,1125.0,7.3,1125.0,t,9,24,24,294,95,1,0,34.0,0.0,14.0,,2010-11-15,2025-07-05,4.9,4.89,4.91,4.9,4.93,4.6,4.65,f,1,1,0,0,0.53,london,male


In [0]:
%pip install gender_guesser

Collecting gender_guesser
  Downloading gender_guesser-0.4.0-py2.py3-none-any.whl.metadata (3.0 kB)
Downloading gender_guesser-0.4.0-py2.py3-none-any.whl (379 kB)
Installing collected packages: gender_guesser
Successfully installed gender_guesser-0.4.0
[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


In [0]:
spark.conf.set(
    "fs.azure.sas.airbnbdata.airbnbpro.blob.core.windows.net",
    "sv=2024-11-04&ss=bfqt&srt=sco&sp=rwdlacupiytfx&se=2025-11-29T01:58:38Z&st=2025-11-24T17:43:38Z&spr=https,http&sig=34nGgovEO3UkYSw0f5JoCafSGRWFO20cNXNZU4327Kk%3D"
)



In [0]:
spark.read.csv(
    "wasbs://airbnbdata@airbnbpro.blob.core.windows.net/cleaned_data_New/airbnb_data.csv",
    header=True,
    inferSchema=True,
    multiLine=True,
    escape='"'
)
# Make sure "fs.azure.sas.airbnbdata.airbnbpro.blob.core.windows.net" = "<YOUR_SAS_TOKEN>"
# is set in Spark configs!



DataFrame[listing_url: string, name: string, description: string, neighborhood_overview: string, picture_url: string, host_id: int, host_url: string, host_name: string, host_since: date, host_location: string, host_about: string, host_response_time: string, host_response_rate: string, host_acceptance_rate: string, host_is_superhost: string, host_neighbourhood: string, host_listings_count: double, host_total_listings_count: double, host_identity_verified: string, neighbourhood_cleansed: string, latitude: double, longitude: double, property_type: string, room_type: string, accommodates: int, bathrooms: double, bathrooms_text: string, bedrooms: double, beds: double, price: double, minimum_nights: int, maximum_nights: int, minimum_minimum_nights: double, maximum_minimum_nights: double, minimum_maximum_nights: double, maximum_maximum_nights: double, minimum_nights_avg_ntm: double, maximum_nights_avg_ntm: double, has_availability: string, availability_30: int, availability_60: int, availabil

In [0]:
from flask import Flask, request, jsonify
import requests  # For calling Gemini API

app = Flask(__name__)

GEMINI_API_URL = "https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent"
GEMINI_API_KEY = "AIzaSyCtIqnFZFioIC6c4Ch-tCK5MOGENBlroTs"  # Set in Azure securely, never hard-coded in production

@app.route("/recommend", methods=["POST"])
def recommend():
    data = request.json
    gender = data.get("gender", "")
    city = data.get("city", "")
    age = data.get("age", "")
    hosts = data.get("hosts", "")

    prompt = (
        f"User profile:\n"
        f"Gender: {gender}\n"
        f"City: {city}\n"
        f"Age: {age}\n"
        f"Number of Hosts: {hosts}\n"
        "You are an expert Airbnb consultant. Give a personalized recommendation about how to best participate in Airbnb as a guest or host, tailored to this profile."
    )

    # Gemini API request payload
    gemini_payload = {
        "contents": [
            {"parts": [{"text": prompt}]}
        ]
    }

    gemini_response = requests.post(
        f"{GEMINI_API_URL}?key={GEMINI_API_KEY}",
        json=gemini_payload
    )

    if gemini_response.ok:
        output = gemini_response.json()
        # Gemini structure may differ; extract text wisely
        try:
            recommendation = output["candidates"][0]["content"]["parts"][0]["text"]
        except Exception:
            recommendation = output

        return jsonify({"recommendation": recommendation})
    else:
        return jsonify({"error": "Failed to get recommendation from Gemini"}), 500

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=5000, debug=True)  # Azure prefers 0.0.0.0


 * Serving Flask app '__main__'
 * Debug mode: on


Traceback (most recent call last):
  File "/databricks/python_shell/scripts/db_ipykernel_launcher.py", line 52, in <module>
    main()
  File "/databricks/python_shell/scripts/db_ipykernel_launcher.py", line 48, in main
    DatabricksKernelApp.launch_instance(config=databricks_kernel_config())
  File "/databricks/python/lib/python3.12/site-packages/traitlets/config/application.py", line 1074, in launch_instance
    app.initialize(argv)
  File "/databricks/python_shell/lib/dbruntime/app.py", line 41, in initialize
    super().initialize(argv=argv)
  File "/databricks/python_shell/lib/dbruntime/app.py", line 31, in initialize
    super().initialize(argv=argv)
  File "/databricks/python/lib/python3.12/site-packages/traitlets/config/application.py", line 118, in inner
    return method(app, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/databricks/python/lib/python3.12/site-packages/ipykernel/kernelapp.py", line 695, in initialize
    self.write_connection_file()
  File 

An exception has occurred, use %tb to see the full traceback.

[0;31mSystemExit[0m[0;31m:[0m 1


In [0]:
pip install dask dash

Collecting dash
  Downloading dash-3.3.0-py3-none-any.whl.metadata (11 kB)
Collecting Flask<3.2,>=1.0.4 (from dash)
  Downloading flask-3.1.2-py3-none-any.whl.metadata (3.2 kB)
Collecting Werkzeug<3.2 (from dash)
  Downloading werkzeug-3.1.3-py3-none-any.whl.metadata (3.7 kB)
Collecting retrying (from dash)
  Downloading retrying-1.4.2-py3-none-any.whl.metadata (5.5 kB)
Collecting blinker>=1.9.0 (from Flask<3.2,>=1.0.4->dash)
  Downloading blinker-1.9.0-py3-none-any.whl.metadata (1.6 kB)
Collecting itsdangerous>=2.2.0 (from Flask<3.2,>=1.0.4->dash)
  Downloading itsdangerous-2.2.0-py3-none-any.whl.metadata (1.9 kB)
Downloading dash-3.3.0-py3-none-any.whl (7.9 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/7.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/7.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/7.9 MB[0m [31m6.2 MB/s