In [None]:
import os
from pyspark.sql import SparkSession
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace, split,exp, explode
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Location of the Spark distribution. A non-empty SPARK_HOME environment
# variable takes precedence so the notebook is not tied to one machine;
# otherwise fall back to the original local install path.
spark_home = os.environ.get("SPARK_HOME") or "C:/Users/omar/Downloads/spark_unzipped/spark-3.5.1-bin-hadoop3"

In [None]:
#environment settings

def _add_env_path_entry(name, entry, prepend=False):
    """Add `entry` to the os.pathsep-separated environment variable `name`.

    Does nothing when `entry` is already one of the variable's entries, so
    re-running this cell does not keep growing the variable. A missing or
    empty variable is simply set to `entry`.
    """
    current = os.environ.get(name, "")
    if entry in current.split(os.pathsep):
        return
    if not current:
        os.environ[name] = entry
    elif prepend:
        os.environ[name] = entry + os.pathsep + current
    else:
        os.environ[name] = current + os.pathsep + entry


os.environ["SPARK_HOME"] = spark_home

# Add Spark bin and sbin directories to PATH
_add_env_path_entry("PATH", os.path.join(spark_home, "bin"))
_add_env_path_entry("PATH", os.path.join(spark_home, "sbin"))

# Add Spark Python libraries to PYTHONPATH (python/ goes first, python/lib after)
_spark_python_lib = os.path.join(spark_home, "python", "lib")
_add_env_path_entry("PYTHONPATH", os.path.join(spark_home, "python"), prepend=True)
_add_env_path_entry("PYTHONPATH", _spark_python_lib)

# Add the bundled PySpark / py4j source archives to PYTHONPATH. They are Python
# source zips, so PYTHONPATH (not PATH) is where they take effect. The archive
# names are read from the directory instead of being hard-coded because the
# py4j file name carries a version number that differs between Spark releases.
if os.path.isdir(_spark_python_lib):
    for _archive in sorted(os.listdir(_spark_python_lib)):
        if _archive.endswith(".zip"):
            _add_env_path_entry("PYTHONPATH", os.path.join(_spark_python_lib, _archive))

# Python executables PySpark should use for the driver and for the workers
os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'
os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'
os.environ['PYSPARK_PYTHON'] = 'python'

In [None]:
# Build the SparkSession used by the rest of the notebook
# (getOrCreate returns the already-active session when there is one).
spark = (
    SparkSession.builder
    .appName("PySpark-Script")
    .getOrCreate()
)


In [None]:
# Load the Airbnb listings from Parquet into a Spark DataFrame.
# NOTE(review): this is the same path that the CSV-to-Parquet conversion code
# at the bottom of this notebook writes, so that file has to exist before
# this cell runs — confirm the intended execution order.
df = spark.read.parquet("Dataset/Airbnb_Data.parquet")

# Preview the rows (show() prints the first 20 by default), then the column types
df.show()
df.printSchema()

# Total number of rows
print("The count of the data is: ", df.count())



In [None]:


# Turn the "amenities" string column into an array column: strip the curly
# braces and double quotes, then split what is left on commas.
# An empty string splits into a single empty-string element, so '' can show up
# as an "amenity" downstream.
df = df.withColumn(
    "amenities",
    split(regexp_replace(col("amenities"), "[{}\"]", ""), ","),
)

# Preview the first five rows once (the duplicate show() was a redundant Spark job)
df.show(5)

# Print the amenities array of the first row
print(df.select("amenities").first()[0])







In [None]:
# Add a "price" column computed as exp(log_price).
# The "log_price" column is kept; nothing is dropped here.
df = df.withColumn("price", exp(col("log_price")))
df.show(5)

# Second name bound to the same DataFrame object (not a copy)
newdf = df


In [None]:

# Flatten the 'amenities' array column: one output row per array element
amenities_df = df.select(explode(col('amenities')).alias('amenity'))

# Keep each amenity value once
unique_amenities_df = amenities_df.select('amenity').distinct()

# Pull the distinct rows back to the driver as a list of Row objects
unique_amenities_rows = unique_amenities_df.collect()

# Unwrap the Row objects into a plain Python set of amenity strings
unique_amenities_set = {row['amenity'] for row in unique_amenities_rows}

# Show the resulting set
print(unique_amenities_set)

In [None]:
# Size of the set before clean-up
print("The number of unique amenities is: ", len(unique_amenities_set))

# Drop the empty-string entry if it is present. discard() is used instead of
# remove() so the cell does not raise KeyError when '' is absent, which is
# what happens as soon as this cell is run a second time.
unique_amenities_set.discard('')

# Size of the set after clean-up
print("The number of unique amenities is: ", len(unique_amenities_set))

In [None]:
# Distinct values of the 'price' column
prices_df = df.select('price').distinct()

# Pull the distinct rows back to the driver as a list of Row objects
prices_rows = prices_df.collect()

# Unique integer prices. 'price' is computed as exp(log_price) in this notebook,
# so it carries floating-point error: a value such as 149.99999999999997 must
# become 150, not 149. Round to the nearest integer instead of truncating, and
# skip rows whose price is null.
prices = {round(row.price) for row in prices_rows if row.price is not None}

# Display the unique prices
print(prices)

In [None]:
# Report how many distinct integer prices were collected
print("The number of unique prices is: ", len(prices))

# Rebind `prices` to 40 consecutive (lower, upper) bounds, each 50 wide:
# (0, 50), (50, 100), ..., (1950, 2000)
prices = [(lower, lower + 50) for lower in range(0, 2000, 50)]

print(prices)

In [None]:
# Distinct values of the 'city' column
unique_city_df = df.select('city').distinct()

# Collect them to the driver as a plain Python list
unique_cities = [row['city'] for row in unique_city_df.collect()]

# Integer category ids 0 through 40 inclusive (41 values)
price_categories = list(range(41))

print(price_categories)
print(unique_cities)


In [None]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# Convert the raw CSV export to Parquet: read it with pandas, wrap the frame in
# an Arrow table, and write that table out.
# NOTE(review): this rebinds `df` to a pandas DataFrame, shadowing the Spark
# DataFrame that the earlier cells store under the same name — confirm that no
# Spark cell is expected to run after this one.
csv_file_path = "Dataset/Airbnb_Data.csv"
parquet_file_path = "Dataset/Airbnb_Data.parquet"

df = pd.read_csv(csv_file_path)
table = pa.Table.from_pandas(df)
pq.write_table(table, parquet_file_path)

