In [0]:

# OAuth (service-principal) settings used to reach the ADLS Gen2 storage account.
# The client secret is fetched from a Databricks secret scope at run time so that
# it never appears in notebook source, exports, or revision history.
# NOTE(review): the scope/key names below are placeholders -- create them (or point
# at an existing scope) before running. Any secret that was ever stored in plain
# text in this notebook must be rotated in Azure AD.
configs = {
    "fs.azure.account.auth.type": "OAuth",
    "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    "fs.azure.account.oauth2.client.id": "3afd56df-5d3f-47af-b2df-ebbc344518f3",
    "fs.azure.account.oauth2.client.secret": dbutils.secrets.get(scope="adls-oauth", key="sp-client-secret"),
    "fs.azure.account.oauth2.client.endpoint": "https://login.microsoftonline.com/5950e39e-81d1-45a1-8618-fe39a39b0448/oauth2/token"
}

# Container to mount. Append a directory name to the URI to mount only a sub-folder
# when the account uses hierarchical directories.
source_uri = "abfss://dbyellotaxisource@pavansa28.dfs.core.windows.net/"
# DBFS location under which the container becomes visible.
mount_point = "/mnt/mnypoint"


In [0]:

# Start from a clean slate: drop any existing mount at `mount_point` first.
# This is best-effort -- a failure while checking/unmounting is only reported,
# and the mount below is still attempted.
try:
    mounted_points = [m.mountPoint for m in dbutils.fs.mounts()]
    if mount_point in mounted_points:
        dbutils.fs.unmount(mount_point)
except Exception as e:
    print(f"Unmount check: {e}")

# Mount the container using the OAuth settings from the configuration cell.
# This call runs unguarded, so a mount failure stops the cell here.
dbutils.fs.mount(source=source_uri, mount_point=mount_point, extra_configs=configs)

# Sanity check: show what is visible under the new mount point.
try:
    mount_listing = dbutils.fs.ls(mount_point)
    display(mount_listing)
except Exception as e:
    print(f"Listing failed: {e}")




In [0]:
# List the DatalakeGen2 mount that the CSV output is later written under.
# The path is absolute so it has the same form as the "/mnt/DatalakeGen2/..."
# write target used further down.
# NOTE(review): no cell in this notebook creates the /mnt/DatalakeGen2 mount;
# presumably it is provisioned elsewhere -- confirm.
display(dbutils.fs.ls("/mnt/DatalakeGen2/"))


In [0]:
# Raw yellow-taxi trip file, read directly over abfss.
file_path = "abfss://dbyellotaxisource@pavansa28.dfs.core.windows.net/YellowTaxiTripData.csv"

# Apply the OAuth settings to the Spark session so the direct abfss:// read is
# authenticated. The `configs` dict from the configuration cell holds exactly
# these keys, so it is reused here to keep the credentials defined in one place.
for conf_key, conf_value in configs.items():
    spark.conf.set(conf_key, conf_value)

# header=True: first line supplies the column names.
# inferSchema=True: Spark derives column types from the file contents.
yellowTaxiTripDataDF = spark.read.csv(file_path, header=True, inferSchema=True)

In [0]:
# Render the freshly loaded trip data for a quick visual check before cleaning.
display(yellowTaxiTripDataDF)


In [0]:
# Summary statistics for the two columns that the cleaning step filters on.
summary_columns = ("passenger_count", "trip_distance")
display(yellowTaxiTripDataDF.describe(*summary_columns))

In [0]:
# Remove implausible trips: keep only rows with at least one passenger and a
# positive trip distance. Row counts are printed before and after.
print("Before filter =", yellowTaxiTripDataDF.count())

# Both predicates are SQL expression strings, so this cell does not depend on a
# `pyspark.sql.functions as F` alias (no visible cell in this notebook imports it).
yellowTaxiTripDataDF = (
    yellowTaxiTripDataDF
      .where("passenger_count > 0")
      .where("trip_distance > 0.0")
)

# count() triggers a full Spark job; run it once and reuse the result for both
# messages below, since the DataFrame does not change between them.
filtered_row_count = yellowTaxiTripDataDF.count()
print("After filter =", filtered_row_count)

# Baseline for the null-location filter applied in the following cell.
print("Before filter =", filtered_row_count)


In [0]:

# Remove trips whose pickup or drop-off location id is null.
# The header may spell the columns "PULocationID"/"DOLocationID" or
# "PUlocationID"/"DOlocationID"; the first spelling that is fully present is used.
location_pairs = (
    ("PULocationID", "DOLocationID"),
    ("PUlocationID", "DOlocationID"),
)
cols = set(yellowTaxiTripDataDF.columns)

subset_cols = next(
    (list(pair) for pair in location_pairs if cols.issuperset(pair)),
    None,
)
if subset_cols is None:
    # Neither spelling is complete: fall back to whichever individual names
    # exist (possibly none, in which case the subset is an empty list).
    subset_cols = [name for pair in location_pairs for name in pair if name in cols]

yellowTaxiTripDataDF = yellowTaxiTripDataDF.dropna(subset=subset_cols)

print("After filter =", yellowTaxiTripDataDF.count())

In [0]:

# Section 4: Transformation
#
# Give the location id columns descriptive names:
#   PUlocationID / PULocationID -> PickupLocationId
#   DOlocationID / DOLocationID -> DropLocationId
# Both spellings of each source column are covered; only those actually present
# in the DataFrame are renamed.
rename_map = {
    source_name: target_name
    for target_name, source_names in (
        ("PickupLocationId", ("PUlocationID", "PULocationID")),
        ("DropLocationId", ("DOlocationID", "DOLocationID")),
    )
    for source_name in source_names
}

for source_name, target_name in rename_map.items():
    # The column list is re-read on every pass because a previous rename
    # replaces the DataFrame.
    if source_name not in yellowTaxiTripDataDF.columns:
        continue
    yellowTaxiTripDataDF = yellowTaxiTripDataDF.withColumnRenamed(source_name, target_name)

# Show the resulting schema so the new column names can be verified.
yellowTaxiTripDataDF.printSchema()


In [0]:

# Section 5: Loading Data
#
# Persist the cleaned trips as CSV (with a header row) under the DatalakeGen2
# mount, replacing any previous output at the same path.
# NOTE(review): the dateFormat pattern carries a time component; Spark's CSV
# writer formats timestamp columns via `timestampFormat` -- confirm which option
# is intended.
csv_output_path = "/mnt/DatalakeGen2/ProcessedTaxiData/YellowTaxiData.csv"
yellowTaxiTripDataDF.write.csv(
    csv_output_path,
    mode="overwrite",
    header="true",
    dateFormat="yyyy-MM-dd HH:mm:ss.S",
)


In [0]:

# Persist the same cleaned trips as Parquet, written directly to the target
# container over abfss and replacing any previous output.
# The header/dateFormat options are carried over from the CSV writer; header is
# ignored for Parquet but harmless.
parquet_output_path = "abfss://dbyellowtaxitarget@pavansa28.dfs.core.windows.net/YellowTaxiData.parquet"
parquet_writer = (
    yellowTaxiTripDataDF.write
    .options(header="true", dateFormat="yyyy-MM-dd HH:mm:ss.S")
    .mode("overwrite")
)
parquet_writer.parquet(parquet_output_path)
