In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when,  regexp_replace, udf, expr
import pandas as pd
import random

# Obtain the Spark session for this notebook (getOrCreate reuses one if it already exists)
spark = (
    SparkSession.builder
    .appName("Databricks_Catalog_Read")
    .getOrCreate()
)

# Where the source table lives
catalog_name = "workspace"
schema_name = "default"

# Fully-qualified name: <catalog>.<schema>.<table>
table_full_path = ".".join([catalog_name, schema_name, "pinterest_data"])

# Load the table as a Spark DataFrame (no Pandas conversion happens here)
df_pin = spark.read.table(table_full_path)

In [0]:
# Preview the table as loaded, before the cleaning cell below runs.
display(df_pin)

In [0]:
# Regexes for placeholder / error text that should be treated as missing.
# The second element of each pair is the value to substitute (always None here).
replace_patterns = [
    (".*User Info Error.*", None),
    (".*N,o, ,T,a,g,s, ,A,v,a,i,l,a,b,l,e.*", None),
    (".*No description available Story format.*", None),
    (".*No Title Data Available.*", None),
    (".*Image src error.*", None),
    (".*No description available.*",None)
]

# One alternation that matches whenever any individual pattern would, so each
# column needs a single rlike() test instead of one per pattern.
placeholder_regex = "|".join(f"(?:{pattern})" for pattern, _ in replace_patterns)

# Look the dtypes up once rather than rebuilding the mapping for every column.
string_columns = {name for name, dtype in df_pin.dtypes if dtype == "string"}


def _blank_to_null(name):
    """Return column `name` with empty strings and placeholder text replaced by null."""
    value = col(name)
    is_missing = (value == "") | value.rlike(placeholder_regex)
    # A null input makes `is_missing` null, so it falls through to otherwise()
    # and stays null -- the same result the per-pattern loop produced.
    return when(is_missing, None).otherwise(value).alias(name)


# Build every cleaned column in ONE projection. Calling withColumn() repeatedly
# in a loop stacks a projection per call and can produce very large query plans.
# Non-string columns are passed through untouched; column order is preserved.
df_pin = df_pin.select(
    *[
        _blank_to_null(name) if name in string_columns else col(name)
        for name in df_pin.columns
    ]
)


# 1) Discard every row that has a null in any column.
# 2) Strip the literal prefix "Local save in " from 'save_location'.
save_location_trimmed = regexp_replace(col("save_location"), "Local save in ", "")
df_pin = (
    df_pin
    .dropna(how="any")
    .withColumn("save_location", save_location_trimmed)
)

# Convert 'follower_count' from abbreviated text ("892", "10k", "5M") to an integer.
#
# The numeric part is parsed first and then multiplied by the suffix's magnitude.
# Textually swapping "k" for "000" breaks on fractional values: "1.5k" would
# become "1.5000" and cast to 1 instead of 1500.
follower_text = col("follower_count")

# Remove only a trailing k/M; decimal (not double) keeps e.g. 2.3 * 1000 exact.
follower_number = regexp_replace(follower_text, "[kM]$", "").cast("decimal(20,3)")

# Magnitude implied by the suffix; values without a suffix are taken as-is.
follower_multiplier = (
    when(follower_text.endswith("k"), 1000)
    .when(follower_text.endswith("M"), 1000000)
    .otherwise(1)
)

# Final integer column (any fractional remainder is truncated by the cast).
df_pin = df_pin.withColumn(
    "follower_count",
    (follower_number * follower_multiplier).cast("int"),
)

# Output column layout, in order. 'ind' is the renamed 'index' column.
column_order = [
    "ind",
    "unique_id",
    "title",
    "description",
    "follower_count",
    "poster_name",
    "tag_list",
    "is_image_or_video",
    "image_src",
    "save_location",
    "category",
]

df_pin = (
    df_pin
    .withColumnRenamed("index", "ind")              # 'index' -> 'ind'
    .select(*column_order)                          # keep and reorder these columns
    .withColumn("ind", col("ind").cast("bigint"))   # store 'ind' as bigint
)

# Show the cleaned DataFrame
display(df_pin)


In [0]:
%python
# Fully-qualified destination table, built from the catalog/schema configured in
# the first cell so the DROP and the write below can never target different tables.
output_table = f"{catalog_name}.{schema_name}.df_pin"

# Remove any previous version so the write below creates the table from scratch.
spark.sql(f"DROP TABLE IF EXISTS {output_table}")

# Persist the cleaned DataFrame as a managed table.
df_pin.write.mode("overwrite").saveAsTable(output_table)