In [0]:
def replace_empty_with_nones(df):
    """Return ``df`` with known placeholder strings replaced by nulls.

    For each of the columns ``follower_count``, ``image_src`` and
    ``poster_name``, any entry equal to that column's error placeholder
    becomes null; every other entry is left unchanged.
    """
    # (column name, placeholder string that means "no data") pairs.
    placeholders = (
        ("follower_count", "User Info Error"),
        ("image_src", "Image src error"),
        ("poster_name", "User Info Error"),
    )

    cleaned = df
    for column_name, placeholder in placeholders:
        current = F.col(column_name)
        is_placeholder = current == placeholder
        cleaned = cleaned.withColumn(
            column_name,
            F.when(is_placeholder, None).otherwise(current),
        )

    return cleaned

def _expand_follower_count_suffixes(df):
    """Rewrite follower_count values ending in "k" or "M" as plain digit strings.

    Values without either suffix are passed through untouched. The column
    stays a string column; casting to an integer type is left to the caller.
    """
    raw = F.col("follower_count")
    # Everything except the trailing suffix character, as a number.
    prefix = F.expr(
        "substring(follower_count, 1, length(follower_count) - 1)"
    ).cast("double")

    def scaled(factor):
        # Round and cast to long before going back to string. A bare double
        # would be rendered in scientific notation from 10 million upwards
        # (e.g. "1.0E7"), which a subsequent integer cast cannot parse, and
        # fractional prefixes such as "2.3" could be truncated to 2299.
        return F.round(prefix * factor).cast("long").cast("string")

    return df.withColumn(
        "follower_count",
        F.when(raw.endswith("k"), scaled(1000))
        .when(raw.endswith("M"), scaled(1000000))
        .otherwise(raw),
    )


# Cleaned pin data: exact duplicates removed, placeholder values nulled,
# follower_count suffixes expanded, save_location reduced to the path, and
# the "index" column renamed to "ind".
df_pin_cleaned = (
    # Drop rows duplicated across every column (the default subset).
    df_pin_pre_cleaning.dropDuplicates()
        .transform(replace_empty_with_nones)

        # Convert follower_count values with a k/M suffix to plain digits.
        .transform(_expand_follower_count_suffixes)

        # Keep only the save location path: take the text after the last
        # "Local save in" marker and strip the surrounding whitespace that the
        # split leaves behind.
        .withColumn(
            "save_location",
            F.trim(F.element_at(F.split("save_location", "Local save in"), -1)),
        )

        .withColumnRenamed("index", "ind")
)

In [0]:
# Final pin table: numeric columns cast to integers and columns put in their
# final order.
df_pin = (
    df_pin_cleaned
    .withColumn("follower_count", df_pin_cleaned["follower_count"].cast(IntegerType()))
    # Cast "ind" in place. Writing the cast into a separate "index" column
    # would have no effect, because the select below keeps only "ind".
    .withColumn("ind", df_pin_cleaned["ind"].cast(IntegerType()))
    .select(
        "ind",
        "unique_id",
        "title",
        "description",
        "follower_count",
        "poster_name",
        "tag_list",
        "is_image_or_video",
        "image_src",
        "save_location",
        "category",
    )
)

In [0]:
# Geolocation table: rows duplicated across all columns are removed,
# latitude and longitude are packed into one array column, and the
# timestamp column is converted with to_timestamp.
df_geo = (
    df_geo_pre_cleaning
    .dropDuplicates(df_geo_pre_cleaning.columns)
    .select(
        "ind",
        "country",
        F.array(F.col("latitude"), F.col("longitude")).alias("coordinates"),
        F.to_timestamp(F.col("timestamp")).alias("timestamp"),
    )
)

In [0]:
# User table: rows duplicated across all columns are removed, first and last
# names are joined into a single user_name, and date_joined is converted
# with to_timestamp.
df_user = (
    df_user_pre_cleaning
    .dropDuplicates(df_user_pre_cleaning.columns)
    .select(
        "ind",
        F.concat("first_name", F.lit(" "), "last_name").alias("user_name"),
        "age",
        F.to_timestamp(F.col("date_joined")).alias("date_joined"),
    )
)