In [0]:
def prepare_dataframe(data_location):
    """Load the JSON data found at ``data_location`` into a Spark DataFrame.

    Args:
        data_location: Location passed unchanged to ``spark.read.json``.

    Returns:
        The DataFrame produced by the JSON reader.
    """
    # `spark` is not defined in this function; it is looked up as a global
    # (presumably the notebook-provided session -- confirm in the caller).
    json_reader = spark.read
    return json_reader.json(data_location)

In [0]:
def transform_followers_count(x) -> "int | None":
    """Convert an abbreviated follower count string into an integer.

    A trailing ``k`` multiplies the number by 1000 and a trailing ``M`` by
    1000000; a string without either suffix is parsed as-is.

    Args:
        x: Follower count text such as ``"42"``, ``"5k"``, ``"3M"`` or
            ``"1.5k"``, or ``None``.

    Returns:
        The follower count as an ``int``, or ``None`` when ``x`` is ``None``.

    Raises:
        ValueError: If the numeric part of ``x`` cannot be parsed as a number.
    """
    if x is None:
        # Pass missing values through instead of failing on `.endswith`.
        return None

    multipliers = {'k': 1000, 'M': 1000000}
    scale = multipliers.get(x[-1:], 1)
    number = x if scale == 1 else x[:-1]

    try:
        # Exact integer arithmetic for the common case.
        return scale * int(number)
    except ValueError:
        # Decimal abbreviations such as "1.5k"; round() absorbs float error
        # (e.g. 2.3 * 1000 == 2299.9999999999995).
        return round(scale * float(number))


def clean_pin_dataframe(df_pin):
    """
    Clean the pin posts Spark DataFrame.

    The steps are applied in this order:

    1. Keep only rows whose `unique_id` is exactly 36 characters long.
    2. Keep only rows whose `follower_count` consists solely of digits with an
       optional trailing `k` or `M`, then convert that column to an integer
       with `transform_followers_count`.
    3. Rename the `index` column to `ind`.
    4. Remove the text `Local save in ` from `save_location`.
    5. Select the columns in the order `ind`, `unique_id`, `title`,
       `description`, `follower_count`, `poster_name`, `tag_list`,
       `is_image_or_video`, `image_src`, `save_location`, `category`.
    6. Replace the placeholder strings `''`, `'N/A'`, `'n/a'`, `'none'` and
       `'None'` with nulls.
    7. Drop duplicate rows.

    Args:
        df_pin: Spark DataFrame holding the raw pin data.

    Returns:
        The cleaned Spark DataFrame.
    """
    # Make sure unique ids have the expected length.
    df_pin = df_pin.filter(length(df_pin.unique_id) == 36)

    ### Cleanup and transform `follower_count`
    #### 1. Remove rows with invalid follower count
    # `rlike` matches anywhere in the string, so the pattern is anchored:
    # without `^...$` a value that merely contains a digit would pass the
    # filter and then make the conversion UDF fail in `int()`.
    follower_regex = r'^[0-9]+[kM]?$'
    df_pin = df_pin.filter(df_pin.follower_count.rlike(follower_regex))
    #### 2. Convert *kilo* and *Million* to numeric
    transform_followers_udf = udf(transform_followers_count, IntegerType())
    df_pin = df_pin.withColumn('follower_count', transform_followers_udf('follower_count'))

    ### Rename index column to `ind`
    df_pin = df_pin.withColumnRenamed('index', 'ind')

    ### Cleanup the `save_location` column to keep only the path
    df_pin = df_pin.withColumn('save_location', regexp_replace('save_location', 'Local save in ', ''))

    ### Reorder the columns
    column_order = ['ind', 'unique_id', 'title', 'description', 'follower_count', 'poster_name',
                    'tag_list', 'is_image_or_video', 'image_src', 'save_location', 'category']
    df_pin = df_pin.select(column_order)

    ### Replace empty and not applicable with `None`
    df_pin = df_pin.replace(['', 'N/A', 'n/a', 'none', 'None'], None)

    ### Finally, drop duplicate rows
    return df_pin.drop_duplicates()


In [0]:
def clean_geo_dataframe(df_geo):
    """
    Clean the geolocation Spark DataFrame.

    Builds the result in a single projection:

    - `coordinates` is an array made from the `latitude` and `longitude`
      columns, which are themselves left out of the result.
    - `timestamp` is converted with `to_timestamp`.
    - The output columns are `ind`, `country`, `coordinates`, `timestamp`,
      in that order.

    Args:
        df_geo: Spark DataFrame holding the raw geolocation data.

    Returns:
        The cleaned Spark DataFrame.
    """
    coordinates = array(df_geo.latitude, df_geo.longitude).alias('coordinates')
    parsed_timestamp = to_timestamp(df_geo.timestamp).alias('timestamp')

    # Selecting only the wanted columns both drops latitude/longitude and
    # fixes the column order.
    return df_geo.select('ind', 'country', coordinates, parsed_timestamp)

In [0]:
def clean_user_dataframe(df_user):
    """
    Clean the user Spark DataFrame.

    Builds the result in a single projection:

    - `user_name` is the concatenation of `first_name` and `last_name`, which
      are themselves left out of the result.
    - `date_joined` is converted with `to_timestamp`.
    - The output columns are `ind`, `user_name`, `age`, `date_joined`,
      in that order.

    Args:
        df_user: Spark DataFrame holding the raw user data.

    Returns:
        The cleaned Spark DataFrame.
    """
    # NOTE(review): the two name parts are joined with no separator between
    # them -- confirm that this is the intended `user_name` format.
    full_name = concat(df_user.first_name, df_user.last_name).alias('user_name')
    joined_at = to_timestamp(df_user.date_joined).alias('date_joined')

    # Selecting only the wanted columns both drops first_name/last_name and
    # fixes the column order.
    return df_user.select('ind', full_name, 'age', joined_at)