In [None]:
from pyspark.sql.functions import col
from pyspark.sql.functions import regexp_replace

# Preview the raw pin DataFrame before any cleaning is applied.
# NOTE(review): `display` is not defined in this file; presumably it is the
# notebook-provided helper (e.g. Databricks / IPython) - confirm.
display(df_pin)

In [None]:
def add_nulls_to_dataframe_column(dataframe, column, value_to_replace):
    '''
    Replace values in one column that match a SQL LIKE pattern with null.

    Parameters:
    - dataframe (DataFrame): The PySpark DataFrame to transform.
    - column (str): The name of the column in which matching values become null.
    - value_to_replace (str): Pattern handed to Column.like(). SQL LIKE wildcards
      ('%' for any run of characters, '_' for a single character) are honoured,
      so a pattern without wildcards only matches that exact value.

    Returns:
    - DataFrame: A new DataFrame in which matching values of `column` are null;
      every other value in the column is kept as it was.
    '''
    # `when` is not among the names imported at the top of the notebook, so it
    # is imported here to keep the function usable on its own.
    from pyspark.sql.functions import when

    matches_placeholder = col(column).like(value_to_replace)
    # when(..., None) yields null for matching rows; otherwise() keeps the original value.
    return dataframe.withColumn(
        column,
        when(matches_placeholder, None).otherwise(col(column)),
    )

In [None]:
"""
Clean and reshape the pin DataFrame df_pin.

Steps, in the order they run:

1. Null out placeholder entries ("no data" markers) column by column.
   - The column -> LIKE-pattern pairs live in columns_and_values_for_null.

2. Turn the abbreviated 'follower_count' strings into plain digits.
   - 'k' becomes '000' and 'M' becomes '000000'.

3. Cast every column listed in numeric_columns to 'double'.

4. Strip the 'Local save in ' prefix from 'save_location' so only the path is left.

5. Rename 'index' to 'ind'.

6. Keep only the columns named in new_order, in that order.

7. Print the result with df_pin.show().

Note: add_nulls_to_dataframe_column must already be defined (it is not defined in this cell).
"""
# Column name -> SQL LIKE pattern marking an entry that carries no real data.
columns_and_values_for_null = dict(
    description="No description available%",
    follower_count="User Info Error",
    image_src="Image src error.",
    poster_name="User Info Error",
    tag_list="N,o, ,T,a,g,s, ,A,v,a,i,l,a,b,l,e",
    title="No Title Data Available",
)

# Step 1: one pass per column, replacing the placeholder with null.
for key, value in columns_and_values_for_null.items():
    df_pin = add_nulls_to_dataframe_column(df_pin, key, value)

# Step 2: expand the 'k' / 'M' suffixes so follower_count holds digits only.
df_pin = (
    df_pin
    .withColumn("follower_count", regexp_replace("follower_count", "k", "000"))
    .withColumn("follower_count", regexp_replace("follower_count", "M", "000000"))
)

# Step 3: cast the numeric columns.
# NOTE(review): "age" and "downloaded" are cast here but are not listed in
# new_order, so the final select drops them - confirm both exist in df_pin.
numeric_columns = ["age", "downloaded", "follower_count", "index"]
for column in numeric_columns:
    df_pin = df_pin.withColumn(column, col(column).cast("double"))

# Steps 4 and 5: keep only the path in save_location, then rename index -> ind.
df_pin = (
    df_pin
    .withColumn("save_location", regexp_replace("save_location", "Local save in ", ""))
    .withColumnRenamed("index", "ind")
)

# Step 6: final column layout.
new_order = [
    "ind",
    "unique_id",
    "title",
    "description",
    "follower_count",
    "poster_name",
    "tag_list",
    "is_image_or_video",
    "image_src",
    "save_location",
    "category",
]
df_pin = df_pin.select(*new_order)

# Step 7: show the cleaned DataFrame without truncating long values.
df_pin.show(truncate=False)




In [None]:
from pyspark.sql.functions import array, col, to_timestamp
from pyspark.sql.types import ArrayType, DoubleType

"""
Clean and reshape the geographical DataFrame df_geo.

Steps:
1. Create a new column "coordinates" holding an array of latitude and longitude.
2. Drop the "latitude" and "longitude" columns.
3. Convert the "timestamp" column to the timestamp data type.
4. Keep the columns "ind", "country", "coordinates" and "timestamp", in that order.
5. Show the resulting rows and print the schema.
"""
# Create a new column coordinates with an array of latitude and longitude
df_geo = df_geo.withColumn("coordinates", array(col("latitude"), col("longitude")))
# Drop the latitude and longitude columns, now folded into coordinates
df_geo = df_geo.drop("latitude", "longitude")
# Convert the timestamp column into the timestamp data type.
# withColumn returns a new DataFrame, so the result has to be assigned back;
# without the assignment the conversion is silently discarded.
df_geo = df_geo.withColumn("timestamp", to_timestamp("timestamp"))
# Desired column order:
new_order = ["ind", "country", "coordinates", "timestamp"]
# Enforce the column order:
df_geo = df_geo.select(*[col(column) for column in new_order])
# Show the updated DataFrame
df_geo.show(truncate=False)
# Print the schema to confirm the new column types
df_geo.printSchema()


In [None]:
from pyspark.sql.functions import col, concat_ws, to_timestamp

"""
Clean and reshape the user DataFrame df_user.

Steps:
1. Create a "user_name" column by joining 'first_name' and 'last_name' with a space.
2. Drop the 'first_name' and 'last_name' columns.
3. Convert the 'date_joined' column to the timestamp data type.
4. Keep the columns "ind", "user_name", "age" and "date_joined", in that order.
5. Show the resulting rows and print the schema.
"""
# Create a column for the full name
df_user = df_user.withColumn("user_name", concat_ws(" ", "first_name", "last_name"))
# Drop the first_name and last_name columns, now folded into user_name
df_user = df_user.drop("first_name", "last_name")
# Convert the date_joined column into the timestamp data type.
# withColumn returns a new DataFrame, so the result has to be assigned back;
# without the assignment the conversion is silently discarded.
df_user = df_user.withColumn("date_joined", to_timestamp("date_joined"))
# Desired column order:
new_order = ["ind", "user_name", "age", "date_joined"]
# Enforce the column order:
df_user = df_user.select(*[col(column) for column in new_order])
# Show the updated DataFrame
df_user.show(truncate=False)
# Print the schema to confirm the new column types
df_user.printSchema()