In [0]:
import pandas as pd
import os

In [0]:
# Folder that is scanned for the Parquet inputs.
parquet_folder = "/dbfs/FileStore/cleanParquet/"

# Placeholder only: this name is reassigned once the per-file frames are
# concatenated further down.
merged_df = pd.DataFrame()

# os.listdir() returns entries in arbitrary order, so sort the matches to keep
# the processing order (and therefore the row order of the concatenated
# result) reproducible from run to run.
parquet_files = sorted(
    os.path.join(parquet_folder, file)
    for file in os.listdir(parquet_folder)
    if file.endswith(".parquet")
)

In [0]:
# Bare expression: shows the list of discovered Parquet paths as the cell output.
parquet_files

In [0]:
# Tag every row of each file with a "game_name" column whose value is the
# file's base name without its extension.
for file in parquet_files:
    game_name = os.path.splitext(os.path.basename(file))[0]
    file_df = pd.read_parquet(file)

    # A scalar is broadcast to every row, so there is no need to build a
    # throwaway list holding one copy of the name per row.
    file_df["game_name"] = game_name

    # NOTE: this overwrites the input file in place, now including the
    # extra column.
    file_df.to_parquet(file, index=False)

In [0]:
# Read every discovered Parquet file into its own DataFrame, preserving the
# order of parquet_files.
dfs = list(map(pd.read_parquet, parquet_files))

# Column names treated as the common set. The author-level entries at the end
# are listed without the "author_" prefix.
common_columns = [
    # review identity and text
    "recommendationid", "review",
    # timestamps
    "timestamp_created", "timestamp_updated",
    # votes and scoring
    "voted_up", "votes_up", "votes_funny", "weighted_vote_score", "comment_count",
    # purchase / access flags
    "steam_purchase", "received_for_free", "written_during_early_access",
    # author-level fields (un-prefixed)
    "steamid", "num_games_owned", "num_reviews", "last_played",
]

# Author fields as they appear when carrying the "author_" prefix.
author_columns = [
    # identity and library stats
    "author_steamid", "author_num_games_owned", "author_num_reviews",
    # playtime figures
    "author_playtime_forever",
    "author_playtime_last_two_weeks",
    "author_playtime_at_review",
    # activity
    "author_last_played",
]

In [0]:
# Strip the "author_" prefix from the author columns of every frame, in place.
for df in dfs:
    for author_col in author_columns:
        if author_col not in df.columns:
            # Nothing to move for this frame.
            continue
        # pop() removes the prefixed column and returns its values, which are
        # then stored under the un-prefixed name (overwriting any existing
        # column of that name).
        df[author_col.replace("author_", "")] = df.pop(author_col)

# Stack all frames row-wise with a fresh index. pd.concat aligns on column
# names and, by default, keeps the union of columns; values missing from a
# frame become NaN.
merged_df = pd.concat(dfs, ignore_index=True)

# Preview the first rows as the cell output.
merged_df.head()

In [0]:
# Map the file-derived tags in "game_name" to display titles. Values without
# an entry in the mapping are left unchanged by Series.replace().
game_titles = {
    "rust_clean": "Rust",
    "phasmophobia_clean": "Phasmophobia",
    "stardew_valley_pos_clean": "Stardew Valley",
    # Assumes the positive and negative Stardew Valley files describe the
    # same game and should share one title.
    "stardew_valley_neg_clean": "Stardew Valley",
    "fallout4_clean": "Fallout 4",
    "dead_by_daylight_clean": "Dead by Daylight",
    "sea_of_thieves_clean": "Sea of Thieves",
    "No_Man's_Sky_clean": "No Man's Sky",
}
merged_df["game_name"] = merged_df["game_name"].replace(game_titles)

# Preview the first rows as the cell output (rich display) instead of
# printing the plain-text repr of the whole frame.
merged_df.head()

In [0]:
# Remove the developer-response columns from the merged frame.
# errors="ignore" keeps this cell re-runnable: without it, a second run (or
# input data that never had these columns) raises KeyError.
merged_df = merged_df.drop(
    columns=["timestamp_dev_responded", "developer_response"],
    errors="ignore",
)

In [0]:
# Divide both playtime columns by 60 and round to two decimals.
# NOTE(review): presumably a minutes -> hours conversion; confirm the source units.
# Caution: the columns are overwritten, so running this cell twice divides twice.
for playtime_col in ("playtime_forever", "playtime_at_review"):
    merged_df[playtime_col] = merged_df[playtime_col].div(60).round(2)

In [0]:
# Make sure the output folder exists (no error if it already does).
folder_path = "/dbfs/FileStore/finalParquet/"
os.makedirs(folder_path, exist_ok=True)

# Write the merged frame as a single Parquet file, without the index.
file_path = os.path.join(folder_path, "final_parquet_test.parquet")
merged_df.to_parquet(file_path, index=False)