# Raw Data Consolidation


The raw data used in this project comes from two separate folders, each covering a specific date range of Fitbit usage:

mturkfitbit_export_3.12.16-4.11.16 → Data from Mar 12, 2016 to Apr 11, 2016

mturkfitbit_export_4.12.16-5.12.16 → Data from Apr 12, 2016 to May 12, 2016

To ensure a continuous and consistent dataset, the consolidation process includes two main steps:

# Libraries Used

In [9]:
import pandas as pd
import os

# First Step: Append Similar Files (Same Format)


Files with the same name and identical structure across the two folders were identified and appended together.
This step was done because each pair of files represented the same data type but covered different time periods — with one file continuing from where the other left off.
Combining them produces a complete time series for each data type, without any overlap or loss of continuity.

In [10]:
import pandas as pd
import os

# Source folders: the two raw exports, and the folder that receives the
# appended result of each pair.
folder1 = 'mturkfitbit_export_3.12.16-4.11.16'
folder2 = 'mturkfitbit_export_4.12.16-5.12.16'
output_folder = 'appended_data'

# Each tuple is (file name in folder1, file name in folder2). The two files of
# a pair are stacked row-wise, folder1 first, to build one continuous dataset.
# Some datasets are named differently in the second export, hence explicit pairs.
file_pairs = [
    ('activity_daily_full_2016_v01.csv', 'activity_daily_full_2016_v01.csv'),
    ('calories_hourly_2016_v01.csv', 'calories_per_hour_2016_v01.csv'),
    ('calories_per_minute_narrow_2016_v01.csv', 'calories_per_minute_narrow_2016_v01.csv'),
    ('heartrate_per_second_2016_v01.csv', 'heartrate_per_second_2016_v01.csv'),
    ('intensity_hourly_2016_v01.csv', 'intensity_per_hour_2016_v01.csv'),
    ('intensity_per_minute_narrow_2016_v01.csv', 'intensity_per_minute_narrow_2016_v01.csv'),
    ('mets_per_minute_narrow_2016_v01.csv', 'mets_per_minute_narrow_2016_v01.csv'),
    ('sleep_per_minute_2016_v01.csv', 'sleep_per_minute_log_2016_v01.csv'),
    ('steps_hourly_2016_v01.csv', 'steps_per_hour_2016_v01.csv'),
    ('steps_per_minute_narrow_2016_v01.csv', 'steps_per_minute_narrow_2016_v01.csv'),
    ('weight_log_full_2016_v01.csv', 'weight_log_summary_2016_v01.csv'),
]

# DataFrame.to_csv does not create missing directories; without this every
# pair would fail with an OSError when the output folder is absent.
os.makedirs(output_folder, exist_ok=True)

for file1, file2 in file_pairs:
    # Best-effort per pair: a failure is reported and the loop moves on to the
    # next pair instead of aborting the whole consolidation.
    try:
        df1 = pd.read_csv(os.path.join(folder1, file1))
        df2 = pd.read_csv(os.path.join(folder2, file2))

        # Stack both periods and drop rows that are exact duplicates.
        combined = pd.concat([df1, df2], ignore_index=True).drop_duplicates()

        # The output name is derived from the folder1 file name.
        output_name = file1.replace('_2016_v01.csv', '_appended.csv')
        combined.to_csv(os.path.join(output_folder, output_name), index=False)
        print(f"{output_name} saved.")
    except Exception as e:
        print(f"Error with {file1} & {file2}: {e}")


activity_daily_full_appended.csv saved.
calories_hourly_appended.csv saved.
calories_per_minute_narrow_appended.csv saved.
heartrate_per_second_appended.csv saved.
intensity_hourly_appended.csv saved.
intensity_per_minute_narrow_appended.csv saved.
mets_per_minute_narrow_appended.csv saved.
sleep_per_minute_appended.csv saved.
steps_hourly_appended.csv saved.
steps_per_minute_narrow_appended.csv saved.
weight_log_full_appended.csv saved.


# Second Step: Standardize Format (Wide to Narrow) and Then Append

This step was necessary because the wide-format files often contained extended time coverage beyond what was available in their narrow-format counterparts. Converting them to the narrow format allowed for consistent structure, making it possible to append the additional records and ensure complete temporal continuity across datasets.

# Sub-step 1: Standardize Format (Wide to Narrow)  
Three files are in wide format:  
(calories_per_minute_wide_2016_v01.csv, intensity_per_minute_wide_2016_v01.csv, steps_per_minute_wide_2016_v01.csv)  
Each of these files is converted to narrow format below.  

In [13]:
# Convert the wide-format per-minute calories file into narrow format
# (one row per Id and minute) and save it under `output_folder`.
output_folder = "after_convert"

# DataFrame.to_csv does not create missing directories, so make sure the
# destination exists before any work is done.
os.makedirs(output_folder, exist_ok=True)

# Load the wide-format calories data.
# NOTE(review): `folder_path` is not defined in the visible cells; presumably it
# is set in an earlier cell to the folder holding the wide files. Confirm.
df_cal = pd.read_csv(os.path.join(folder_path, "calories_per_minute_wide_2016_v01.csv"))

# Unpivot every column other than Id/ActivityHour into ('Minute', 'Calories') rows
df_cal_narrow = pd.melt(df_cal,
                        id_vars=["Id", "ActivityHour"],
                        var_name="Minute",
                        value_name="Calories")

# The first run of digits in each former column name is used as the minute
# offset; expand=False yields a Series rather than a one-column DataFrame.
df_cal_narrow["Minute"] = df_cal_narrow["Minute"].str.extract(r"(\d+)", expand=False).astype(int)

# Convert the 'ActivityHour' column from string to datetime format
df_cal_narrow["ActivityHour"] = pd.to_datetime(df_cal_narrow["ActivityHour"], format="%m/%d/%Y %I:%M:%S %p")

# Add the 'Minute' offset to the base hour to get the exact timestamp for each record
df_cal_narrow["ActivityMinute"] = df_cal_narrow["ActivityHour"] + pd.to_timedelta(df_cal_narrow["Minute"], unit="minute")

# Keep only the relevant columns
df_cal_narrow = df_cal_narrow[["Id", "ActivityMinute", "Calories"]]

# Save the converted dataset to CSV
df_cal_narrow.to_csv(os.path.join(output_folder, "after_convert_calories_per_minute.csv"), index=False)


In [15]:
# Convert the wide-format per-minute intensity file into narrow format
# (one row per Id and minute) and save it under `output_folder`.
output_folder = "after_convert"

# DataFrame.to_csv does not create missing directories, so make sure the
# destination exists before any work is done.
os.makedirs(output_folder, exist_ok=True)

# Load the wide-format intensity data.
# NOTE(review): `folder_path` is not defined in the visible cells; presumably it
# is set in an earlier cell to the folder holding the wide files. Confirm.
df_int = pd.read_csv(os.path.join(folder_path, "intensity_per_minute_wide_2016_v01.csv"))

# Unpivot every column other than Id/ActivityHour into ('Minute', 'Intensity') rows
df_int_narrow = pd.melt(df_int,
                        id_vars=["Id", "ActivityHour"],
                        var_name="Minute",
                        value_name="Intensity")

# The first run of digits in each former column name is used as the minute
# offset; expand=False yields a Series rather than a one-column DataFrame.
df_int_narrow["Minute"] = df_int_narrow["Minute"].str.extract(r"(\d+)", expand=False).astype(int)

# Convert base hour to datetime
df_int_narrow["ActivityHour"] = pd.to_datetime(df_int_narrow["ActivityHour"], format="%m/%d/%Y %I:%M:%S %p")

# Compute the full timestamp per minute
df_int_narrow["ActivityMinute"] = df_int_narrow["ActivityHour"] + pd.to_timedelta(df_int_narrow["Minute"], unit="minute")

# Keep relevant columns
df_int_narrow = df_int_narrow[["Id", "ActivityMinute", "Intensity"]]

# Save the converted dataset
df_int_narrow.to_csv(os.path.join(output_folder, "after_convert_intensity_per_minute.csv"), index=False)


In [16]:
# Convert the wide-format per-minute steps file into narrow format
# (one row per Id and minute) and save it under `output_folder`.
output_folder = "after_convert"

# DataFrame.to_csv does not create missing directories, so make sure the
# destination exists before any work is done.
os.makedirs(output_folder, exist_ok=True)

# Load the wide-format steps data.
# NOTE(review): `folder_path` is not defined in the visible cells; presumably it
# is set in an earlier cell to the folder holding the wide files. Confirm.
df_steps = pd.read_csv(os.path.join(folder_path, "steps_per_minute_wide_2016_v01.csv"))

# Unpivot every column other than Id/ActivityHour into ('Minute', 'Steps') rows
df_steps_narrow = pd.melt(df_steps,
                          id_vars=["Id", "ActivityHour"],
                          var_name="Minute",
                          value_name="Steps")

# The first run of digits in each former column name is used as the minute
# offset; expand=False yields a Series rather than a one-column DataFrame.
df_steps_narrow["Minute"] = df_steps_narrow["Minute"].str.extract(r"(\d+)", expand=False).astype(int)

# Convert base hour to datetime
df_steps_narrow["ActivityHour"] = pd.to_datetime(df_steps_narrow["ActivityHour"], format="%m/%d/%Y %I:%M:%S %p")

# Compute the full timestamp per minute
df_steps_narrow["ActivityMinute"] = df_steps_narrow["ActivityHour"] + pd.to_timedelta(df_steps_narrow["Minute"], unit="minute")

# Keep relevant columns
df_steps_narrow = df_steps_narrow[["Id", "ActivityMinute", "Steps"]]

# Save the converted dataset
df_steps_narrow.to_csv(os.path.join(output_folder, "after_convert_steps_per_minute.csv"), index=False)


# Sub-step 2: Append Files  
Merge Wide-Converted and Old Narrow Files  
This step combines the new wide-format data (after conversion) with existing narrow-format files to ensure complete time coverage for each metric.


In [19]:
# Define folder paths
appended_folder = "appended_data"           # Folder containing previously appended narrow-format data
converted_folder = "after_convert"          # Folder containing newly converted narrow-format data
final_folder = "final_preparing_data"       # Folder to save the final combined datasets

# Create the final output folder if it doesn't exist
os.makedirs(final_folder, exist_ok=True)


def _load_minute_csv(folder, filename):
    """Read a per-minute CSV and parse its ActivityMinute column as datetimes.

    read_csv returns ActivityMinute as text. The converted files were written
    from parsed datetimes, while the appended files were concatenated without
    parsing, so the two inputs may use different text formats (TODO confirm
    against the raw narrow files). Parsing both makes equal instants compare
    equal and makes sorting chronological instead of lexicographic.
    """
    frame = pd.read_csv(os.path.join(folder, filename))
    frame["ActivityMinute"] = pd.to_datetime(frame["ActivityMinute"])
    return frame


def _combine_minute_frames(old, new):
    """Stack two per-minute frames, drop exact duplicate rows, sort by Id and time."""
    combined = pd.concat([old, new], ignore_index=True)
    combined = combined.drop_duplicates()
    return combined.sort_values(by=["Id", "ActivityMinute"])


# ======================== Merge Final Calories Data ========================
# Previously appended data first, then the newly converted data
calories_old = _load_minute_csv(appended_folder, "calories_per_minute_narrow_appended.csv")
calories_new = _load_minute_csv(converted_folder, "after_convert_calories_per_minute.csv")

# Combine, de-duplicate and sort, then save the final dataset
calories_final = _combine_minute_frames(calories_old, calories_new)
calories_final.to_csv(os.path.join(final_folder, "final_calories_per_minute.csv"), index=False)

# ======================== Merge Final Intensity Data ========================
# Previously appended data first, then the newly converted data
intensity_old = _load_minute_csv(appended_folder, "intensity_per_minute_narrow_appended.csv")
intensity_new = _load_minute_csv(converted_folder, "after_convert_intensity_per_minute.csv")

# Combine, de-duplicate and sort, then save the final dataset
intensity_final = _combine_minute_frames(intensity_old, intensity_new)
intensity_final.to_csv(os.path.join(final_folder, "final_intensity_per_minute.csv"), index=False)

# ======================== Merge Final Steps Data ========================
# Previously appended data first, then the newly converted data
steps_old = _load_minute_csv(appended_folder, "steps_per_minute_narrow_appended.csv")
steps_new = _load_minute_csv(converted_folder, "after_convert_steps_per_minute.csv")

# Combine, de-duplicate and sort, then save the final dataset
steps_final = _combine_minute_frames(steps_old, steps_new)
steps_final.to_csv(os.path.join(final_folder, "final_steps_per_minute.csv"), index=False)
