In [11]:
# Mount Google Drive at /content/drive; the CSV paths used in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd

# Folder under the Drive mount that holds the per-modality feature tables
google_drive_path = "/content/drive/MyDrive/Colab Notebooks/Human Signals-Final Project/data/"

# Load the sensor, audio and aerial feature tables
sensor_df = pd.read_csv(f"{google_drive_path}all_sensor_log_features.csv")
audio_df = pd.read_csv(f"{google_drive_path}all_audio_features.csv")
aerial_df = pd.read_csv(f"{google_drive_path}all_aerial_features.csv")

# Every table must provide both key columns before any matching is attempted
required_columns = ['activity', 'seconds_elapsed']
for df in (sensor_df, audio_df, aerial_df):
    assert set(required_columns).issubset(df.columns), "Missing required columns!"

# Find the row of target_df that is closest in time to `row` within the same activity
def find_closest(row, target_df):
    """Return the row of ``target_df`` nearest in time to ``row`` for the same activity.

    Parameters
    ----------
    row : pandas.Series or mapping
        Must provide ``'activity'`` and ``'seconds_elapsed'``.
    target_df : pandas.DataFrame
        Must contain ``'activity'`` and ``'seconds_elapsed'`` columns.

    Returns
    -------
    pandas.Series or None
        The single row of ``target_df`` with the same ``'activity'`` whose
        ``'seconds_elapsed'`` has the smallest absolute difference from
        ``row['seconds_elapsed']`` (the first such row on ties). ``None`` when
        no row shares the activity, or when no time difference can be computed
        because every candidate difference is NaN.
    """
    candidates = target_df[target_df['activity'] == row['activity']]
    if candidates.empty:
        return None

    gaps = (candidates['seconds_elapsed'] - row['seconds_elapsed']).abs().to_numpy()

    # Ignore candidates whose time difference is undefined (NaN on either side).
    usable = ~pd.isna(gaps)
    if not usable.any():
        return None
    candidates = candidates[usable]

    # Select by position rather than by index label: a label lookup would return
    # several rows (a DataFrame) if target_df's index contained duplicate labels.
    return candidates.iloc[gaps[usable].argmin()]

# Aerial columns carried over into the merged table
aerial_columns = ['average_temperature', 'average_people_count']
key_columns = ['activity', 'seconds_elapsed']

# Matched feature values, one entry per sensor row, in sensor_df order
audio_features = []
aerial_features = []

for _, sensor_row in sensor_df.iterrows():
    audio_match = find_closest(sensor_row, audio_df)
    aerial_match = find_closest(sensor_row, aerial_df)

    # Audio: keep every column except the two keys; pad with None when nothing matched
    if audio_match is None:
        audio_features.append([None] * (audio_df.shape[1] - 2))
    else:
        audio_features.append(audio_match.drop(key_columns).values)

    # Aerial: keep only the selected columns; pad with None when nothing matched
    if aerial_match is None:
        aerial_features.append([None] * len(aerial_columns))
    else:
        aerial_features.append(aerial_match[aerial_columns].values)

# Prefix the new columns so their origin stays visible in the merged table
audio_feature_names = [
    f"audio_{col}" for col in audio_df.columns if col not in key_columns
]
aerial_feature_names = [f"aerial_{col}" for col in aerial_columns]

# Turn the collected rows into frames that line up with sensor_df by position
audio_df_expanded = pd.DataFrame(audio_features, columns=audio_feature_names)
aerial_df_expanded = pd.DataFrame(aerial_features, columns=aerial_feature_names)

# Place sensor, audio and aerial columns side by side
final_df = pd.concat(
    [sensor_df, audio_df_expanded, aerial_df_expanded],
    axis=1,
)

# Write the merged table next to the input files
final_df.to_csv(f"{google_drive_path}merged_features.csv", index=False)

print("✅ Merge completed! The file has been saved as 'merged_features.csv'")


✅ Merge completed! The file has been saved as 'merged_features.csv'


In [12]:
import pandas as pd

# Load the merged feature table
google_drive_path = "/content/drive/MyDrive/Colab Notebooks/Human Signals-Final Project/data/"
merged_path = f"{google_drive_path}merged_features.csv"
df = pd.read_csv(merged_path)

# Correct the misspelled activity name ("foor" -> "floor")
df['activity'] = df['activity'].replace({'walking_at_7_foor': 'walking_at_7_floor'})

# Overwrite the same file with the corrected table
df.to_csv(merged_path, index=False)

In [13]:
import pandas as pd


google_drive_path = "/content/drive/MyDrive/Colab Notebooks/Human Signals-Final Project/data/"
# Read the merged features CSV file
df = pd.read_csv(google_drive_path+'merged_features.csv')

# Map each activity name to its integer class label
activity_to_label = {
    'walking': 1,
    'running': 2,
    'play_basketball_alone': 3,
    'walking_at_7_floor': 4,
    'play_basketball_with_kid': 5,
    'walking_while_using_iphone': 6,
    'danger_running': 7
}

# Add a new 'label' column based on the 'activity' column
df['label'] = df['activity'].map(activity_to_label)

# map() yields NaN for any activity missing from the dictionary; report those
# rows instead of letting them pass through unnoticed.
unlabeled = df['label'].isna()
if unlabeled.any():
    unknown_activities = sorted(df.loc[unlabeled, 'activity'].dropna().astype(str).unique())
    print(f"⚠️ {int(unlabeled.sum())} rows have no label; unmapped activities: {unknown_activities}")

# Keep labels as integers even when some are missing (NaN would otherwise
# force the column to float and write values such as "1.0" to the CSV).
df['label'] = df['label'].astype('Int64')

# Save the updated dataframe to a new CSV
df.to_csv(google_drive_path+'merged_features_with_labels.csv', index=False)

print("✅ Labels added successfully! The file has been saved as 'merged_features_with_labels.csv'")

✅ Labels added successfully! The file has been saved as 'merged_features_with_labels.csv'


In [16]:
import pandas as pd
google_drive_path = "/content/drive/MyDrive/Colab Notebooks/Human Signals-Final Project/data/"
# Load the balanced table.
# NOTE: 'balanced_merged_features.csv' is written by the balancing cell that
# appears further down in this notebook, so that cell has to run first.
df = pd.read_csv(f"{google_drive_path}balanced_merged_features.csv")

# Rows per label, ordered by label value
label_counts = df['label'].value_counts(sort=False).sort_index()

# Report one line per label
print("Record counts per label:")
for label, count in zip(label_counts.index, label_counts.to_numpy()):
    print(f"Label {label}: {count} records")

# Report the overall row count
total_records = df.shape[0]
print(f"\nTotal number of records: {total_records}")

Record counts per label:
Label 1: 2792 records
Label 2: 1233 records
Label 3: 1590 records
Label 4: 1612 records
Label 5: 3000 records
Label 6: 2499 records
Label 7: 1466 records

Total number of records: 14192


In [15]:
import pandas as pd


google_drive_path = "/content/drive/MyDrive/Colab Notebooks/Human Signals-Final Project/data/"

# Down-sampling settings: the label to reduce, the maximum rows kept for it,
# and the seed used for both the sampling and the final shuffle.
DOWNSAMPLED_LABEL = 5
MAX_ROWS_FOR_LABEL = 3000
SEED = 42

df = pd.read_csv(google_drive_path+'merged_features_with_labels.csv')

# Split the dataframe into the label being down-sampled and everything else
label_5 = df[df['label'] == DOWNSAMPLED_LABEL]
other_labels = df[df['label'] != DOWNSAMPLED_LABEL]

# Randomly keep at most MAX_ROWS_FOR_LABEL rows of that label. The cap on the
# available row count avoids the ValueError that sample() raises when asked
# for more rows than exist (sampling is without replacement).
label_5_sampled = label_5.sample(
    n=min(MAX_ROWS_FOR_LABEL, len(label_5)),
    random_state=SEED,  # fixed seed for reproducibility
)

# Combine the sampled rows with the other labels
balanced_df = pd.concat([label_5_sampled, other_labels], ignore_index=True)

# Shuffle the combined dataframe
balanced_df = balanced_df.sample(frac=1, random_state=SEED).reset_index(drop=True)

# Save the balanced data to a new CSV file
balanced_df.to_csv(google_drive_path+'balanced_merged_features.csv', index=False)

print(f"Original dataset: {len(df)} records")
print(f"Balanced dataset: {len(balanced_df)} records")

Original dataset: 24944 records
Balanced dataset: 14192 records
