In [None]:
import pandas as pd
import numpy as np

# Step 1: Read the three raw CSV files into df1, df2 and df3 (in that order)
source_files = ('socialmedia.csv', 'sleep.csv', 'fitness.csv')
df1, df2, df3 = [pd.read_csv(path) for path in source_files]

# Quick sanity check of (rows, columns) for each input before any reshaping
print("Original shapes:", *(frame.shape for frame in (df1, df2, df3)))

# Step 2: Map each dataset's original headers onto the shared column names.
# Headers that are not listed in a mapping are left untouched by rename().
social_renames = {
    'User_ID': 'id',
    'Daily_Screen_Time(hrs)': 'screen_time',
    'Stress_Level(1-10)': 'stress_level',
    'Days_Without_Social_Media': 'social_days_off',
    'Exercise_Frequency(week)': 'exercise_freq',
    'Happiness_Index(1-10)': 'mood_raw',
}
sleep_renames = {
    'Person ID': 'id',
    'Sleep Duration': 'sleep_hours',
    'Stress Level': 'stress_level',
    'Physical Activity Level': 'exercise_freq',
    'Daily Steps': 'steps',
}
fitness_renames = {
    'user_id': 'id',
    'sleep_hours': 'sleep_hours',  # already has the shared name; maps to itself
    'heart_rate_avg': 'heart_rate',
    'mood': 'mood_raw',
}

df1 = df1.rename(columns=social_renames)
df2 = df2.rename(columns=sleep_renames)
df3 = df3.rename(columns=fitness_renames)

# Step 3: Create standardized subsets with ALL required columns
#
# Several columns below do not exist in the source files and are filled with
# random draws. Those values are placeholders: they are generated independently
# of every real column, so they carry no signal. A single seeded generator is
# used for all of them so that the saved dataset is identical from run to run
# (the previous unseeded np.random calls produced a different file each time).
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)

# DF1: Social media dataset (has screen_time, happiness)
df1_subset = df1[['id', 'Age', 'Gender', 'screen_time', 'social_days_off',
                  'exercise_freq', 'stress_level', 'mood_raw']].copy()

# DF2: Sleep health dataset (has sleep_hours, steps, stress)
df2_subset = df2[['id', 'Age', 'Gender', 'sleep_hours', 'exercise_freq',
                  'stress_level', 'steps']].copy()
df2_subset['screen_time'] = rng.uniform(2, 12, len(df2_subset))  # synthetic placeholder
df2_subset['mood_raw'] = rng.uniform(4, 8, len(df2_subset))      # synthetic target

# DF3: Fitness tracker (has steps, sleep_hours, mood)
# NOTE: 'heart_rate' is kept here although it is not part of final_cols, so it
# is carried into the concatenated frame as an extra column.
df3_subset = df3[['id', 'steps', 'sleep_hours', 'mood_raw', 'heart_rate']].copy()
df3_subset['screen_time'] = rng.uniform(1, 10, len(df3_subset))   # synthetic placeholder
df3_subset['exercise_freq'] = df3_subset['steps'] / 10000         # proxy derived from steps
df3_subset['stress_level'] = rng.uniform(3, 8, len(df3_subset))   # synthetic placeholder
df3_subset['Age'] = rng.integers(18, 60, len(df3_subset))         # synthetic, 18..59 inclusive
df3_subset['Gender'] = rng.choice(['Male', 'Female'], len(df3_subset))

print("Subsets created:")
print("DF1:", df1_subset.shape)
print("DF2:", df2_subset.shape)
print("DF3:", df3_subset.shape)

# Step 4: Standardize column names across all subsets
final_cols = ['id', 'Age', 'Gender', 'sleep_hours', 'screen_time', 'steps',
              'exercise_freq', 'social_days_off', 'stress_level', 'mood_raw']

# Columns that get a synthetic value when a subset lacks them; each entry takes
# the number of rows and returns that many draws. Any other missing column in
# final_cols is created as NaN instead.
synthetic_fillers = {
    'sleep_hours': lambda n: rng.uniform(5, 10, n),
    'Age': lambda n: rng.integers(18, 60, n),
    'Gender': lambda n: rng.choice(['Male', 'Female'], n),
    'mood_raw': lambda n: rng.uniform(1, 10, n),
}

for df in [df1_subset, df2_subset, df3_subset]:
    for col in final_cols:
        if col in df.columns:
            continue
        filler = synthetic_fillers.get(col)
        df[col] = filler(len(df)) if filler is not None else np.nan

# Step 5: Stack the three subsets row-wise and renumber the index from 0
subsets = [df1_subset, df2_subset, df3_subset]
df_combined = pd.concat(subsets, ignore_index=True)

# Step 6: Clean and engineer features
numeric_cols = ['Age', 'sleep_hours', 'screen_time', 'steps', 'exercise_freq',
                'social_days_off', 'stress_level', 'mood_raw']

# Fill numeric gaps with each column's median computed over the combined frame.
# Columns that are not listed in numeric_cols are not imputed here.
numeric_part = df_combined[numeric_cols]
column_medians = numeric_part.median()
df_combined[numeric_cols] = numeric_part.fillna(column_medians)

# Missing gender values get an explicit placeholder category
gender_filled = df_combined['Gender'].fillna('Unknown')
df_combined['Gender'] = gender_filled

# Create target mood labels (happy/stressed/tired)
def map_mood(raw_score):
    """Bucket a raw mood score into one of three labels.

    Returns 'happy' for scores of 7 or more, 'stressed' for scores from 4 up
    to (but not including) 7, and 'tired' for anything lower. A missing score
    (as detected by ``pd.isna``) is labelled 'stressed'.
    """
    # Missing values share the middle bucket, so both cases exit together
    if pd.isna(raw_score) or 4 <= raw_score < 7:
        return 'stressed'
    return 'happy' if raw_score >= 7 else 'tired'

# Derive the categorical target by labelling every raw mood score
mood_labels = df_combined['mood_raw'].apply(map_mood)
df_combined['mood'] = mood_labels

# Feature engineering
# social_time = 7 minus the days without social media; a missing count is
# treated as 0 days off.
# NOTE(review): this presumes social_days_off is on a 0-7 scale; larger values
# yield a negative social_time — confirm against the source data.
days_off = df_combined['social_days_off'].fillna(0)
df_combined['social_time'] = 7 - days_off

# total_activity = exercise_freq multiplied by steps; a missing step count
# contributes 0.
step_counts = df_combined['steps'].fillna(0)
df_combined['total_activity'] = df_combined['exercise_freq'] * step_counts

# Step 7: Print a short summary of the combined frame, then write it to disk
mood_counts = df_combined['mood'].value_counts()
all_columns = list(df_combined.columns)
preview_cols = ['sleep_hours', 'screen_time', 'steps', 'social_time', 'mood']
preview_rows = df_combined[preview_cols].head()

print("\n✓ FINAL DATASET READY!")
print("Shape:", df_combined.shape)
print("\nMood distribution:")
print(mood_counts)
print("\nFeature columns:", all_columns)
print("\nSample data:")
print(preview_rows)

# Save for ML training (the row index is not written to the file)
df_combined.to_csv('mood_prediction_dataset.csv', index=False)
print("\n✓ SAVED as 'mood_prediction_dataset.csv'")
