In [8]:
import pandas as pd
from datetime import datetime

# --------- Load Dataset 1 ---------
df1 = pd.read_csv("data/seattle-weather.csv")

# --------- Load Dataset 2 ---------
df2 = pd.read_csv("data/weather_classification_data.csv")  # or .csv depending on format

# --------- Load Dataset 3 ---------
df3 = pd.read_csv("data/Titanic-Dataset.csv")  # or .csv depending on format

# --------- Load Dataset 4 ---------
df4 = pd.read_csv("data/healthcare_dataset.csv")  # or .csv depending on format


# --------- Preprocess Dataset 1 ---------
def preprocess_dataset1(df):
    print("🔧 Preprocessing Dataset 1 (sample_data.csv)...")
    df = df.drop_duplicates().dropna()

    # Fahrenheit to Celsius
    df['temp_max'] = (df['temp_max'] - 32) * 5.0 / 9.0
    df['temp_min'] = (df['temp_min'] - 32) * 5.0 / 9.0

    # Normalize date to ISO 8601
    df['date'] = pd.to_datetime(df['date'], errors='coerce')
    df = df.dropna(subset=['date'])
    df['date'] = df['date'].dt.strftime('%Y-%m-%dT00:00:00Z')

    # Feature Engineering: avg_temp + weather impact score
    df['avg_temp'] = (df['temp_max'] + df['temp_min']) / 2

    df['norm_temp'] = (df['avg_temp'] - df['avg_temp'].min()) / (df['avg_temp'].max() - df['avg_temp'].min())
    df['norm_precip'] = (df['precipitation'] - df['precipitation'].min()) / (df['precipitation'].max() - df['precipitation'].min())
    df['norm_wind'] = (df['wind'] - df['wind'].min()) / (df['wind'].max() - df['wind'].min())

    df['weather_impact_score'] = (
        df['norm_precip'] * 0.4 +
        df['norm_wind'] * 0.3 +
        df['norm_temp'] * 0.3
    )

    return df


# --------- Preprocess Dataset 2 ---------
def preprocess_dataset2(df):
    print("🔧 Preprocessing Dataset 2 (sample_weather.json)...")
    df = df.drop_duplicates().dropna()

    # Fabricate a date column if missing
    if 'date' not in df.columns:
        df['date'] = pd.date_range(start='2022-01-01', periods=len(df), freq='D')
    df['date'] = pd.to_datetime(df['date'], errors='coerce')
    df = df.dropna(subset=['date'])
    df['date'] = df['date'].dt.strftime('%Y-%m-%dT00:00:00Z')

    # Feature Engineering: Create avg_temp if not present
    if 'Temperature' in df.columns:
        df['avg_temp'] = df['Temperature']
    else:
        df['avg_temp'] = (df['temp_max'] + df['temp_min']) / 2

    df['norm_temp'] = (df['avg_temp'] - df['avg_temp'].min()) / (df['avg_temp'].max() - df['avg_temp'].min())
    df['norm_precip'] = (df['Precipitation (%)'] - df['Precipitation (%)'].min()) / (df['Precipitation (%)'].max() - df['Precipitation (%)'].min())
    df['norm_wind'] = (df['Wind Speed'] - df['Wind Speed'].min()) / (df['Wind Speed'].max() - df['Wind Speed'].min())

    df['weather_impact_score'] = (
        df['norm_precip'] * 0.4 +
        df['norm_wind'] * 0.3 +
        df['norm_temp'] * 0.3
    )

    return df

# --------- Dataset 3 Preprocessing (Titanic) ---------
def preprocess_dataset3(df):
    print("🔧 Preprocessing Dataset 3 (titanic_data.csv)...")
    df = df.drop_duplicates()

    # Handle missing values
    df['Age'] = df['Age'].fillna(df['Age'].median())
    df['Cabin'] = df['Cabin'].fillna("Unknown")
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

    # Convert column types if needed
    df['Sex'] = df['Sex'].astype(str).str.lower()
    df['Embarked'] = df['Embarked'].astype(str)

    # Feature Engineering (example: family size)
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

    return df

# --------- Dataset 4 Preprocessing (Medical) ---------
def preprocess_dataset4(df):
    print("🔧 Preprocessing Dataset 4 (medical_data.csv)...")
    df = df.drop_duplicates()

    # Normalize names and genders
    df['Name'] = df['Name'].astype(str).str.title()
    df['Gender'] = df['Gender'].astype(str).str.capitalize()

    # Parse Dates
    df['Date of Admission'] = pd.to_datetime(df['Date of Admission'], errors='coerce')
    df['Discharge Date'] = pd.to_datetime(df['Discharge Date'], errors='coerce')
    df['Admission Duration'] = (df['Discharge Date'] - df['Date of Admission']).dt.days

    # Fill missing test results
    df['Test Results'] = df['Test Results'].fillna("Unknown")

    return df

# --------- Run Both Preprocessors ---------
df1_cleaned = preprocess_dataset1(df1)
df2_cleaned = preprocess_dataset2(df2)
df3_cleaned = preprocess_dataset3(df3)
df4_cleaned = preprocess_dataset4(df4)


# --------- Save Outputs ---------
df1_cleaned.to_csv("output/preprocessed_seattle-weather.csv", index=False)
df2_cleaned.to_csv("output/preprocessed_weather_classification_data.csv", index=False)
df3_cleaned.to_csv("output/preprocessed_Titanic-Dataset.csv", index=False)
df3_cleaned.to_csv("output/preprocessed_healthcare_dataset.csv", index=False)


print("✅ Preprocessing complete. Output saved:")
print("- preprocessed_seattle-weather.csv")
print("- preprocessed_weather_classification_data.csv")
print("- preprocessed_healthcare_dataset.csv")


🔧 Preprocessing Dataset 1 (sample_data.csv)...
🔧 Preprocessing Dataset 2 (sample_weather.json)...
🔧 Preprocessing Dataset 3 (titanic_data.csv)...
🔧 Preprocessing Dataset 4 (medical_data.csv)...
✅ Preprocessing complete. Output saved:
- preprocessed_seattle-weather.csv
- preprocessed_weather_classification_data.csv
- preprocessed_healthcare_dataset.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Name'] = df['Name'].astype(str).str.title()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Gender'] = df['Gender'].astype(str).str.capitalize()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date of Admission'] = pd.to_datetime(df['Date of Admission'], errors='coerce')
A value is trying