In [4]:
import pandas as pd
import numpy as np

# Generate training data function
def generate_training_data(n=10000, seed=42):
    """Generate a synthetic building dataset with embodied-carbon targets.

    Each row is one randomly generated building: its metadata (floor area,
    use type, floor count, construction year, structure type), a carbon
    intensity derived from simple rules, the resulting total carbon, and a
    random location inside one of ten European cities.

    Intensity rule (per row)::

        intensity = (base[structure_type] + 5 * floors)
                    * (1.2 if Healthcare else 1.0)
                    * uniform(0.9, 1.1)
        total     = intensity * gfa / 1000

    Parameters
    ----------
    n : int, default 10000
        Number of rows to generate. ``0`` (or a negative value) yields an
        empty frame that still carries the full column schema.
    seed : int or None, default 42
        Value passed to ``np.random.seed`` before sampling, so repeated calls
        with the same seed return identical data. Pass ``None`` to leave
        NumPy's global random state untouched and continue from wherever it
        currently is.

    Returns
    -------
    pandas.DataFrame
        One row per building with the columns ``GFA_sqm``, ``Building_Type``,
        ``Num_Floors``, ``Year_Built``, ``Structure_Type``,
        ``Actual_Scope3_tCO2e``, ``Intensity_kgCO2e_m2``, ``City``,
        ``Country``, ``Latitude`` and ``Longitude``.

    Notes
    -----
    * Seeding resets NumPy's *global* legacy random state, which also affects
      any later ``np.random.*`` call in the same session.
    * ``Year_Built`` is sampled but does not influence the intensity.
    * The order of the random draws inside the loop is deliberate: changing
      it would change the rows produced for a given seed.
    """
    if seed is not None:
        np.random.seed(seed)

    # City -> country plus the bounding box used to sample coordinates.
    GEO_MAPPING = {
        'London': {'country': 'United Kingdom', 'latitude_range': (51.4, 51.7), 'longitude_range': (-0.5, 0.2)},
        'Paris': {'country': 'France', 'latitude_range': (48.8, 49.0), 'longitude_range': (2.2, 2.5)},
        'Berlin': {'country': 'Germany', 'latitude_range': (52.4, 52.6), 'longitude_range': (13.3, 13.6)},
        'Rome': {'country': 'Italy', 'latitude_range': (41.8, 42.0), 'longitude_range': (12.4, 12.6)},
        'Madrid': {'country': 'Spain', 'latitude_range': (40.3, 40.5), 'longitude_range': (-3.8, -3.6)},
        'Amsterdam': {'country': 'Netherlands', 'latitude_range': (52.3, 52.4), 'longitude_range': (4.8, 5.0)},
        'Stockholm': {'country': 'Sweden', 'latitude_range': (59.2, 59.4), 'longitude_range': (17.9, 18.2)},
        'Oslo': {'country': 'Norway', 'latitude_range': (59.8, 60.0), 'longitude_range': (10.6, 10.9)},
        'Dublin': {'country': 'Ireland', 'latitude_range': (53.3, 53.4), 'longitude_range': (-6.3, -6.1)},
        'Lisbon': {'country': 'Portugal', 'latitude_range': (38.7, 38.8), 'longitude_range': (-9.2, -9.0)}
    }
    cities = list(GEO_MAPPING.keys())

    # Candidate lists are loop-invariant; their order determines which value a
    # given random index maps to, so it must not be changed.
    building_types = ['Office', 'Residential', 'Education', 'Healthcare']
    structure_types = ['Steel', 'Concrete', 'Timber', 'Mixed']

    # Base intensity per structure type (kgCO2e/m2, per the output column name).
    base_intensity_by_structure = {
        'Concrete': 450,
        'Steel': 550,
        'Timber': 180,
        'Mixed': 350,
    }

    # Explicit schema so an empty result (n <= 0) still exposes every column.
    columns = [
        'GFA_sqm',
        'Building_Type',
        'Num_Floors',
        'Year_Built',
        'Structure_Type',
        'Actual_Scope3_tCO2e',
        'Intensity_kgCO2e_m2',
        'City',
        'Country',
        'Latitude',
        'Longitude',
    ]

    data = []
    for _ in range(n):
        # Feature: Building Metadata (randint upper bounds are exclusive)
        b_type = np.random.choice(building_types)
        gfa = np.random.randint(1000, 50000)
        floors = np.random.randint(1, 40)
        year = np.random.randint(1980, 2025)
        structure_type = np.random.choice(structure_types)

        # Rule-based intensity: structure baseline + per-floor penalty,
        # scaled up for healthcare buildings, with +/-10% random variance.
        base_intensity = base_intensity_by_structure[structure_type]
        height_penalty = floors * 5
        type_adjuster = 1.2 if b_type == 'Healthcare' else 1.0
        real_intensity = (base_intensity + height_penalty) * type_adjuster * np.random.uniform(0.9, 1.1)

        # Intensity is per m2 in kg; dividing by 1000 gives tonnes for the
        # whole floor area (units per the output column names).
        total_carbon = (real_intensity * gfa) / 1000

        # Pick a city, then sample coordinates inside its bounding box.
        selected_city = np.random.choice(cities)
        city_data = GEO_MAPPING[selected_city]
        lat_min, lat_max = city_data['latitude_range']
        lon_min, lon_max = city_data['longitude_range']
        latitude = np.random.uniform(lat_min, lat_max)
        longitude = np.random.uniform(lon_min, lon_max)

        data.append({
            'GFA_sqm': gfa,
            'Building_Type': b_type,
            'Num_Floors': floors,
            'Year_Built': year,
            'Structure_Type': structure_type,
            'Actual_Scope3_tCO2e': total_carbon,
            'Intensity_kgCO2e_m2': real_intensity,
            'City': selected_city,
            'Country': city_data['country'],
            'Latitude': latitude,
            'Longitude': longitude
        })

    return pd.DataFrame(data, columns=columns)

# Build the 10,000-row synthetic training frame.
df_train = generate_training_data(n=10000)

# Sanity check: preview the first five rows, then list the schema and size.
print("Training Data Sample with new City, Country, Latitude, and Longitude:")
print(df_train.head(5))
print("\nColumns:", list(df_train.columns))
print("Shape:", df_train.shape)


Training Data Sample with new City, Country, Latitude, and Longitude:
   GFA_sqm Building_Type  Num_Floors  Year_Built Structure_Type  \
0    16795     Education          29        1994         Timber   
1    22962     Education          24        2015          Mixed   
2     3433   Residential          21        2012          Mixed   
3    19431   Residential          28        1995         Timber   
4    12394        Office          37        1986          Steel   

   Actual_Scope3_tCO2e  Intensity_kgCO2e_m2       City      Country  \
0          5763.706673           343.179915     Madrid        Spain   
1         11241.249678           489.558822  Amsterdam  Netherlands   
2          1596.895782           465.160438     Dublin      Ireland   
3          6163.289335           317.188479     Berlin      Germany   
4          9019.403696           727.723390      Paris       France   

    Latitude  Longitude  
0  40.331204  -3.768801  
1  52.305641   4.944400  
2  53.329123  -6.17762

In [5]:
# ==========================================
# 1a. STORING THE DATA (Optional)
# ==========================================
# Objective: persist the generated training data as a CSV file so later
# sessions can reload it instead of regenerating it.

# index=False keeps the positional row index out of the file.
df_train.to_csv(path_or_buf='df_train-new-location.csv', index=False)
print("df_train-new-location.csv has been saved to the current directory.")

df_train-new-location.csv has been saved to the current directory.
