# In [31]:
import pandas as pd
import re
import country_converter as coco
import csv

# SETUP & DATA ACQUISITION
# Every source table is read with the same options: ';' as the delimiter
# and low_memory=False.
_CSV_OPTIONS = {'sep': ';', 'low_memory': False}

deals = pd.read_csv('deals.csv', **_CSV_OPTIONS)
contracts = pd.read_csv('contracts.csv', **_CSV_OPTIONS)
locations = pd.read_csv('locations.csv', **_CSV_OPTIONS)
investors = pd.read_csv('investors.csv', **_CSV_OPTIONS)

# 1. DATA INTEGRATION (MERGING)
# Prefix every investor column so it cannot collide with a deal column.
investors_df = investors.add_prefix('Investor_')

# Left joins keep every deal row; contracts and locations attach by
# 'Deal ID', the investor table attaches via the operating company's ID.
# NOTE(review): 'Contract date' is merged in here but is not listed in
# target_features, so it is dropped again below. If contracts/locations
# hold several rows per 'Deal ID', these joins repeat the deal row once
# per match -- confirm that this row multiplication is intended.
merged_df = (
    deals
    .merge(contracts[['Deal ID', 'Contract date']], on='Deal ID', how='left')
    .merge(
        locations[['Deal ID', 'Spatial accuracy level', 'Point']],
        on='Deal ID',
        how='left',
    )
    .merge(
        investors_df[['Investor_Investor ID', 'Investor_Country of registration/origin']],
        left_on='Operating company: Investor ID',
        right_on='Investor_Investor ID',
        how='left',
    )
)

# Columns carried forward into the analytical dataset.
target_features = [
    'Deal ID',
    'Target country',
    'Deal size',
    'Deal scope',
    'Intention of investment',
    'Current negotiation status',
    'Current implementation status',
    'Created at',
    'Presence of land conflicts',
    'Displacement of people',
    'Spatial accuracy level',
    'Point',
    'Nature of the deal',
    'Carbon offset project',
    'Operating company: Country of registration/origin',
    'Investor_Country of registration/origin',
]

# Work on an independent copy so later column assignments do not touch
# merged_df.
df = merged_df[target_features].copy()

# 2. SECTOR HARMONIZATION (TAXONOMY MAPPING)
# Truncated sector spellings and the full spelling each one is expanded to.
TRUNCATION_MAP = {
    "Oth": "Other",
    "Fodd": "Fodder",
    "Conservatio": "Conservation",
    "Land speculatio": "Land speculation",
    "Oil / Gas extractio": "Oil / Gas extraction",
}

# Raw sector name (matched case-insensitively as a substring) -> canonical label.
SECTOR_TAXONOMY = {
    "Food crops": "Food crops",
    "Livestock": "Livestock",
    "Mining": "Mining",
    "Non-food agricultural commodities": "Non-food agricultural commodities",
    "Biomass for biofuels": "Biomass for biofuels",
    "Fodder": "Fodder",
    "Oil / Gas extraction": "Oil / Gas extraction",
    "Forest logging / management": "Forestry Management",
    "Timber plantation": "Timber Plantation",
    "Wind farm": "Wind Farm",
    "Solar park": "Solar Park",
    "Renewable energy": "Renewable Energy",
    "For carbon sequestration/REDD": "Carbon Sequestration / REDD",
    "Industry": "Industry",
    "Tourism": "Tourism",
    "Land speculation": "Land speculation",
    "Agriculture unspecified": "Agriculture Unspecified",
    "Conservation": "Conservation",
}


def clean_sectors(raw_val):
    """Reduce a raw sector field to its canonical sector labels.

    Args:
        raw_val: Raw cell value; NaN/None is accepted.

    Returns:
        The distinct canonical labels found in the value, sorted and
        joined with "; ". An empty string when the value is missing or
        contains no known sector.
    """
    if pd.isna(raw_val):
        return ""

    # Drop every '|' together with the run of letters, digits, '#', '.'
    # and '-' that directly follows it.
    text = re.sub(r"\|[0-9#.\-a-zA-Z]*", "", str(raw_val).strip())

    # If any '#' is left, keep only what comes after the last one.
    if "#" in text:
        text = re.sub(r"^.*#(?!.*#)", "", text).lstrip("#")

    # NOTE(review): this is a plain substring replace, so an already
    # complete word is lengthened as well ("Fodder" -> "Fodderer"). The
    # result is unaffected because the lookup below is a substring test.
    for truncated, full in TRUNCATION_MAP.items():
        text = text.replace(truncated, full)

    fragments = [fragment.lower() for fragment in re.split(r"\s*,\s*", text)]
    labels = {
        label
        for fragment in fragments
        for raw_name, label in SECTOR_TAXONOMY.items()
        if raw_name.lower() in fragment
    }
    return "; ".join(sorted(labels))

# Derive the canonical, "; "-joined sector labels for every row.
df['sectors_cleaned'] = df['Intention of investment'].apply(clean_sectors)

# NACE MAPPING & BINARIES
# Maps the 18 cleaned sector labels onto the unified sector taxonomy and
# NACE codes. Each group pairs one (unified sector, NACE section,
# NACE code) triple with the cleaned labels that share it.
_NACE_GROUPS = [
    (
        ("1. Agriculture & Livestock", "A - Agriculture, forestry and fishing", "01"),
        [
            "Agriculture Unspecified",
            "Fodder",
            "Food crops",
            "Livestock",
            "Non-food agricultural commodities",
        ],
    ),
    (
        ("2. Forestry Timber & Logging", "A - Agriculture, forestry and fishing", "02"),
        ["Forestry Management", "Timber Plantation"],
    ),
    (
        ("3. Forestry - Carbon & Conservation", "A - Agriculture, forestry and fishing", "02"),
        ["Carbon Sequestration / REDD", "Conservation"],
    ),
    (
        ("5. Mining & Quarrying", "B - Mining and quarrying", "07/08"),
        ["Mining"],
    ),
    (
        ("6. Oil & Gas Extraction", "B - Mining and quarrying", "06"),
        ["Oil / Gas extraction"],
    ),
    (
        ("7. Biofuels & Biomass Energy", "C - Manufacturing", "20"),
        ["Biomass for biofuels"],
    ),
    (
        (
            "8. Renewable Energy (non-biomass)",
            "D - Electricity, gas, steam and air conditioning supply",
            "35",
        ),
        ["Renewable Energy", "Solar Park", "Wind Farm"],
    ),
    (
        ("10. Manufacturing & Processing", "C - Manufacturing", "10-33"),
        ["Industry"],
    ),
    (
        ("11. Construction & Real Estate", "L - Real estate activities", "68"),
        ["Land speculation"],
    ),
    (
        ("15. Tourism & Hospitality", "I - Accommodation and food service activities", "55"),
        ["Tourism"],
    ),
]

# Cleaned sector label -> (unified sector, NACE section, NACE code).
NACE_MAP = {
    label: triple
    for triple, labels in _NACE_GROUPS
    for label in labels
}


def map_to_nace(val):
    """Translate a cleaned sector string into unified-sector and NACE labels.

    Args:
        val: A string of cleaned sector labels; any falsy value is
            treated as "no sector".

    Returns:
        A 3-tuple of strings (unified sectors, NACE sections, NACE
        codes). Each element holds the distinct values of every NACE_MAP
        label that occurs as a substring of ``val``, sorted and joined
        with "; ", or "N/A" when nothing matched.
    """
    if not val:
        return "N/A", "N/A", "N/A"

    text = str(val)
    matched = [triple for label, triple in NACE_MAP.items() if label in text]

    joined = []
    for position in range(3):
        distinct = sorted({triple[position] for triple in matched})
        joined.append("; ".join(distinct) or "N/A")
    return tuple(joined)

# Attach the unified sector, NACE section and NACE code for every row.
# map_to_nace returns a 3-tuple per row; the tuples are collected once and
# turned into a single frame instead of building a pandas Series per row,
# which is far slower on a frame of this size.
_nace_columns = ['sectors_unified', 'nace_sector', 'nace_subsector']
df[_nace_columns] = pd.DataFrame(
    [map_to_nace(value) for value in df['sectors_cleaned']],
    index=df.index,  # keep row alignment with df
    columns=_nace_columns,
)

# Standard binary column names (sector_A, sector_B, etc.)
# One 0/1 flag per NACE section letter, set when 'nace_sector' contains an
# entry for that section.
LETTERS = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O"]
for let in LETTERS:
    col_name = f"sector_{let}"
    # NACE entries are written "<letter> - <name>" and are separated by
    # "; ". The " - " must be part of the match: a bare word boundary
    # after the letter also matches the "N/A" placeholder used for rows
    # without a sector, which wrongly set sector_N = 1 on those rows.
    df[col_name] = df['nace_sector'].apply(
        lambda x: 1 if re.search(rf"(^|;\s*){let} - ", str(x)) else 0
    )

# 3. GEOSPATIAL HARMONIZATION (ISO3 CODES)
# Converter instance used by get_iso3_codes for every lookup.
cc = coco.CountryConverter()
# Memo of country name -> result of cc.convert, so each distinct name is
# converted only once.
ISO_CACHE = {}

def get_iso3_codes(country_string):
    """Convert a country field into a "; "-joined string of ISO3 codes.

    Args:
        country_string: Raw cell value. May be NaN/None, a placeholder
            such as "Data not available", or one or more country names
            separated by a semicolon followed by a newline.

    Returns:
        The distinct ISO3 codes, sorted and joined with "; ", or None
        when the value is missing, is a placeholder, or none of its names
        can be resolved.
    """
    if pd.isna(country_string):
        return None
    s = str(country_string).strip()
    if s.lower() in ("data not available", "selection deleted", "european union", ""):
        return None

    codes = set()
    # NOTE(review): only ";" followed by a newline splits entries; confirm
    # this matches how multi-country cells are actually delimited.
    for name in (part.strip() for part in s.split(";\n")):
        if not name:
            continue
        if name not in ISO_CACHE:
            # Ask for an explicit "not found" marker. With not_found=None
            # the converter hands back the input name for unmatched
            # entries, which would leak raw names into the ISO3 column.
            ISO_CACHE[name] = cc.convert(names=name, to="ISO3", not_found="not found")
        hit = ISO_CACHE[name]
        # A name that matches several countries comes back as a list;
        # flatten it so every element stays a hashable string.
        candidates = hit if isinstance(hit, list) else [hit]
        codes.update(
            code for code in candidates if code and code != "not found"
        )
    return "; ".join(sorted(codes)) if codes else None

# Country columns that each get a companion "<column>_iso3" column.
COUNTRY_COLS = [
    "Target country",
    "Operating company: Country of registration/origin",
    "Investor_Country of registration/origin",
]
for col in COUNTRY_COLS:
    iso_col = f"{col}_iso3"
    df[iso_col] = df[col].apply(get_iso3_codes)

# EXPORT
# Write the final dataset without the index column, then report its shape
# and the output file name.
df.to_csv('landmatrix_final_analytical_dataset.csv', index=False)

print(
    f"Pipeline complete. Dataset dimensions: {df.shape}",
    "Final file saved as: landmatrix_final_analytical_dataset.csv",
    sep="\n",
)

# Output:
# Pipeline complete. Dataset dimensions: (15019, 38)
# Final file saved as: landmatrix_final_analytical_dataset.csv
