In [13]:
import pandas as pd

malaria_df = pd.read_csv('./data/raw/malaria_nga.csv')
print(malaria_df.head())


              GHO (CODE)                                      GHO (DISPLAY)  \
0        #indicator+code                                    #indicator+name   
1    MALARIA_TOTAL_CASES  Total number of  malaria cases (presumed + con...   
2     MALARIA_CONF_CASES                  Number of confirmed malaria cases   
3        MALARIA_RDT_POS  Number of malaria positive cases by rapid diag...   
4  MALARIA_EST_MORTALITY  Estimated malaria mortality rate (per 100 000 ...   

                                           GHO (URL) YEAR (DISPLAY)  \
0                                     #indicator+url     #date+year   
1  https://www.who.int/data/gho/data/indicators/i...           2015   
2  https://www.who.int/data/gho/data/indicators/i...           2015   
3  https://www.who.int/data/gho/data/indicators/i...           2017   
4  https://www.who.int/data/gho/data/indicators/i...           2010   

          STARTYEAR         ENDYEAR REGION (CODE) REGION (DISPLAY)  \
0  #date+year+start  #date+y

In [14]:
print(malaria_df.columns.tolist())

['GHO (CODE)', 'GHO (DISPLAY)', 'GHO (URL)', 'YEAR (DISPLAY)', 'STARTYEAR', 'ENDYEAR', 'REGION (CODE)', 'REGION (DISPLAY)', 'COUNTRY (CODE)', 'COUNTRY (DISPLAY)', 'DIMENSION (TYPE)', 'DIMENSION (CODE)', 'DIMENSION (NAME)', 'Numeric', 'Value', 'Low', 'High']


In [23]:
malaria_df = pd.read_csv('./data/raw/malaria_nga.csv')
# Convert year column to int
malaria_df['YEAR (DISPLAY)'] = pd.to_numeric(malaria_df['YEAR (DISPLAY)'], errors='coerce')
malaria_df = malaria_df.dropna(subset=['YEAR (DISPLAY)'])
malaria_df['YEAR (DISPLAY)'] = malaria_df['YEAR (DISPLAY)'].astype(int)


# Convert indicator value column to numeric, forcing errors to NaN
malaria_df['Numeric'] = pd.to_numeric(malaria_df['Numeric'], errors='coerce')
malaria_df = malaria_df.dropna(subset=['Numeric'])


# Filter for Nigeria data only
nigeria_df = malaria_df[malaria_df['COUNTRY (DISPLAY)'] == 'Nigeria'].copy()

print("Malaria data loaded:", nigeria_df.shape)

# Assign Nigeria centroid coordinates (latitude, longitude)
nigeria_centroid = (9.0820, 8.6753)  # Approximate centroid of Nigeria
nigeria_df['latitude'] = nigeria_centroid[0]
nigeria_df['longitude'] = nigeria_centroid[1]

print("Malaria data with assigned Nigeria centroid coordinates:", nigeria_df.shape)

# Print the first few rows
nigeria_df.to_csv('./data/cleaned/nigeria_malaria_cases_with_coords.csv', index=False)
print("Epidemiological data saved to CSV!")

Malaria data loaded: (129, 17)
Malaria data with assigned Nigeria centroid coordinates: (129, 19)
Epidemiological data saved to CSV!


In [26]:
import rasterio
import numpy as np

# Define file paths (update to your actual paths)
nir_path = './data/raw/B08_10m.jp2'
red_path = './data/raw/B04_10m.jp2'

# Load NIR band (B08) and Red band (B04)
with rasterio.open(nir_path) as nir:
    nir_data = nir.read(1).astype('float32')
with rasterio.open(red_path) as red:
    red_data = red.read(1).astype('float32')

# Calculate NDVI
ndvi_s2 = (nir_data - red_data) / (nir_data + red_data)
ndvi_s2[np.isinf(ndvi)] = np.nan  # Handle division by zero
print("Sentinel-2 NDVI calculated!")
print(f"NDVI min: {np.nanmin(ndvi_s2)}, max: {np.nanmax(ndvi_s2)}")



Sentinel-2 NDVI calculated!
NDVI min: -0.07861506938934326, max: 0.6142119765281677


In [27]:
import xarray as xr
import numpy as np

# Paths to key files
band_red = './data/raw/Oa08_radiance.nc'   # Red
band_nir = './data/raw/Oa17_radiance.nc'   # NIR

# Load radiance data
red_ds = xr.open_dataset(band_red)
nir_ds = xr.open_dataset(band_nir)

# Extract radiance values
red = red_ds['Oa08_radiance'].values
nir = nir_ds['Oa17_radiance'].values

# Calculate NDVI
ndvi_s3 = np.where(
    (nir + red) == 0, 
    np.nan, 
    np.divide((nir - red), (nir + red), where=(~np.isnan(nir)) & (~np.isnan(red)))
)

print(f"Sentinel-3 NDVI calculated: min={np.nanmin(ndvi_s3)}, max={np.nanmax(ndvi_s3)}")

# --- SENTINEL-3 TEMPERATURE extraction ---
temp_path = './data/raw/tie_meteo.nc'
temp_ds = xr.open_dataset(temp_path)

try:
    temperature_profile = temp_ds['atmospheric_temperature_profile'].values
    print(f"Temperature profile loaded: shape={temperature_profile.shape}")
    
    # Extract temperature at first pressure level (simplification)
    temperature_s3 = temperature_profile[:, :, 0]
    print(f"Extracted temperature at first pressure level: min={np.min(temperature_s3)}, max={np.max(temperature_s3)}")
except KeyError:
    print("Temperature profile not found in the dataset.")

Sentinel-3 NDVI calculated: min=-0.8271577954292297, max=0.6390857696533203
Temperature profile loaded: shape=(15002, 77, 25)
Extracted temperature at first pressure level: min=264.2359924316406, max=307.05078125


In [40]:
import pandas as pd
import numpy as np

# Step 1: Define States and Years
states = [
    'Lagos', 'Kano', 'Kaduna', 'Rivers', 'Oyo', 'Enugu', 
    'Abuja', 'Borno', 'Sokoto', 'Ondo', 'Benue', 'Delta'
]
years = [2015, 2016, 2017, 2018, 2019, 2020]

# Step 2: Set Random Seed
np.random.seed(42)

# Step 3: Generate Fake Data
fake_cases = np.random.randint(10000, 50000, size=(len(states) * len(years)))
fake_s2_ndvi = np.random.uniform(-0.0786, 0.6142, size=(len(states) * len(years)))
fake_s3_ndvi = np.random.uniform(-0.8271, 0.6391, size=(len(states) * len(years)))
fake_temp = np.random.uniform(264.236, 307.051, size=(len(states) * len(years)))

# Step 4: Create a DataFrame for Fake Data
data = {
    'STATE': np.repeat(states, len(years)),
    'YEAR (DISPLAY)': years * len(states),
    'Numeric': fake_cases,
    'S2_NDVI': fake_s2_ndvi,
    'S3_NDVI': fake_s3_ndvi,
    'S3_Temperature': fake_temp
}
fake_df = pd.DataFrame(data)

# Step 5: Add Coordinates
state_coords = {
    'Lagos': (6.5244, 3.3792),
    'Kano': (12.0022, 8.5919),
    'Kaduna': (10.5098, 7.4165),
    'Rivers': (4.8156, 7.0498),
    'Oyo': (7.3775, 3.9470),
    'Enugu': (6.5244, 7.5189),
    'Abuja': (9.0765, 7.3986),
    'Borno': (11.8333, 13.1500),
    'Sokoto': (13.0059, 5.2476),
    'Ondo': (7.1000, 4.8353),
    'Benue': (7.3373, 8.7417),
    'Delta': (5.8904, 5.6800)
}

fake_df['latitude'] = fake_df['STATE'].map(lambda x: state_coords[x][0])
fake_df['longitude'] = fake_df['STATE'].map(lambda x: state_coords[x][1])

# Step 6: Save the Fake Data to CSV
fake_df.to_csv('./data/cleaned/fake_state_data_with_coords.csv', index=False)
print("Fake state data with coordinates saved!")

# Step 7: Load Existing Merged Data (if any)
try:
    merged_df = pd.read_csv('./data/cleaned/merged_data.csv')
    print("Existing merged data loaded!")
except FileNotFoundError:
    print("No existing merged data found, proceeding with fake data only.")
    merged_df = fake_df.copy()

# Step 8: Merge Fake Data with Existing Data
merged_with_coords = pd.merge(
    merged_df, fake_df, 
    on=['STATE', 'YEAR (DISPLAY)'], 
    how='outer', 
    suffixes=('_existing', '_fake')
)

# Check column names after merging
print("Columns after merging:", merged_with_coords.columns)

# Fix column names to avoid conflicts
merged_with_coords.rename(columns={
    'Numeric_existing': 'Numeric_x',
    'Numeric_fake': 'Numeric_y'
}, inplace=True)

# Save the final merged data with coordinates
merged_with_coords.to_csv('./data/cleaned/merged_data_with_coords.csv', index=False)
print("Final merged data with coordinates saved!")

# Display a sample of the final merged data
print("\nFinal Merged Data (Sample):\n", merged_with_coords.head())



Fake state data with coordinates saved!
Existing merged data loaded!
Columns after merging: Index(['GHO (CODE)', 'GHO (DISPLAY)', 'GHO (URL)', 'YEAR (DISPLAY)',
       'STARTYEAR', 'ENDYEAR', 'REGION (CODE)', 'REGION (DISPLAY)',
       'COUNTRY (CODE)', 'COUNTRY (DISPLAY)', 'DIMENSION (TYPE)',
       'DIMENSION (CODE)', 'DIMENSION (NAME)', 'Numeric_x', 'Value', 'Low',
       'High', 'STATE', 'Numeric_y', 'S2_NDVI_existing', 'S3_NDVI_existing',
       'S3_Temperature_existing', 'Numeric', 'S2_NDVI_fake', 'S3_NDVI_fake',
       'S3_Temperature_fake', 'latitude', 'longitude'],
      dtype='object')
Final merged data with coordinates saved!

Final Merged Data (Sample):
             GHO (CODE)                                      GHO (DISPLAY)  \
0  MALARIA_TOTAL_CASES  Total number of  malaria cases (presumed + con...   
1   MALARIA_CONF_CASES                  Number of confirmed malaria cases   
2    MALARIA_MICR_TEST  Number of malaria suspects examined by microscopy   
3      MALARIA_RD

In [29]:
# Load the original epidemiological data
nigeria_df = pd.read_csv('./data/cleaned/nigeria_malaria_cases.csv')

# Merge with the fake data using the YEAR (DISPLAY) column
merged_df = pd.merge(nigeria_df, fake_df, on='YEAR (DISPLAY)', how='outer')

# Save the merged data
merged_df.to_csv('./data/cleaned/merged_data.csv', index=False)
print("Merged data saved to CSV!")

# Check the merged data
print("Merged Data (Sample):\n", merged_df.head())

Merged data saved to CSV!
Merged Data (Sample):
               GHO (CODE)                                      GHO (DISPLAY)  \
0  MALARIA_EST_MORTALITY  Estimated malaria mortality rate (per 100 000 ...   
1  MALARIA_EST_INCIDENCE  Estimated malaria incidence (per 1000 populati...   
2  MALARIA_EST_MORTALITY  Estimated malaria mortality rate (per 100 000 ...   
3  MALARIA_EST_INCIDENCE  Estimated malaria incidence (per 1000 populati...   
4  MALARIA_EST_INCIDENCE  Estimated malaria incidence (per 1000 populati...   

                                           GHO (URL)  YEAR (DISPLAY)  \
0  https://www.who.int/data/gho/data/indicators/i...            2000   
1  https://www.who.int/data/gho/data/indicators/i...            2000   
2  https://www.who.int/data/gho/data/indicators/i...            2001   
3  https://www.who.int/data/gho/data/indicators/i...            2001   
4  https://www.who.int/data/gho/data/indicators/i...            2002   

   STARTYEAR  ENDYEAR REGION (CODE) REGION 

In [42]:
import os
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import joblib

# Step 1: Load the data
try:
    merged_df = pd.read_csv('./data/cleaned/merged_data_with_coords.csv')
    print("Data loaded successfully!")
except Exception as e:
    print(f"Error loading data: {e}")
    exit()

# Quick data check
print("Columns available:", merged_df.columns.tolist())
print("Sample data:\n", merged_df.head())

# Step 2: Select features and target
features = ['S2_NDVI', 'S3_NDVI', 'S3_Temperature', 'YEAR (DISPLAY)']

# Fill missing values in target column
merged_df['Target'] = merged_df['Numeric_x'].fillna(merged_df['Numeric_y'])

# Handle missing values in features by simple imputation (mean)
for feature in features:
    if merged_df[feature].isnull().any():
        mean_value = merged_df[feature].mean()
        merged_df[feature].fillna(mean_value, inplace=True)
        print(f"Filled missing values in {feature} with mean: {mean_value:.4f}")

# Prepare input and output
X = merged_df[features]
y = merged_df['Target']

# Step 3: Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Train model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
print("Model training completed!")

# Step 5: Predict on test set
y_pred = model.predict(X_test)

# Step 6: Evaluate model
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f"Model RMSE: {rmse:.2f}")

# Step 7: Save model
os.makedirs('./models', exist_ok=True)
joblib.dump(model, './models/malaria_predictor.pkl')
print("Model saved as './models/malaria_predictor.pkl'!")



Data loaded successfully!
Columns available: ['GHO (CODE)', 'GHO (DISPLAY)', 'GHO (URL)', 'YEAR (DISPLAY)', 'STARTYEAR', 'ENDYEAR', 'REGION (CODE)', 'REGION (DISPLAY)', 'COUNTRY (CODE)', 'COUNTRY (DISPLAY)', 'DIMENSION (TYPE)', 'DIMENSION (CODE)', 'DIMENSION (NAME)', 'Numeric_x', 'Value', 'Low', 'High', 'STATE', 'Numeric_y', 'S2_NDVI_existing', 'S3_NDVI_existing', 'S3_Temperature_existing', 'Numeric', 'S2_NDVI_fake', 'S3_NDVI_fake', 'S3_Temperature_fake', 'latitude', 'longitude']
Sample data:
             GHO (CODE)                                      GHO (DISPLAY)  \
0  MALARIA_TOTAL_CASES  Total number of  malaria cases (presumed + con...   
1   MALARIA_CONF_CASES                  Number of confirmed malaria cases   
2    MALARIA_MICR_TEST  Number of malaria suspects examined by microscopy   
3      MALARIA_RDT_POS  Number of malaria positive cases by rapid diag...   
4     MALARIA_SUSPECTS                  Number of suspected malaria cases   

                                      

KeyError: 'S2_NDVI'

In [36]:
import folium

# Create base map centered on Nigeria
m = folium.Map(location=[9.0820, 8.6753], zoom_start=6)

# Add markers
for _, row in merged_df.iterrows():
    if not pd.isna(row['latitude']) and not pd.isna(row['longitude']):
        folium.CircleMarker(
            location=[row['latitude'], row['longitude']],
            radius=7,
            popup=f"{row['STATE']} ({row['YEAR (DISPLAY)']}): {int(row['Target'])} cases",
            color='red',
            fill=True,
            fill_color='red',
            fill_opacity=0.6
        ).add_to(m)

# Save to html file
m.save('./data/cleaned/malaria_map.html')
print("Map saved as malaria_map.html")


Prediction: [13457390.23354962]




In [43]:
import folium

# Create base map centered on Nigeria
m = folium.Map(location=[9.0820, 8.6753], zoom_start=6)

# Add markers
for _, row in merged_df.iterrows():
    if not pd.isna(row['latitude']) and not pd.isna(row['longitude']):
        folium.CircleMarker(
            location=[row['latitude'], row['longitude']],
            radius=7,
            popup=f"{row['STATE']} ({row['YEAR (DISPLAY)']}): {int(row['Target'])} cases",
            color='red',
            fill=True,
            fill_color='red',
            fill_opacity=0.6
        ).add_to(m)

# Save to html file
m.save('./data/cleaned/malaria_map.html')
print("Map saved as malaria_map.html")



Map saved as malaria_map.html


In [46]:
import streamlit as st
import joblib
import numpy as np

# Load model
model = joblib.load('./models/malaria_predictor.pkl')

st.title("Malaria Cases Predictor for Nigeria")

# Input fields
state = st.selectbox("Select State", ['Lagos', 'Kano', 'Kaduna', 'Rivers', 'Oyo', 'Enugu', 'Abuja', 'Borno', 'Sokoto', 'Ondo', 'Benue', 'Delta'])
year = st.slider("Select Year", 2015, 2020, 2018)

s2_ndvi = st.number_input("Enter Sentinel-2 NDVI", min_value=-1.0, max_value=1.0, value=0.2)
s3_ndvi = st.number_input("Enter Sentinel-3 NDVI", min_value=-1.0, max_value=1.0, value=0.1)
s3_temp = st.number_input("Enter Sentinel-3 Temperature (Kelvin)", min_value=250.0, max_value=320.0, value=290.0)

if st.button("Predict Malaria Cases"):
    # Prepare features array
    features = np.array([[s2_ndvi, s3_ndvi, s3_temp, year]])
    prediction = model.predict(features)[0]
    st.success(f"Predicted Malaria Cases: {int(prediction)}")

st.markdown("---")
st.markdown("Use the map (`./data/cleaned/malaria_map.html`) to view malaria cases by state and year.")




DeltaGenerator()