In [1]:
import pandas as pd
import numpy as np

In [2]:
import os

precipitation_files = sorted(os.listdir("precipitation_csv/"))
solar_files = sorted(os.listdir("solar_radiation_csv/"))
temp_files = sorted(os.listdir("temperature_csv/"))

In [3]:
def prep_rain(f):
    df = pd.read_csv(f"precipitation_csv/{f}", header=10)
    country = "_".join(f.split("_")[:-6])

    df['AREA'] = country
    df = df[['AREA','YEAR', 
    'JAN', 'FEB', 'MAR', 
    'APR', 'MAY', 'JUN', 
    'JUL', 'AUG', 'SEP', 
    'OCT', 'NOV', 'DEC', 
    'ANN']]

    month_cols = ['JAN', 'FEB', 'MAR', 
    'APR', 'MAY', 'JUN', 
    'JUL', 'AUG', 'SEP', 
    'OCT', 'NOV', 'DEC']
    
    # Melt into long format
    df_long = df.melt(
        id_vars=["YEAR", "AREA"],
        value_vars=month_cols,
        var_name="MONTH",
        value_name="SUM"
    )
    
    # Clean month name: "jan_sum" → "jan"
    df_long["MONTH"] = df_long["MONTH"].map(lambda x: x.split("_")[0])
    
    # Optional: convert month abbreviations to numerical order
    month_map = {
        "JAN": 1, "FEB": 2, "MAR": 3, "APR": 4, "MAY": 5, "JUN": 6,
        "JUL": 7, "AUG": 8, "SEP": 9, "OCT": 10, "NOV": 11, "DEC": 12
    }
    df_long["MONTH"] = df_long["MONTH"].map(month_map)
    
    # Sort nicely
    df_long = df_long.sort_values(["AREA", "YEAR", "MONTH"])
    df_long = df_long.dropna()
    
    df_long["MONTH"] = df_long["MONTH"].astype(int).astype(str).str.zfill(2)
    df_long["YEAR"] = df_long["YEAR"].astype(str)
    df_long["DATE"] = pd.to_datetime(df_long["YEAR"] + "-" + df_long["MONTH"]) + pd.offsets.MonthEnd(0)
    df_long.columns = ['year', 'area', 'month', 'rain', 'date']
    df_long["rain"] = df_long["rain"].replace(-999, np.nan)
    
    return df_long.reset_index(drop=True)
    

In [4]:
def prep_solar(f):
    df = pd.read_csv(f"solar_radiation_csv/{f}", header=10)
    country = "_".join(f.split("_")[:-8])
    
    df['AREA'] = country
    df = df[['AREA','YEAR', 
    'JAN', 'FEB', 'MAR', 
    'APR', 'MAY', 'JUN', 
    'JUL', 'AUG', 'SEP', 
    'OCT', 'NOV', 'DEC', 
    'ANN']]

    month_cols = ['JAN', 'FEB', 'MAR', 
    'APR', 'MAY', 'JUN', 
    'JUL', 'AUG', 'SEP', 
    'OCT', 'NOV', 'DEC']
    
    # Melt into long format
    df_long = df.melt(
        id_vars=["YEAR", "AREA"],
        value_vars=month_cols,
        var_name="MONTH",
        value_name="SUM"
    )
    
    # Clean month name: "jan_sum" → "jan"
    df_long["MONTH"] = df_long["MONTH"].map(lambda x: x.split("_")[0])
    
    # Optional: convert month abbreviations to numerical order
    month_map = {
        "JAN": 1, "FEB": 2, "MAR": 3, "APR": 4, "MAY": 5, "JUN": 6,
        "JUL": 7, "AUG": 8, "SEP": 9, "OCT": 10, "NOV": 11, "DEC": 12
    }
    df_long["MONTH"] = df_long["MONTH"].map(month_map)
    
    # Sort nicely
    df_long = df_long.sort_values(["AREA", "YEAR", "MONTH"])
    df_long = df_long.dropna()
    
    df_long["MONTH"] = df_long["MONTH"].astype(int).astype(str).str.zfill(2)
    df_long["YEAR"] = df_long["YEAR"].astype(str)
    df_long["DATE"] = pd.to_datetime(df_long["YEAR"] + "-" + df_long["MONTH"]) + pd.offsets.MonthEnd(0)
    df_long.columns = ['year', 'area', 'month', 'solar', 'date']
    df_long["solar"] = df_long["solar"].replace(-999, np.nan)


    return df_long.reset_index(drop=True)
    

In [5]:
def prep_temp(f):
    df = pd.read_csv(f"temperature_csv/{f}", header=10)
    country = "_".join(f.split("_")[:-5])
    
    df['AREA'] = country
    df = df[['AREA','YEAR', 
    'JAN', 'FEB', 'MAR', 
    'APR', 'MAY', 'JUN', 
    'JUL', 'AUG', 'SEP', 
    'OCT', 'NOV', 'DEC', 
    'ANN']]

        # Identify month columns (all *_sum except annual_sum)
    month_cols = ['JAN', 'FEB', 'MAR', 
    'APR', 'MAY', 'JUN', 
    'JUL', 'AUG', 'SEP', 
    'OCT', 'NOV', 'DEC', 
    'ANN']
    
    # Melt into long format
    df_long = df.melt(
        id_vars=["YEAR", "AREA"],
        value_vars=month_cols,
        var_name="MONTH",
        value_name="SUM"
    )
    
    # Clean month name: "jan_sum" → "jan"
    df_long["MONTH"] = df_long["MONTH"].map(lambda x: x.split("_")[0])
    
    # Optional: convert month abbreviations to numerical order
    month_map = {
        "JAN": 1, "FEB": 2, "MAR": 3, "APR": 4, "MAY": 5, "JUN": 6,
        "JUL": 7, "AUG": 8, "SEP": 9, "OCT": 10, "NOV": 11, "DEC": 12
    }
    df_long["MONTH"] = df_long["MONTH"].map(month_map)
    
    # Sort nicely
    df_long = df_long.sort_values(["AREA", "YEAR", "MONTH"])
    df_long = df_long.dropna()
    
    df_long["MONTH"] = df_long["MONTH"].astype(int).astype(str).str.zfill(2)
    df_long["YEAR"] = df_long["YEAR"].astype(str)
    df_long["DATE"] = pd.to_datetime(df_long["YEAR"] + "-" + df_long["MONTH"]) + pd.offsets.MonthEnd(0)
    df_long.columns = ['year', 'area', 'month', 'temp', 'date']
    df_long["temp"] = df_long["temp"].replace(-999, np.nan)


    return df_long.reset_index(drop=True)
    

In [6]:
rain_data = pd.concat([prep_rain(f) for f in precipitation_files])
solar_data = pd.concat([prep_solar(f) for f in solar_files])
temp_data = pd.concat([prep_temp(f) for f in temp_files])

In [11]:
nasa_df = rain_data.merge(
    solar_data, on=['year', 'area', 'date', 'month']
).merge(
    temp_data, on=['year', 'area', 'date', 'month']
)

In [14]:
nasa_df = nasa_df[['date', 'area', 'rain', 'solar', 'temp']]
nasa_df.to_parquet('nasa_df.parquet')

In [70]:
nasa_df

Unnamed: 0,date,area,rain,solar,temp
0,1981-01-31,Afghanistan,55.53,,-0.95
1,1981-02-28,Afghanistan,85.20,,0.97
2,1981-03-31,Afghanistan,66.13,,6.18
3,1981-04-30,Afghanistan,23.64,,13.07
4,1981-05-31,Afghanistan,23.92,,17.61
...,...,...,...,...,...
107839,2023-08-31,Zimbabwe,0.10,19.47,18.23
107840,2023-09-30,Zimbabwe,2.65,22.55,23.64
107841,2023-10-31,Zimbabwe,137.78,23.96,24.80
107842,2023-11-30,Zimbabwe,27.16,26.16,26.37


In [91]:
yield_crop = pd.read_csv('yield final.csv')

In [95]:
yield_crop = yield_crop[['Area', 'Item', 'Year', 'Yield (kg/ha)']]

In [97]:
yield_crop.columns = ['area', 'item', 'year', 'label']

In [100]:
from datetime import date

In [101]:
yield_crop['area'] = yield_crop['area'].str.replace(' ', '_')
yield_crop['year'] = yield_crop['year'].map(lambda x: date(int(x), 12, 31))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  yield_crop['area'] = yield_crop['area'].str.replace(' ', '_')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  yield_crop['year'] = yield_crop['year'].map(lambda x: date(int(x), 12, 31))


In [105]:
yield_crop.to_parquet('label_yield.parquet')

In [106]:
yield_crop

Unnamed: 0,area,item,year,label
0,Afghanistan,Maize (corn),1970-12-31,1475.7
1,Afghanistan,Maize (corn),1971-12-31,1340.0
2,Afghanistan,Maize (corn),1972-12-31,1565.2
3,Afghanistan,Maize (corn),1973-12-31,1617.0
4,Afghanistan,Maize (corn),1974-12-31,1617.0
...,...,...,...,...
89255,Zimbabwe,Watermelons,2019-12-31,25000.0
89256,Zimbabwe,Watermelons,2020-12-31,36000.0
89257,Zimbabwe,Watermelons,2021-12-31,31377.0
89258,Zimbabwe,Watermelons,2022-12-31,33841.3
