In [1]:
import requests
import pandas as pd
import numpy as np
import xarray as xr
import netCDF4 as nc
from datetime import datetime, timedelta
import os
import urllib.request
from urllib.parse import urljoin
import json
import time

In [54]:
import warnings
import requests
from datetime import datetime, timedelta
import json
import time
from geopy.geocoders import Nominatim
from pprint import pprint
import pandas as pd
from prophet import Prophet
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
import pickle as pk
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from xgboost import XGBClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from imblearn.pipeline import Pipeline as ImbPipeline
import joblib
from sklearn.metrics import (
            accuracy_score,
            precision_score, 
            recall_score,
            f1_score,
            roc_auc_score,
            confusion_matrix,
            precision_recall_curve,
            auc,
            mean_absolute_error,
            mean_squared_error,
            r2_score)
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

# Silence warnings whose message starts with
# "This Pipeline instance is not fitted yet" for the rest of the session.
warnings.filterwarnings("ignore", message="This Pipeline instance is not fitted yet")

class GLDASFetcher:
    """Fetch daily weather time series from the NASA POWER point API.

    Despite the class name, every request issued by the methods below goes to
    the NASA POWER REST endpoint. The stored username/password are kept only
    as attributes and are never sent by any method of this class.
    """

    # Endpoint for daily, single-point POWER queries.
    BASE_URL = "https://power.larc.nasa.gov/api/temporal/daily/point"

    # POWER parameter code -> column name used in the returned DataFrame.
    # Renaming by code (instead of by position) keeps the labels correct for
    # any subset of variables and for any column order in the API response.
    COLUMN_NAMES = {
        'T2M_MAX': 'temp_max',
        'T2M_MIN': 'temp_min',
        'QV2M': 'humidity_specific',
        'PS': 'pressure',
        'PRECTOTCORR': 'precipitation_total',
        'ALLSKY_SFC_SW_DWN': 'solar_radiation',
        'WS2M': 'wind_speed',
    }

    def __init__(self, username=None, password=None):
        """
        Initialize GLDAS/POWER fetcher (username/password only needed for GLDAS OPeNDAP,
        not for NASA POWER API).

        Args:
            username (str | None): Optional account name; stored, not used here.
            password (str | None): Optional account password; stored, not used here.
        """
        self.username = username
        self.password = password

        # GLDAS variable names mapped to NASA POWER parameters
        self.variables_map = {
            'temp': ['T2M_MAX', 'T2M_MIN'],           # Daily max/min temp
            'humidity': ['QV2M'],                     # Specific humidity
            'pressure': ['PS'],                       # Surface pressure
            'precipitation': ['PRECTOTCORR'],         # Corrected daily precipitation
            'solar_rad': ['ALLSKY_SFC_SW_DWN'],       # Solar radiation
            'wind_speed': ['WS2M']                    # Wind speed
        }

    def get_data(self, lat, lon, start_date, end_date, variables=None):
        """
        Fetch daily NASA POWER data for a given location & date range.

        Args:
            lat: Latitude of the point (sent to the API as given).
            lon: Longitude of the point (sent to the API as given).
            start_date (str): First day, "YYYY-MM-DD" (dashes are stripped).
            end_date (str): Last day, "YYYY-MM-DD" (dashes are stripped).
            variables (list | None): Keys of ``self.variables_map``. ``None``
                or an empty list selects every known variable; unknown keys
                are ignored.

        Returns:
            pandas.DataFrame: One row per day with a ``date`` column, one
            column per returned parameter (named via ``COLUMN_NAMES``), and
            ``lat``/``lon`` columns. Values are returned exactly as delivered
            by the API (including any missing-data fill values). An empty
            DataFrame is returned when nothing usable could be fetched.
        """
        variables = variables or list(self.variables_map.keys())

        # Collect POWER parameter codes: unknown variable names are skipped
        # and duplicates dropped while keeping the requested order.
        power_params = list(dict.fromkeys(
            code
            for var in variables
            for code in self.variables_map.get(var, [])
        ))
        if not power_params:
            # Nothing valid to ask for; skip the round-trip to the API.
            print("❌ No known variables requested")
            return pd.DataFrame()

        params = {
            'parameters': ','.join(power_params),
            'community': 'RE',
            'longitude': lon,
            'latitude': lat,
            'start': start_date.replace("-", ""),
            'end': end_date.replace("-", ""),
            'format': 'JSON'
        }

        print(f"🌍 Fetching NASA POWER data for ({lat}, {lon}) from {start_date} to {end_date} ...")
        response = requests.get(self.BASE_URL, params=params, timeout=60)

        if response.status_code != 200:
            print(f"❌ API error {response.status_code}")
            return pd.DataFrame()

        # A 200 response with an unexpected body is treated like an API error
        # instead of surfacing as a KeyError deep inside the parsing code.
        try:
            data = response.json()['properties']['parameter']
        except (ValueError, KeyError, TypeError):
            print("❌ Unexpected API response format")
            return pd.DataFrame()
        if not data:
            print("❌ API returned no parameter data")
            return pd.DataFrame()

        # `data` maps parameter code -> {"YYYYMMDD": value}; the constructor
        # aligns the per-parameter series on their date keys.
        df = pd.DataFrame(data)
        df.index = pd.to_datetime(df.index, format="%Y%m%d")  # Proper date parsing
        df = df.sort_index()

        # Requested parameters first (in request order), then anything extra
        # the API may have included, then rename codes to friendly names.
        ordered = [code for code in power_params if code in df.columns]
        extras = [col for col in df.columns if col not in ordered]
        df = (
            df[ordered + extras]
            .rename(columns=self.COLUMN_NAMES)
            .rename_axis("date")
            .reset_index()
        )

        # Add metadata
        df['lat'] = lat
        df['lon'] = lon

        print(f"✅ Retrieved {len(df)} daily records")
        return df

    def get_bulk_data(self, locations, start_date, end_date, variables=None):
        """
        Fetch data for multiple locations.

        Args:
            locations (list): [(lat, lon), ...]
            start_date (str): YYYY-MM-DD
            end_date (str): YYYY-MM-DD
            variables (list): Variables list

        Returns:
            dict: { "lat_<lat>_lon_<lon>": DataFrame }. Locations whose fetch
            came back empty are omitted.
        """
        results = {}
        for lat, lon in locations:
            df = self.get_data(lat, lon, start_date, end_date, variables)
            if not df.empty:
                results[f"lat_{lat}_lon_{lon}"] = df
        return results

    def to_csv(self, data, filename):
        """Export single DataFrame or dict of DataFrames to CSV.

        Args:
            data: A DataFrame, or a dict mapping a location label to a
                DataFrame. Dict entries are stacked into one table with an
                added ``location`` column holding the label.
            filename: Destination path passed to ``DataFrame.to_csv``.

        Raises:
            ValueError: If ``data`` is an empty dict (nothing to write).
        """
        if isinstance(data, dict):
            if not data:
                raise ValueError("No data to export: received an empty dict of DataFrames")
            combined = pd.concat(
                [df.assign(location=loc) for loc, df in data.items()],
                ignore_index=True,
            )
            combined.to_csv(filename, index=False)
        else:
            data.to_csv(filename, index=False)

    def get_location_by_address(self, address, max_retries=3):
        """Return location data from an address, retrying a bounded number of times.

        Args:
            address (str): Free-text address or place name to geocode.
            max_retries (int): Maximum number of geocoding attempts (at least
                one attempt is always made).

        Returns:
            dict: The ``raw`` payload of the geocoding result.

        Raises:
            ValueError: If the geocoder reports no match for ``address``.
                Retrying cannot help in that case.
            RuntimeError: If every attempt failed with an error; the last
                error is chained as the cause.
        """
        geolocator = Nominatim(user_agent="gldas_fetcher")
        last_error = None
        for _ in range(max(1, max_retries)):
            # Pause before every request so repeated lookups stay throttled.
            time.sleep(1)
            try:
                location = geolocator.geocode(address)
            except Exception as exc:  # not a bare except: Ctrl-C must still interrupt
                last_error = exc
                continue
            if location is None:
                raise ValueError(f"No geocoding result for address: {address!r}")
            return location.raw
        raise RuntimeError(
            f"Geocoding failed for address: {address!r}"
        ) from last_error

    @staticmethod
    def main(city=None, start_date="1984-01-01", end_date="2025-10-20",
             output_csv="nasa_daily_weather_data.csv"):
        """Example usage with NASA data.

        Geocodes a city, downloads its daily series, writes it to CSV and
        returns the table as read back from that CSV.

        Args:
            city (str | None): Place name to geocode; prompts on stdin when None.
            start_date (str): First day, "YYYY-MM-DD".
            end_date (str): Last day, "YYYY-MM-DD".
            output_csv (str): Path of the CSV file to write.

        Returns:
            pandas.DataFrame: The data reloaded from ``output_csv``, or an
            empty DataFrame when the download produced nothing.
        """
        import os  # local import keeps this cell independent of other cells

        print("🚀 NASA GLDAS Data Fetcher")
        print("=" * 50)

        # Credentials must never live in the notebook; read them from the
        # environment. They are optional because the POWER API needs none.
        fetcher = GLDASFetcher(
            username=os.environ.get("NASA_EARTHDATA_USERNAME"),
            password=os.environ.get("NASA_EARTHDATA_PASSWORD"),
        )
        if city is None:
            city = input("enter the city: ")

        location = fetcher.get_location_by_address(city)
        # The geocoder payload carries coordinates as strings.
        lat = float(location["lat"])
        lon = float(location["lon"])
        city_name = location['display_name']

        # Test with a single location
        print(f"\n🌍 Fetching DAILY data for {city_name}")

        data = fetcher.get_data(
            lat=lat,
            lon=lon,
            start_date=start_date,
            end_date=end_date,
            variables=['temp', 'humidity', 'pressure', 'precipitation', 'solar_rad', 'wind_speed']
        )

        if data.empty:
            print("❌")
            # Return the empty frame instead of touching an unassigned name.
            return data

        print(f"\n✅ SUCCESS! Retrieved {len(data)} real data points")
        print(f"📋 Columns: {list(data.columns)}")

        # Export the data, then reload it so the caller gets the CSV view.
        fetcher.to_csv(data, output_csv)
        return pd.read_csv(output_csv)



In [55]:
# Run the interactive example: it prompts for a city, downloads that city's
# daily series, writes nasa_daily_weather_data.csv and returns the table,
# which this cell displays as its output.
GLDASFetcher.main()

🚀 NASA GLDAS Data Fetcher


enter the city:  cairo



🌍 Fetching DAILY data for القاهرة, مصر
🌍 Fetching NASA POWER data for (30.0443879, 31.2357257) from 1984-01-01 to 2025-10-20 ...
✅ Retrieved 15269 daily records

✅ SUCCESS! Retrieved 15269 real data points
📋 Columns: ['date', 'temp_max', 'temp_min', 'humidity_specific', 'pressure', 'precipitation_total', 'solar_radiation', 'wind_speed', 'lat', 'lon']


Unnamed: 0,date,temp_max,temp_min,humidity_specific,pressure,precipitation_total,solar_radiation,wind_speed,lat,lon
0,1984-01-01,19.55,8.60,6.21,100.37,0.08,3.5004,2.82,30.044388,31.235726
1,1984-01-02,19.51,8.81,6.00,100.85,0.00,3.3655,2.24,30.044388,31.235726
2,1984-01-03,19.69,7.56,6.24,100.89,0.06,3.4342,1.74,30.044388,31.235726
3,1984-01-04,18.43,7.94,6.17,100.55,0.01,3.2314,1.86,30.044388,31.235726
4,1984-01-05,18.31,6.57,5.72,100.46,0.00,3.4630,1.35,30.044388,31.235726
...,...,...,...,...,...,...,...,...,...,...
15264,2025-10-16,30.29,17.13,8.90,99.84,0.00,5.2174,3.07,30.044388,31.235726
15265,2025-10-17,30.40,17.22,8.83,100.07,0.00,5.1578,3.06,30.044388,31.235726
15266,2025-10-18,30.90,16.57,8.30,100.24,0.00,5.2183,3.13,30.044388,31.235726
15267,2025-10-19,31.53,16.40,8.64,100.09,0.00,5.1418,2.88,30.044388,31.235726


In [56]:
# Reload the CSV written by GLDASFetcher.main() so later cells work from the
# file on disk rather than from the value returned by the previous cell.
df = pd.read_csv("nasa_daily_weather_data.csv")

In [57]:
# Display the reloaded frame as this cell's output.
df

Unnamed: 0,date,temp_max,temp_min,humidity_specific,pressure,precipitation_total,solar_radiation,wind_speed,lat,lon
0,1984-01-01,19.55,8.60,6.21,100.37,0.08,3.5004,2.82,30.044388,31.235726
1,1984-01-02,19.51,8.81,6.00,100.85,0.00,3.3655,2.24,30.044388,31.235726
2,1984-01-03,19.69,7.56,6.24,100.89,0.06,3.4342,1.74,30.044388,31.235726
3,1984-01-04,18.43,7.94,6.17,100.55,0.01,3.2314,1.86,30.044388,31.235726
4,1984-01-05,18.31,6.57,5.72,100.46,0.00,3.4630,1.35,30.044388,31.235726
...,...,...,...,...,...,...,...,...,...,...
15264,2025-10-16,30.29,17.13,8.90,99.84,0.00,5.2174,3.07,30.044388,31.235726
15265,2025-10-17,30.40,17.22,8.83,100.07,0.00,5.1578,3.06,30.044388,31.235726
15266,2025-10-18,30.90,16.57,8.30,100.24,0.00,5.2183,3.13,30.044388,31.235726
15267,2025-10-19,31.53,16.40,8.64,100.09,0.00,5.1418,2.88,30.044388,31.235726


In [58]:
# Per column, count the cells equal to -999.0
# (presumably the API's missing-data fill value; confirm against the POWER docs).
df.eq(-999.00).sum()

date                   0
temp_max               0
temp_min               0
humidity_specific      0
pressure               0
precipitation_total    0
solar_radiation        0
wind_speed             0
lat                    0
lon                    0
dtype: int64