# Mount Google Drive for data, install libs

In [None]:
import os
# Give the checkout a name that can be imported as a package (the notebook later imports
# from `HailProject`). Skip the rename once the target exists so the cell can be re-run.
if not os.path.isdir("HailProject"):
    os.rename("2022-Project-94", "HailProject")

In [1]:
# Mount Google Drive at /content/drive so the data and model files referenced below are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Install the GRIB readers (cfgrib, eccodes, ecmwflibs), xarray and CatBoost.
# `%pip` (instead of `!pip`) installs into the environment of the running kernel, and a
# single call lets pip resolve all requirements together.
# NOTE(review): versions are unpinned; the recorded run resolved cfgrib 0.9.10.1 and
# eccodes 1.4.2 - consider pinning for reproducibility.
%pip install cfgrib eccodes ecmwflibs xarray catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting cfgrib
  Downloading cfgrib-0.9.10.1-py3-none-any.whl (45 kB)
[K     |████████████████████████████████| 45 kB 2.6 MB/s 
Collecting eccodes>=0.9.8
  Downloading eccodes-1.4.2.tar.gz (55 kB)
[K     |████████████████████████████████| 55 kB 4.3 MB/s 
Collecting findlibs
  Downloading findlibs-0.0.2.tar.gz (6.2 kB)
Building wheels for collected packages: eccodes, findlibs
  Building wheel for eccodes (setup.py) ... [?25l[?25hdone
  Created wheel for eccodes: filename=eccodes-1.4.2-py3-none-any.whl size=39817 sha256=665901a37d07e90708ec889c3d94a1502ba89a3543870a19c6d5509e0c5fbc61
  Stored in directory: /root/.cache/pip/wheels/5a/c4/e7/37b9d4a30e03d404d4e2f9a280deea683d631f370384a7d500
  Building wheel for findlibs (setup.py) ... [?25l[?25hdone
  Created wheel for findlibs: filename=findlibs-0.0.2-py3-none-any.whl size=6560 sha256=873b0c2352ff1c4a276c29869f4a6200396ecc548db9711d

# Imports, paths, train data and train target

In [None]:
from HailProject.code.grid_model.model import train_model
import HailProject.code.src.data_processing as dp
import HailProject.code.src.prepare_target as pt

In [None]:
# Input folders on the mounted Drive; all paths are relative to the working directory.
era5_root = "drive/MyDrive/hail_data/ERA5_Texas"
aerology_path = era5_root + "/cutted_aerology"
land_path = era5_root + "/land"
runoff_path = era5_root + "/runoff_only"
# NOTE(review): the extra features are read from the same folder as the land data - confirm
# this is intentional and not a copy-paste slip.
extra_feature_path = land_path
target_path = "drive/MyDrive/hail_data/target_files"

In [None]:
# Assemble the training tensor from the four input folders (called with one_day=False)
# and report its shape.
feature_dirs = (aerology_path, land_path, runoff_path, extra_feature_path)
full_train_days = dp.prepare_full_train_data(*feature_dirs, one_day=False)

print("Training data:")
print("dims: (n_days, n_features, lat, long): ", full_train_days.shape)

Ignoring index file 'drive/MyDrive/hail_data/ERA5_Texas/runoff_only/adaptor.mars.internal-1657649580.0889146-9595-8-628a165f-5da8-4aad-a03f-d81307c30dbc.grib.923a8.idx' incompatible with GRIB file
Ignoring index file 'drive/MyDrive/hail_data/ERA5_Texas/cutted_aerology/adaptor.mars.internal-1657555514.9637768-23951-18-63c4f4e2-a2b7-47ca-927e-962f3990f7ea.grib.923a8.idx' incompatible with GRIB file


Training data:
dims: (n_days, n_features, lat, long):  (2192, 42, 41, 65)


In [None]:
# Build the target grid from the files under target_path.
# Presumably the two tuples are (lat_min, lat_max) and (lon_min, lon_max) of the study
# box - confirm against pt.prepare_target_grid.
lat_bounds = (27., 37.)
lon_bounds = (-109, -93)
target_grid = pt.prepare_target_grid(target_path, lat_bounds, lon_bounds)

print("Training target")
print("dims: (n_days, lat, long): ", target_grid.shape)

Training target
dims: (n_days, lat, long):  (2192, 41, 65)


In [None]:
# Save train data and target
import numpy as np

# Cache both arrays on Drive so later sessions can skip the GRIB preprocessing.
for array, destination in (
    (target_grid, "drive/MyDrive/hail_data/for_experiments/target_grid.npy"),
    (full_train_days, "drive/MyDrive/hail_data/for_experiments/full_train_days.npy"),
):
    np.save(destination, array)

In [None]:
# Load train data and target
import numpy as np

# Restore the cached target grid and training tensor from Drive, then report their shapes.
target_grid = np.load("drive/MyDrive/hail_data/for_experiments/target_grid.npy")
full_train_days = np.load("drive/MyDrive/hail_data/for_experiments/full_train_days.npy")

for heading, dims_label, array in (
    ("Training data:", "dims: (n_days, n_features, lat, long): ", full_train_days),
    ("Training target", "dims: (n_days, lat, long): ", target_grid),
):
    print(heading)
    print(dims_label, array.shape)

Training data:
dims: (n_days, n_features, lat, long):  (2192, 42, 41, 65)
Training target
dims: (n_days, lat, long):  (2192, 41, 65)


# Train model

In [None]:
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report

# Set to True to hold out everything after the first 1800 days and print a validation report.
# NOTE(review): with the default False this cell trains nothing and does not assign `model`.
split_val = False


def _cells_as_rows(grid):
    """Flatten a (n_days, n_features, ...) array into (n_cells, n_features) rows.

    The feature axis (axis 1) is moved to the end before flattening, so every row holds
    the features of exactly one cell and the rows follow the same order as
    `target.reshape(-1)` for a target shaped like `grid` without its feature axis.
    A 2-D input is returned unchanged. Note that the transpose forces a copy.
    """
    feature_last = (0, *range(2, grid.ndim), 1)
    return grid.transpose(feature_last).reshape(-1, grid.shape[1])


# NOTE: this definition shadows the `train_model` imported from
# HailProject.code.grid_model.model earlier in the notebook.
def train_model(train_data, target_grid, new_train: bool = True, model = None):
    """Fit a CatBoost classifier on per-cell samples and return it.

    Args:
        train_data: array with the feature axis at position 1; the notebook passes
            (n_days, n_features, lat, long).
        target_grid: labels shaped like `train_data` without the feature axis.
        new_train: when True a fresh GPU CatBoostClassifier is created and any
            `model` argument is ignored; when False the supplied `model` is fitted.
        model: estimator exposing `fit(x, y)`; required when `new_train` is False.

    Returns:
        The fitted model.

    Raises:
        ValueError: if `new_train` is False and no `model` was supplied.
    """
    if new_train:
        model = CatBoostClassifier(
            iterations=1000,
            learning_rate=0.01,
            task_type="GPU",
            scale_pos_weight=100
        )
    elif model is None:
        raise ValueError("model must be provided when new_train is False")
    # A plain reshape(-1, n_features) would slice the buffer into arbitrary chunks and
    # misalign the rows with the labels; the feature axis has to be moved last first.
    x_train = _cells_as_rows(train_data)
    y_train = target_grid.reshape(-1)
    model.fit(x_train, y_train)
    return model


if split_val:
    x_train, y_train = full_train_days[:1800], target_grid[:1800]
    x_val, y_val = full_train_days[1800:], target_grid[1800:]
    model = train_model(x_train, y_train)
    # Validation rows must be laid out exactly like the training rows.
    x_val = _cells_as_rows(x_val)
    y_val = y_val.reshape(-1)
    preds_to_val = model.predict(x_val)
    print(classification_report(y_val, preds_to_val))

In [None]:
# Persist the fitted model to Drive; a later cell reloads it from this same path.
# NOTE(review): `model` is only assigned by the training cell when split_val is True.
model.save_model("/content/drive/MyDrive/hail_data/model/model")

In [3]:
from catboost import CatBoostClassifier
# Restore the previously saved classifier from Drive instead of retraining.
saved_model_path = "/content/drive/MyDrive/hail_data/model/model"
model = CatBoostClassifier()
model.load_model(saved_model_path)

<catboost.core.CatBoostClassifier at 0x7fccb45bafd0>

In [5]:
import glob
import numpy as np
# Directory that inference_model() globs for *.npy inputs.
# Presumably one array per forecast year - confirm against the files on Drive.
dataset_path = "/content/drive/MyDrive/hail_data/CMIP/np_cmips_rcp85_r1i1p1"
def check_leap_year(year):
    """Return True when `year` is a leap year under the Gregorian rules, else False."""
    if year % 4 != 0:
        return False
    if year % 100 != 0:
        return True
    # Century years are leap years only when divisible by 400.
    return bool(year % 400 == 0)

def inference_model(model, dataset_path):
    """Predict a per-year map of the maximum positive-class probability.

    Every `*.npy` file in `dataset_path` (taken in sorted order) is loaded, each grid
    cell is scored with `model.predict_proba`, and the maximum of the class-1
    probability over axis 0 is kept for that file.

    Args:
        model: fitted classifier exposing `predict_proba`. It must have been fitted on
            rows built the same way as here: one row per cell, features as columns.
        dataset_path: directory holding the input arrays. Each array is expected to be
            4-D with the feature axis at position 1 and the two grid axes last -
            presumably (n_days, n_features, lat, lon) like the training data; confirm.

    Returns:
        Array of shape (n_files_used, lat, lon). At most 29 files are used, because
        the files are paired with the year labels 2022..2050.

    Raises:
        ValueError: if the directory contains no `.npy` file.
    """
    paths = sorted(glob.glob(dataset_path + "/*.npy"))
    if not paths:
        raise ValueError("no .npy files found in " + repr(dataset_path))
    # The year is only used as a progress label; files beyond the last label are skipped.
    years = range(2022, 2051)
    yearly_max = []
    for path, year in zip(paths, years):
        data = np.load(path)
        n_days, n_features, n_lat, n_lon = data.shape
        # Move the feature axis last before flattening so each row is one
        # (day, lat, lon) cell; reshaping directly would mix features with grid points.
        x_test = data.transpose(0, 2, 3, 1).reshape(-1, n_features)
        proba = model.predict_proba(x_test)[:, 1]
        # Rows are in C order over (day, lat, lon), so fold them back in that same order.
        proba = proba.reshape(n_days, n_lat, n_lon)
        yearly_max.append(proba.max(axis=0)[np.newaxis])
        print("Forecasted year: ", year)
    return np.concatenate(yearly_max, axis=0)

# Score every input array with the loaded model; `preds` stacks one map per processed file.
preds = inference_model(model, dataset_path)

Forecasted year:  2022
Forecasted year:  2023
Forecasted year:  2024
Forecasted year:  2025
Forecasted year:  2026
Forecasted year:  2027
Forecasted year:  2028
Forecasted year:  2029
Forecasted year:  2030
Forecasted year:  2031
Forecasted year:  2032
Forecasted year:  2033
Forecasted year:  2034
Forecasted year:  2035
Forecasted year:  2036
Forecasted year:  2037
Forecasted year:  2038
Forecasted year:  2039
Forecasted year:  2040
Forecasted year:  2041
Forecasted year:  2042
Forecasted year:  2043
Forecasted year:  2044
Forecasted year:  2045
Forecasted year:  2046
Forecasted year:  2047
Forecasted year:  2048
Forecasted year:  2049
Forecasted year:  2050


In [None]:
# Cache the stacked prediction maps on Drive (the path is relative to the working directory).
np.save("drive/MyDrive/hail_data/model/preds.npy", preds)

In [None]:
import pandas as pd
import numpy as np
# Reload the cached prediction maps. They were saved as "preds.npy", and np.load does not
# append the extension on its own, so the full file name has to be given.
preds = np.load("/content/drive/MyDrive/hail_data/model/preds.npy")
# change_prob() reads the maps through this module-level name.
predictions = preds

def cart_prod(x, y, z):
    """Return every (x, y, z) combination as the rows of an array.

    Rows are ordered with `x` varying slowest and `z` varying fastest.
    """
    triples = []
    for first in x:
        for second in y:
            for third in z:
                triples.append([first, second, third])
    return np.array(triples)

# Years covered by the animation dataset.
# NOTE(review): only 2022-2046 are used here although inference_model labels years up to
# 2050 - confirm the intended span.
_ANIMATION_YEARS = list(range(2022, 2047))

# Latitudes of the prediction grid; the position in this list is the index on axis 1 of
# `predictions`.
_LAT_GRID = [
    42.05582, 43.17731, 44.29879, 45.42028, 46.54176, 47.66325, 48.78473,
    49.90621, 51.02769, 52.14917, 53.27066, 54.39214, 55.51361, 56.63509,
    57.75657, 58.87804, 59.99952, 61.12099, 62.24246, 63.36393, 64.4854,
    65.60686, 66.72833, 67.84978, 68.97124, 70.09269, 71.21414, 72.33558,
    73.45701, 74.57843, 75.69984, 76.82124,
]

# Longitudes of the prediction grid: 154 values from 19.125 to 191.25 in steps of 1.125.
# Both numbers are exact binary fractions, so the generated values are identical to the
# hand-written literals they replace. The position is the index on axis 2 of `predictions`.
_LONG_GRID = [19.125 + 1.125 * step for step in range(154)]

# Coordinate -> array index tables, built once instead of once per row.
_YEAR_TO_IDX = {year: idx for idx, year in enumerate(_ANIMATION_YEARS)}
_LAT_TO_IDX = {lat: idx for idx, lat in enumerate(_LAT_GRID)}
_LONG_TO_IDX = {lon: idx for idx, lon in enumerate(_LONG_GRID)}


def change_prob(row):
    """Helper for building the animation dataset: fill in one row's probability.

    Args:
        row: record whose positions 0, 1 and 2 hold the year, longitude and latitude
            and whose position 3 receives the value. A pandas Series (as passed by
            `DataFrame.apply(axis=1)`) or a plain mutable sequence both work.

    Returns:
        The same `row`, modified in place, with position 3 set to
        `predictions[year_idx, lat_idx, lon_idx]` (module-level `predictions`).

    Raises:
        KeyError: if the year, latitude or longitude is not an exact grid value.
    """
    # Label-indexed Series must be addressed positionally through .iloc; integer keys on
    # such a Series are deprecated in recent pandas. Plain sequences are indexed directly.
    cells = getattr(row, "iloc", row)
    year, lon, lat = cells[0], cells[1], cells[2]
    cells[3] = predictions[_YEAR_TO_IDX[year], _LAT_TO_IDX[lat], _LONG_TO_IDX[lon]]
    return row


def create_dataset():
    """Create the dataset for the animation.

    Returns:
        DataFrame with one row per (year, lon, lat) combination of the grids above and
        the columns 'year', 'lon', 'lat' and 'Probability_of_hail', the last one filled
        by `change_prob`.
    """
    grid_points = cart_prod(_ANIMATION_YEARS, _LONG_GRID, _LAT_GRID)
    dataset = pd.DataFrame({
        'year': grid_points[:, 0],
        'lon': grid_points[:, 1],
        'lat': grid_points[:, 2],
    })
    dataset["Probability_of_hail"] = None
    return dataset.apply(change_prob, axis=1)

In [None]:
# Build the long-format (year, lon, lat, probability) table used for the animation.
dataset_for_drawing = create_dataset()

In [None]:
# Write the table to a CSV file in the working directory (the DataFrame index is written too).
# NOTE(review): the file name says "5years" but create_dataset() spans 2022-2046 - confirm.
dataset_for_drawing.to_csv("dataset_for_drawing_5years_sm.csv")