In [1]:
import os
from os.path import join
from tqdm.notebook import tqdm
import multiprocessing

import numpy as np
import pandas as pd

# Filesystem layout: input directories and the Feather files written next to them
ROOT_DIR_PATH = '../D_WEATHER/input'
TRAIN_DIR_PATH, TEST_DIR_PATH = (
    join(ROOT_DIR_PATH, name) for name in ('train', 'test')
)
TRAIN_FEATHER_PATH, TEST_FEATHER_PATH = (
    join(ROOT_DIR_PATH, name) for name in ('train.ftr', 'test.ftr')
)

# Feature / target column names
INPUT_COL = [
    *(f"temp{i}" for i in range(1, 10)),
    'type', 'long_GMI', 'lat_GMI', 'long_DPR', 'lat_DPR',
]
TARGET_COL = ['precipitation']
TEST_COL = INPUT_COL                    # deliberately the same list object as INPUT_COL
TRAIN_COL = [*INPUT_COL, *TARGET_COL]

# Optional identifier columns (only appended when IS_PAD_NAME_COL is True)
IS_PAD_NAME_COL = False
NAME_COL = ['orbit', 'subset', 'pixel']
PIXEL_COL = np.arange(1, 1601).reshape(-1, 1)   # column vector holding 1..1600


def pad_name_cols(nd, file_name):
    """Append orbit, subset and pixel identifier columns to a 2-D array.

    Args:
        nd: 2-D array of shape (rows, features).
        file_name: File name made of exactly three '_'-separated parts; the
            second part is parsed as the orbit number and the first two
            characters of the third part as the subset number.

    Returns:
        A new array of shape (rows, features + 3) with the same dtype as
        ``nd``. The three added columns are, in order: orbit, subset, and a
        1-based pixel index (1..rows).

    Raises:
        ValueError: If ``file_name`` does not split into three parts or the
            orbit/subset parts are not integers.
    """
    orbit, subset = file_name.split('_')[1:]
    n_rows = nd.shape[0]
    # Build the identifier columns in nd's own dtype so the result is not
    # silently promoted (mixing float32 data with an int64 pixel column would
    # upcast everything to float64).
    name_cols = np.empty((n_rows, 3), dtype=nd.dtype)
    name_cols[:, 0] = int(orbit)
    name_cols[:, 1] = int(subset[:2])
    # Derive the pixel index from the actual row count rather than a fixed
    # 1600-row constant, so arrays of any height are supported.
    name_cols[:, 2] = np.arange(1, n_rows + 1)
    return np.concatenate([nd, name_cols], axis=1)

def generate_ndarray_from_file_name(file_name, dir_path):
    """Load one array file and flatten it to a 2-D float32 array.

    Args:
        file_name: Name of the file inside ``dir_path`` (loaded with np.load).
        dir_path: Directory containing the file.

    Returns:
        A float32 array of shape (rows, dim), where ``dim`` is the size of the
        last axis of the stored array and all leading axes are flattened into
        ``rows``. When IS_PAD_NAME_COL is true, the orbit/subset/pixel columns
        produced by pad_name_cols are appended.
    """
    file_path = join(dir_path, file_name)
    nd = np.load(file_path).astype(np.float32)  # 40 x 40 x ?
    nd = nd.reshape(-1, nd.shape[-1])           # 1600    x ?
    if IS_PAD_NAME_COL:
        # Fixed: this used to call the undefined name `pad_name_col`, which
        # raised NameError as soon as IS_PAD_NAME_COL was enabled.
        nd = pad_name_cols(nd, file_name)
    return nd
    
def generate_ndarray_from_dir_path(dir_path):
    """Load every file in a directory in parallel and stack the rows.

    Args:
        dir_path: Directory whose entries are each passed to
            generate_ndarray_from_file_name.

    Returns:
        One 2-D array: the per-file arrays concatenated along axis 0, in
        sorted file-name order.

    Raises:
        ValueError: If the directory contains no entries.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the result reproducible across runs and machines.
    file_names = sorted(os.listdir(dir_path))
    if not file_names:
        raise ValueError(f"no files found in directory: {dir_path!r}")
    # NOTE: the progress bar advances while the task list is built, not while
    # the workers load files.
    tasks = [(file_name, dir_path) for file_name in tqdm(file_names)]
    # The context manager shuts the worker processes down once starmap has
    # returned; previously every call left a pool of workers alive.
    with multiprocessing.Pool() as pool:
        nds = pool.starmap(generate_ndarray_from_file_name, tasks)
    return np.concatenate(nds)


def generate_dataframe_from_dir_path(dir_path):
    """Build a float32 DataFrame from all array files in a directory.

    The column names are chosen by matching the array width against the
    module-level TRAIN_COL and TEST_COL lists.

    Args:
        dir_path: Directory passed to generate_ndarray_from_dir_path.

    Returns:
        A DataFrame with float32 columns. When IS_PAD_NAME_COL is true, the
        NAME_COL columns are cast to int32 and the rows are sorted by them
        with a fresh 0..n-1 index.

    Raises:
        ValueError: If the array width matches neither TRAIN_COL nor TEST_COL.
    """
    nd = generate_ndarray_from_dir_path(dir_path)
    dim = nd.shape[-1]
    if dim == len(TRAIN_COL):
        columns = TRAIN_COL
    elif dim == len(TEST_COL):
        columns = TEST_COL
    else:
        # Fail with an explicit message instead of silently picking TEST_COL
        # and letting pandas report a shape mismatch.
        raise ValueError(
            f"unexpected number of columns {dim} in {dir_path!r}: expected "
            f"{len(TRAIN_COL)} (train) or {len(TEST_COL)} (test)"
        )
    df = pd.DataFrame(nd, columns=columns, dtype=np.float32)
    if IS_PAD_NAME_COL:
        # Use the shared NAME_COL constant rather than repeating its literal.
        # Only these columns are converted, which avoids copying the whole frame.
        df[NAME_COL] = df[NAME_COL].astype(np.int32)
        df = df.sort_values(by=NAME_COL, ignore_index=True)
    return df


def main():
    """Build the train and test DataFrames from their input directories.

    Side effects:
        Rebinds the module-level TRAIN_COL and TEST_COL so that they include
        NAME_COL exactly when IS_PAD_NAME_COL is true. The lists are rebuilt
        from INPUT_COL / TARGET_COL on every call, so running this function
        (or the notebook cell calling it) repeatedly does not keep appending
        NAME_COL.

    Returns:
        (train_df, test_df): frames built from TRAIN_DIR_PATH and
        TEST_DIR_PATH respectively.
    """
    global TRAIN_COL, TEST_COL
    name_cols = NAME_COL if IS_PAD_NAME_COL else []
    TRAIN_COL = INPUT_COL + TARGET_COL + name_cols
    TEST_COL = INPUT_COL + name_cols
    # Bind each result by the directory it came from. Classifying the frames by
    # column count could leave one of the two names unassigned.
    train_df = generate_dataframe_from_dir_path(TRAIN_DIR_PATH)
    test_df = generate_dataframe_from_dir_path(TEST_DIR_PATH)
    return train_df, test_df

def to_feather(train_df, test_df):
    """Write the train frame, then the test frame, to their Feather paths."""
    targets = (
        (train_df, TRAIN_FEATHER_PATH),
        (test_df, TEST_FEATHER_PATH),
    )
    for frame, path in targets:
        frame.to_feather(path)

def read_feather():
    """Load the train and test frames from their Feather paths.

    Returns:
        (train_df, test_df) as a tuple, read in that order.
    """
    return (
        pd.read_feather(TRAIN_FEATHER_PATH),
        pd.read_feather(TEST_FEATHER_PATH),
    )

!pip install memory-profiler

!pip install line-profiler

In [5]:
# Register the profiling extensions; memory_profiler supplies the %memit magic
# used in the cells below.
%load_ext memory_profiler
%load_ext line_profiler

In [6]:
%%time
# Build both DataFrames from the input directories, recording wall time (%%time)
# and peak memory (%memit).
%memit train_df, test_df = main()
# Author's measurement: 14s / 15GB
# NOTE(review): the saved output of this cell reports a wall time of 36.5 s —
# confirm which run the 14s figure refers to.

HBox(children=(FloatProgress(value=0.0, max=76345.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2416.0), HTML(value='')))


peak memory: 14509.57 MiB, increment: 14400.88 MiB
CPU times: user 2.55 s, sys: 8.49 s, total: 11 s
Wall time: 36.5 s


In [7]:
%%time
# Write both frames to their Feather paths, recording wall time and peak memory.
%memit to_feather(train_df, test_df)

peak memory: 21594.48 MiB, increment: 7220.54 MiB
CPU times: user 1min 32s, sys: 13.3 s, total: 1min 45s
Wall time: 57.8 s


In [8]:
%%time
# Reload both frames from the Feather files, recording wall time and peak memory.
# NOTE(review): train_df / test_df from the earlier cell stay referenced until this
# assignment completes, so the peak presumably includes both copies — consider
# `del train_df, test_df` beforehand if memory is tight.
%memit train_df, test_df = read_feather()

peak memory: 28330.79 MiB, increment: 13950.77 MiB
CPU times: user 3.48 s, sys: 6.71 s, total: 10.2 s
Wall time: 28.6 s


In [9]:
# Print column dtypes and memory usage, then display the first rows.
train_df.info()
train_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122152000 entries, 0 to 122151999
Data columns (total 15 columns):
temp1            float32
temp2            float32
temp3            float32
temp4            float32
temp5            float32
temp6            float32
temp7            float32
temp8            float32
temp9            float32
type             float32
long_GMI         float32
lat_GMI          float32
long_DPR         float32
lat_DPR          float32
precipitation    float32
dtypes: float32(15)
memory usage: 6.8 GB


Unnamed: 0,temp1,temp2,temp3,temp4,temp5,temp6,temp7,temp8,temp9,type,long_GMI,lat_GMI,long_DPR,lat_DPR,precipitation
0,276.345306,269.061676,275.768982,267.590973,278.918488,276.057617,267.718536,281.058044,276.29776,105.0,111.984665,27.743324,112.112061,27.672855,0.0
1,274.954651,266.84201,275.510803,266.518219,279.108368,276.858185,268.559814,282.791229,279.515442,105.0,112.028824,27.709145,112.112061,27.672855,0.0
2,273.050293,264.990906,274.355103,264.183319,278.565918,276.398529,267.664001,282.788483,279.462555,105.0,112.073425,27.675425,112.112061,27.672855,0.0
3,273.705475,264.647308,274.066589,263.972961,278.802643,275.811066,267.545715,282.927765,280.419617,102.0,112.118446,27.64217,112.091187,27.632555,0.0
4,273.936523,265.798096,275.803589,268.57196,279.572601,277.099731,269.820526,282.501099,280.374359,102.0,112.163895,27.609385,112.139694,27.607765,0.0


In [11]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
train_df.describe()

Unnamed: 0,temp1,temp2,temp3,temp4,temp5,temp6,temp7,temp8,temp9,type,long_GMI,lat_GMI,long_DPR,lat_DPR,precipitation
count,122152000.0,122152000.0,122152000.0,122152000.0,122152000.0,122152000.0,122152000.0,122152000.0,122152000.0,122152000.0,122152000.0,122152000.0,122152000.0,122152000.0,122152000.0
mean,197.3029,139.9296,217.1046,169.6791,239.5907,233.3365,192.1457,264.3873,245.8601,31.9632,139.5868,30.91759,139.5868,30.9175,-44.369
std,43.4096,72.34476,33.94657,58.50119,26.87159,24.32388,46.63368,18.97327,29.62355,54.10535,19.8127,14.27054,19.81248,14.27062,665.7127
min,101.6668,45.82756,166.6451,90.73691,168.6818,127.9539,119.6357,64.70033,63.10448,0.0,103.6093,4.078568,103.5896,2.442523,-9999.9
25%,169.5079,92.82653,192.6689,127.1468,221.565,215.6677,157.3114,253.3665,227.7993,0.0,122.4348,18.60782,122.4392,18.60756,0.0
50%,173.9428,97.10215,205.9318,146.6576,243.2162,226.467,174.8229,270.8615,255.6745,0.0,139.833,31.28326,139.8316,31.2829,0.0
75%,246.6735,226.7323,247.2664,232.5092,255.733,249.0251,233.3685,278.2477,267.5714,103.0,156.7187,43.45076,156.7155,43.45127,0.0
max,1576.792,1531.369,1173.939,1131.579,342.248,742.0804,452.5078,308.402,369.6122,322.0,176.284,55.84636,176.2885,55.82747,299.9978


In [12]:
# Shape of the subset of rows whose precipitation is negative (describe() reports
# a minimum of -9999.9 for this column).
# NOTE(review): presumably a missing-value sentinel — confirm against the dataset
# documentation before training on these rows.
train_df[train_df['precipitation']<0].shape

(543756, 15)