In this notebook we will guide you on how to generate a pandas database using the source data given by Prof. Kitamoto. To this end, we will be using the class `PDManager` from `pyphoon.db.pd_manager`.

In [1]:
import sys
sys.path.insert(0, '..')
from os.path import exists, join
from os import makedirs
from pyphoon.db import pd_manager

Let us first define a batch of paths, used to retrieve the original data and store the new data.

In [2]:
# Paths to source data (read-only inputs)
orig_images_dir = '../original_data/image/'
besttrack_dir = '../original_data/jma/'

# Path to new database files
db_dir = '../data/database'
# Create the database directory if it does not exist. `exist_ok=True` keeps the
# cell idempotent and avoids the check-then-create race of a separate exists() test.
makedirs(db_dir, exist_ok=True)

# Image output directories are derived from db_dir so that changing db_dir
# relocates them together with the pickle files below.
# Path where corrected images are to be stored
corrected_dir = join(db_dir, 'corrected')
# Path where generated images are to be stored
generated_dir = join(db_dir, 'generated')

# Pickle files (used to store dataframes)
images_pkl_path = join(db_dir, 'images.pkl')
corrupted_pkl_path = join(db_dir, 'corrupted.pkl')
besttrack_pkl_path = join(db_dir, 'besttrack.pkl')
missing_pkl_path = join(db_dir, 'missing.pkl')

## Original images database

Let us first create an instance of `PDManager` and load the original image data using its method `add_orig_images`. This method creates a table in `PDManager` with information about the original images. We then use the method `save_images` to store the generated table as a pickle file. In subsequent runs, we can directly use the method `load_images` to load the table from the previously stored pickle file.

In [3]:
# Manager object that holds the tables built throughout this notebook
pd_man = pd_manager.PDManager()

# Reuse the pickled images table when it is already on disk; otherwise build
# it from the original image directory and store it for later runs.
if exists(images_pkl_path):
    pd_man.load_images(images_pkl_path)
else:
    print('Images database file not found, creating new...')
    pd_man.add_orig_images(orig_images_dir)
    pd_man.save_images(images_pkl_path)
    print('Done.')

In [4]:
# Summarise the images table: index range, columns, dtypes and deep memory usage
pd_man.images.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 164627 entries, (197830, 1978-12-01 00:00:00) to (201727, 2017-12-26 06:00:00)
Data columns (total 3 columns):
directory    164627 non-null object
filename     164627 non-null object
size         164627 non-null int64
dtypes: int64(1), object(2)
memory usage: 26.2 MB


## Besttrack database

In [5]:
# Reuse the pickled besttrack table when it is already on disk; otherwise
# build it from the besttrack source directory and store it for later runs.
if exists(besttrack_pkl_path):
    pd_man.load_besttrack(besttrack_pkl_path)
else:
    print('Besttrack database file not found, creating new...')
    pd_man.add_besttrack(besttrack_dir)
    pd_man.save_besttrack(besttrack_pkl_path)
    print('Done.')

In [6]:
# Summarise the besttrack table: index range, columns, dtypes and deep memory usage
pd_man.besttrack.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 361579 entries, (195101, 1951-02-19 06:00:00) to (201726, 2017-12-23 18:00:00)
Data columns (total 16 columns):
class                 361579 non-null int64
latitude              361579 non-null float64
longitude             361579 non-null float64
pressure              361579 non-null float64
wind                  361579 non-null float64
gust                  361579 non-null float64
storm_direc           361579 non-null int64
storm_radius_major    361579 non-null int64
storm_radius_minor    361579 non-null int64
gale_direc            361579 non-null int64
gale_radius_major     361579 non-null int64
gale_radius_minor     361579 non-null int64
landfall              361579 non-null int64
speed                 361579 non-null int64
direction             361579 non-null int64
interpolated          361579 non-null int64
dtypes: float64(5), int64(11)
memory usage: 48.1 MB


## Generate new datasets

In [7]:
from pyphoon.clean_satellite.correction import correct_corrupted_pixels_1
from pyphoon.clean_satellite.detection import detect_corrupted_pixels_1
from pyphoon.clean_satellite.generation import generate_new_frames_1
from pyphoon.clean_satellite.fix import TyphoonListImageFixAlgorithm

In [8]:
# Define Fixing algorithm
fix_algorithm = TyphoonListImageFixAlgorithm(
    detect_fct=detect_corrupted_pixels_1,
    correct_fct=correct_corrupted_pixels_1,
    generate_fct=generate_new_frames_1,
    detect_params={'min_th': 160, 'max_th': 310},
    n_frames_th=3
)

In [None]:
from pyphoon.clean_satellite.fix import generate_new_image_dataset

# Directories passed as the corrected / generated image destinations
output_dirs = {
    'images_corrected_dir': corrected_dir,
    'images_generated_dir': generated_dir,
}

# Run the fixing algorithm over the original images; display=True enables
# the progress output shown below this cell.
generate_new_image_dataset(
    orig_images_dir, fix_algorithm, display=True, **output_dirs
)

197830
197901
197902
197903
197904
197905
197906
197907
197908
197909
197910
197911
197912
197913
197914
197915
197916
197917
197918
197919
197920
197921
197922
197923
198101
198102
198103
198104
198105
198106
198107
198108
198109
198110
198111
198112
198113
198114
198115
198116
198117
198118
198119
198120
198121
198122
198123
198124
198125
198126
198127
198128
198129
198201
198202
198203
198204
198205
198206
198207
198208
198209
198210
198211
198212
198213
198214
198215
198216
198217
198218
198219
198220
198221
198222
198223
198224
198225
198301
198302
198303
198304
198305
198306
198307
198308
198309
198310
198311
198312
198313
198314
198315
198316


In [None]:
# Reuse the pickled corrupted table when it is already on disk; otherwise
# build it from the directory of corrected images and store it for later runs.
if exists(corrupted_pkl_path):
    pd_man.load_corrupted(corrupted_pkl_path)
else:
    print('Corrupted database file not found, creating new...')
    pd_man.add_corrupted(images_dir=corrected_dir)
    pd_man.save_corrupted(corrupted_pkl_path)
    print('Done.')

In [18]:
# Summarise the corrupted table: index range, columns, dtypes and deep memory usage
pd_man.corrupted.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 4788 entries, (197901, 1979-01-03 16:00:00) to (201717, 2017-09-04 15:00:00)
Data columns (total 3 columns):
directory    4788 non-null object
filename     4788 non-null object
size         4788 non-null int64
dtypes: int64(1), object(2)
memory usage: 814.1 KB


In [None]:
# Update the manager using both the corrected and the original image directories.
# NOTE(review): `add_corarected_info` looks like a typo for `add_corrected_info`;
# confirm the method name against PDManager before running this cell.
pd_man.add_corarected_info(corrected_dir=corrected_dir, orig_images_dir=orig_images_dir)

In [16]:
# Preview the first rows of the corrupted table
pd_man.corrupted.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,corrupted,corruption
seq_no,obs_time,Unnamed: 2_level_1,Unnamed: 3_level_1
197901,1979-01-03 16:00:00,197901_1979010316,0.004311
197901,1979-01-06 21:00:00,197901_1979010621,0.002281
197901,1979-01-07 06:00:00,197901_1979010706,4e-06
197902,1979-03-22 03:00:00,197902_1979032203,0.000328
197902,1979-03-22 06:00:00,197902_1979032206,0.000103


In [17]:
# Persist the corrupted table to its pickle file. `save_corrupted` takes the
# destination path; `corrupted_pkl_path` is the pickle path defined in the
# paths cell (a bare `corrupted` name is never defined in this notebook).
pd_man.save_corrupted(corrupted_pkl_path)