-
Notifications
You must be signed in to change notification settings - Fork 7
/
SIRTA_OTT2.py
139 lines (126 loc) · 5.41 KB
/
SIRTA_OTT2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
#!/usr/bin/env python3
# -----------------------------------------------------------------------------.
# Copyright (c) 2021-2023 DISDRODB developers
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# -----------------------------------------------------------------------------.
from disdrodb.l0 import run_l0a
from disdrodb.l0.l0_reader import is_documented_by, reader_generic_docstring
@is_documented_by(reader_generic_docstring)
def reader(
    raw_dir,
    processed_dir,
    station_name,
    # Processing options
    force=False,
    verbose=False,
    parallel=False,
    debugging_mode=False,
):
    """Configure the raw-file parsing for this station and call ``run_l0a``."""
    ##------------------------------------------------------------------------.
    #### - Define column names
    # Every raw line is initially read as one single string column.
    # The actual variables are extracted later inside the sanitizer.
    column_names = ["TO_SPLIT"]

    ##------------------------------------------------------------------------.
    #### - Define reader options
    reader_kwargs = {
        # - Define delimiter
        "delimiter": "\\n",
        # - Avoid first column to become df index !!!
        "index_col": False,
        # - Do not infer the column names from the file content
        "header": None,
        # - Skip the first row (header)
        "skiprows": 1,
        # - Define behaviour when encountering bad lines
        "on_bad_lines": "skip",
        # - Define encoding
        "encoding": "latin1",
        # - Define reader engine
        #   - C engine is faster
        #   - Python engine is more feature-complete
        "engine": "python",
        # - Define on-the-fly decompression of on-disk data
        #   - Available: gzip, bz2, zip
        "compression": "infer",
        # - Strings to recognize as NA/NaN and replace with standard NA flags
        #   - Already included: '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN',
        #                       '-NaN', '-nan', '1.#IND', '1.#QNAN', '<NA>', 'N/A',
        #                       'NA', 'NULL', 'NaN', 'n/a', 'nan', 'null'
        "na_values": ["na", "", "error"],
    }

    ##------------------------------------------------------------------------.
    #### - Define dataframe sanitizer function for L0 processing
    def df_sanitizer_fun(df):
        """Split the single raw column into variables and clean the spectrum."""
        # Import pandas
        import pandas as pd

        # The ';' character separates both the variables and the values of
        # the raw spectrum. Splitting at most 16 times gives each of the first
        # 16 fields its own column and leaves the remainder of the line
        # (the spectrum) untouched in the last column.
        df = df["TO_SPLIT"].str.split(";", expand=True, n=16)

        # Name the columns obtained from the split
        df.columns = [
            "date",
            "time",
            "rainfall_rate_32bit",
            "rainfall_accumulated_32bit",
            "reflectivity_32bit",
            "mor_visibility",
            "laser_amplitude",
            "number_particles",
            "sensor_temperature",
            "sensor_heating_current",
            "sensor_battery_voltage",
            "rain_kinetic_energy",
            "snowfall_rate",
            "weather_code_synop_4680",
            "weather_code_metar_4678",
            "weather_code_nws",
            "raw_drop_number",
        ]

        # Merge date and time into a single datetime column
        # - Values that do not match the format become NaT (errors="coerce")
        timestamp = df["date"] + "-" + df["time"]
        df["time"] = pd.to_datetime(timestamp, format="%Y/%m/%d-%H:%M:%S", errors="coerce")
        df = df.drop(columns=["date"])

        # Preprocess the raw spectrum
        # - '<SPECTRUM>ZERO</SPECTRUM>' indicates no drops detected and is
        #   replaced first by "" (--> array of zeros in L0B processing)
        # - Then the remaining <SPECTRUM> and </SPECTRUM> tags are removed
        spectrum = df["raw_drop_number"]
        for tag in ("<SPECTRUM>ZERO</SPECTRUM>", "<SPECTRUM>", "</SPECTRUM>"):
            spectrum = spectrum.str.replace(tag, "")

        # Add 0 before every ; if ; not preceded by a digit
        # Example: ';;1;;' --> '0;0;1;0;'
        spectrum = spectrum.str.replace(r"(?<!\d);", "0;", regex=True)
        df["raw_drop_number"] = spectrum
        return df

    ##------------------------------------------------------------------------.
    #### - Define glob pattern to search data files in <raw_dir>/data/<station_name>
    glob_patterns = "*.txt"  # Select the files with .txt extension

    ####----------------------------------------------------------------------.
    #### - Create L0A products
    run_l0a(
        raw_dir=raw_dir,
        processed_dir=processed_dir,
        station_name=station_name,
        # Custom arguments of the reader for L0A processing
        glob_patterns=glob_patterns,
        column_names=column_names,
        reader_kwargs=reader_kwargs,
        df_sanitizer_fun=df_sanitizer_fun,
        # Processing options
        force=force,
        verbose=verbose,
        parallel=parallel,
        debugging_mode=debugging_mode,
    )