## Split CSV files into per-year qcv-format files

In [1]:
import os
import pandas as pd
import numpy as np
import glob
# importing required modules
from zipfile import ZipFile

In [2]:
# Folder that holds the source zip archives. The trailing slash is required
# because the glob pattern below is built by plain string concatenation.
zip_dir = 'D:/AmeriFlux/'
# Paths of every *.zip archive located directly inside zip_dir.
all_zipfiles = glob.glob(zip_dir + "*.zip")

In [3]:
# Root output folder; the processing loop creates one sub-folder per site under it.
dst_dir = './ameri_sites/'

In [4]:
# Variable names accepted as qcv column names. The list was copied from the C source
# "one_flux\ONEFlux\oneflux_steps\qc_auto\src\dataset.c" (const char *const var_names[]).
var_names = ["CO2","H2O","ZL","FC","FC_SSITC_TEST","H","H_SSITC_TEST","LE","LE_SSITC_TEST","USTAR",
"TR","SB","SC","SLE","SH","P","SW_OUT","SW_IN","NETRAD","SW_DIF","PPFD_IN","APAR","TA","PA","T_CANOPY","T_BOLE","TS","SWC","G",
"RH","WD","WS","TAU","LW_IN","NEE","VPD","itpVPD","itpSW_IN","itpPPFD_IN","itpTA","itpTS","itpSWC","itpP","itpRH","FETCH_FILTER",
]

In [5]:
# Map source columns to the column names accepted by qc_auto,
# e.g. CO2_1_1_1 -> CO2.
def pickup_col(df: pd.DataFrame, valid_names=None) -> dict:
    """Choose one source column of *df* for every accepted variable name.

    The first two columns of *df* are skipped (the callers in this file pass
    frames whose first two columns are the timestamps). Every other column
    name is matched against the accepted names as follows:

    * a column whose name is exactly an accepted name is always used for
      that name, replacing a qualified column picked earlier;
    * otherwise the name is split on ``_`` and the longest leading part that
      is an accepted name is used (``SW_IN_1_1_1`` -> ``SW_IN``,
      ``FC_SSITC_TEST_1_1_1`` -> ``FC_SSITC_TEST`` rather than ``FC``); the
      first such column wins for each accepted name;
    * columns that match nothing (for example a name without any
      underscore that is not accepted) are ignored.

    Args:
        df: Frame whose column labels are inspected; the data is not read.
        valid_names: Iterable of accepted names. Defaults to the module-level
            ``var_names`` list.

    Returns:
        Dict mapping accepted name -> name of the chosen column in *df*.
    """
    if valid_names is None:
        valid_names = var_names
    accepted = set(valid_names)  # O(1) membership tests inside the loop

    new_column = dict()
    for item in df.columns[2:]:
        if item in accepted:
            # An unqualified column takes precedence over any qualified one.
            new_column[item] = item
            continue

        tokens = item.split('_')
        # Try the longest proper prefix first so that multi-token names such
        # as X_SSITC_TEST are not swallowed by their one-token prefix X.
        for n_tokens in range(len(tokens) - 1, 0, -1):
            candidate = '_'.join(tokens[:n_tokens])
            if candidate in accepted:
                # Keep the first column found for this name.
                new_column.setdefault(candidate, item)
                break

    return new_column

In [6]:
# Split rows per year:
# return the year list plus the start and end row position of each year.
def split_records(src_data: pd.DataFrame) -> tuple:
    """Locate the contiguous run of rows that belongs to each year.

    The year of a row is the first four characters of its
    ``TIMESTAMP_START`` string. A new run starts whenever that value differs
    from the previous row's.

    Args:
        src_data: Frame with a string ``TIMESTAMP_START`` column.

    Returns:
        ``(years, start_pos, end_pos)`` - three lists of equal length holding
        the year string and the first and last (inclusive) positional row
        index of every run. All three are empty for an empty frame.
    """
    years = []
    start_pos = []
    end_pos = []

    last_idx = -1
    # Iterating the column once is far cheaper than a per-row .iloc lookup.
    for last_idx, stamp in enumerate(src_data['TIMESTAMP_START']):
        year = stamp[:4]
        if years and year == years[-1]:
            continue
        if years:
            # The previous run ends on the row just before this one; this is
            # also correct when that run holds a single row.
            end_pos.append(last_idx - 1)
        years.append(year)
        start_pos.append(last_idx)

    if years:
        end_pos.append(last_idx)

    return years, start_pos, end_pos

In [21]:
def create_vpd(row):
    """Derive the vapour pressure deficit for one record.

    Reads ``row['TA']`` and ``row['RH']``. When either equals the missing
    value -9999 the result is -9999; otherwise the saturation vapour
    pressure is computed from TA with an exponential (Tetens-style) formula,
    scaled by RH to get the actual vapour pressure, and the difference
    divided by 100 is returned rounded to 4 decimals.

    NOTE(review): the constants suggest TA in deg C, RH in % and a result in
    hPa - confirm against the data specification.
    """
    missing = -9999
    air_temp, rel_hum = row['TA'], row['RH']

    if air_temp == missing or rel_hum == missing:
        return round(missing, 4)

    saturation = 0.6108 * np.exp((17.27 * air_temp) / (air_temp + 237.3)) * 1000
    actual = saturation * rel_hum / 100.0
    return round((saturation - actual) / 100.0, 4)

In [22]:
def create_qcv_files(df: pd.DataFrame,site_info: pd.DataFrame, dst_name:str, site_name: str, record_interval: str):
    """Write one qcv CSV file per year found in *df*.

    Each output file is named ``<dst_name><site_name>_qcv_<year>.csv`` and
    consists of a block of ``key,value`` metadata lines (site, year, lat,
    lon, timezone, htower, timeres, sc_negl, notes) followed by the CSV
    table of the selected columns for that year.

    Args:
        df: Source records. Must contain string ``TIMESTAMP_START`` and
            ``TIMESTAMP_END`` columns; the other columns are selected and
            renamed via ``pickup_col``.
        site_info: Frame holding the row of this site with ``Latitude``,
            ``Longitude`` and ``UTC`` columns. Only its first row is used.
        dst_name: Output folder including the trailing path separator.
        site_name: Site identifier used in the file name and metadata.
        record_interval: Text written as the ``timeres`` metadata value.

    Raises:
        ValueError: If *site_info* has no rows.
    """
    if site_info.empty:
        raise ValueError('No site information found for site {}'.format(site_name))

    # site_info is normally a filtered slice that keeps its original row
    # labels, so the row must be taken by position, not by the label 0.
    latitude = site_info['Latitude'].iloc[0]
    longitude = site_info['Longitude'].iloc[0]
    utc_offset = site_info['UTC'].iloc[0]

    new_columns = pickup_col(df)

    # Invert the mapping: source column name -> accepted column name.
    selected_cols = {source: target for target, source in new_columns.items()}
    header = ['TIMESTAMP_START', 'TIMESTAMP_END'] + list(selected_cols)

    years, start_pos, end_pos = split_records(df)

    for year, start, end in zip(years, start_pos, end_pos):
        # file name format is like US-ARc_qcv_2005.csv
        file_name = dst_name + site_name + '_qcv_' + year + '.csv'
        _data = df[header].iloc[start: end + 1].rename(columns=selected_cols)

        if 'TA' in _data.columns and 'RH' in _data.columns:
            # Replaces any VPD column taken from the source with the derived one.
            _data['VPD'] = _data.apply(create_vpd, axis=1)
        else:
            print('...... TA or RH not available for {} {}, VPD is not derived.'.format(site_name, year))

        metadata = [
            'site,{}'.format(site_name),
            'year,{}'.format(year),
            'lat,{}'.format(latitude),
            'lon,{}'.format(longitude),
            'timezone,{}'.format(utc_offset),
            # NOTE(review): the tower height is hard-coded to 5 - confirm per site.
            'htower,{},{}'.format(_data.iloc[0]['TIMESTAMP_END'], 5),
            'timeres,{}'.format(record_interval),
            'sc_negl,1',
            'notes,202305041205 qc visual comparison SY',
        ]

        # newline='' stops Python from translating line endings a second time:
        # pandas already terminates rows with os.linesep when given a handle,
        # so the metadata lines use os.linesep as well.
        with open(file_name, 'w', newline='') as out_file:
            out_file.write(os.linesep.join(metadata) + os.linesep)
            _data.to_csv(out_file, index=False)

        print('...... Create file {} successful.'.format(file_name))

In [8]:
# Site metadata table; the columns used later are site_id, Latitude, Longitude and UTC.
pd_site = pd.read_csv('AmeriFlux_siteinfo.csv')
pd_site

Unnamed: 0,site_id,Latitude,Longitude,Elev,UTC
0,AR-TF1,-54.9733,-66.7335,40.0,-3.0
1,AR-TF2,-54.8269,-68.4549,60.0,-3.0
2,BR-CST,-7.9682,-38.3842,468.0,-3.0
3,BR-Npw,-16.4980,-56.4120,120.0,-4.0
4,CA-ARB,52.6950,-83.9452,90.0,-5.0
...,...,...,...,...,...
328,US-xUK,39.0404,-95.1921,335.0,-6.0
329,US-xUN,46.2339,-89.5373,518.0,-6.0
330,US-xWD,47.1282,-99.2414,579.0,-6.0
331,US-xWR,45.8205,-121.9519,407.0,-8.0


In [23]:
# Convert the CSV members of each zip archive into per-year qcv files.
for zip_file in all_zipfiles:
    # Archive names look like AMF_<site>_BASE-BADM_<version>.zip. Parse the
    # bare file name only, so dots or underscores in the folder path cannot
    # corrupt the site id.
    archive_stem = os.path.splitext(os.path.basename(zip_file))[0]
    stem_parts = archive_stem.split('_')
    if len(stem_parts) < 2:
        print('Skip zip file {}: no site id in its name.'.format(zip_file))
        continue
    site_name = stem_parts[1]
    site_info = pd_site[pd_site['site_id'] == site_name]

    dst_name = dst_dir + site_name + '/01_qc_visual/qcv_files/'
    os.makedirs(dst_name, exist_ok=True)

    with ZipFile(zip_file, 'r') as zip_folder:
        file_list = zip_folder.namelist()
        # Archives holding a single member are not processed.
        if len(file_list) > 1:
            for _file in file_list:
                if not _file.endswith('.csv'):
                    continue

                # Timestamps are read as text so they can be sliced; the
                # first two lines of the member are skipped before the header.
                with zip_folder.open(_file) as csv_member:
                    df = pd.read_csv(csv_member, dtype={'TIMESTAMP_START': str, 'TIMESTAMP_END': str}, skiprows=2)

                # A first record ending at HH:MM == 00:30 marks half-hourly data.
                if df.iloc[0]['TIMESTAMP_END'][-4:] == '0030':
                    record_interval = 'halfhourly'
                else:
                    record_interval = 'hourly'

                print('Process zip file {}........'.format(zip_file))
                create_qcv_files(df, site_info, dst_name, site_name, record_interval)

    # NOTE(review): only the first archive is processed because of this
    # break; remove it to convert every archive in all_zipfiles.
    break

Process zip file D:/AmeriFlux\AMF_AR-TF1_BASE-BADM_2-5.zip........
...... Create file ./ameri_sites/AR-TF1/01_qc_visual/qcv_files/AR-TF1_qcv_2016.csv successful.
...... Create file ./ameri_sites/AR-TF1/01_qc_visual/qcv_files/AR-TF1_qcv_2017.csv successful.
...... Create file ./ameri_sites/AR-TF1/01_qc_visual/qcv_files/AR-TF1_qcv_2018.csv successful.


### The code below is kept for easier debugging

In [None]:
# Load one extracted CSV directly for step-by-step debugging. Timestamps are
# read as text and the first two lines of the file are skipped, exactly as in
# the zip-processing loop.
src_file = 'US-Ne2/AMF_US-Ne2_BASE_HR_9-5.csv'
src_data = pd.read_csv(src_file, dtype={'TIMESTAMP_START':str, 'TIMESTAMP_END':str},skiprows=2)
src_data

In [None]:
src_data.iloc[0]['TIMESTAMP_END']

In [None]:
df = src_data

In [None]:
# Inline column mapping for debugging (same purpose as pickup_col()):
# accepted variable name -> chosen source column of df.
accepted_names = set(var_names)
new_column = dict()
for item in df.columns[2:]:
    if item in accepted_names:
        # An unqualified column always wins for its own name.
        new_column[item] = item
        continue

    tokens = item.split('_')
    # Longest proper prefix first, so X_SSITC_TEST_1_1_1 maps to X_SSITC_TEST
    # instead of X; names without any underscore simply yield no candidate.
    for n_tokens in range(len(tokens) - 1, 0, -1):
        candidate = '_'.join(tokens[:n_tokens])
        if candidate in accepted_names:
            # The first column found for a name is kept.
            new_column.setdefault(candidate, item)
            break

In [None]:
# Timestamp columns always lead the output header.
header = ['TIMESTAMP_START', 'TIMESTAMP_END']
# Invert new_column: source column name -> accepted variable name.
selected_cols = {source: target for target, source in new_column.items()}

In [None]:
selected_cols

In [None]:
# Append the selected source column names after the two timestamp columns.
header += list(selected_cols.keys())
header

In [None]:
# Number of non-null TIMESTAMP_END values, used as the record count.
rows = src_data['TIMESTAMP_END'].count()
rows

In [None]:
src_data.iloc[1]['TIMESTAMP_START'][:4]

In [None]:
# Inline year split for debugging: for every run of rows sharing the same
# year (first four characters of TIMESTAMP_START) record the year and the
# first and last (inclusive) row position.
years = []
start_pos = []
end_pos = []

for idx, stamp in enumerate(src_data['TIMESTAMP_START']):
    start_year = stamp[:4]
    if years and start_year == years[-1]:
        continue
    if years:
        # The previous run ends on the row before this one, which is also
        # right when that run contains a single row.
        end_pos.append(idx - 1)
    years.append(start_year)
    start_pos.append(idx)

# Close the last run; nothing to close when there were no rows at all.
if years:
    end_pos.append(idx)

In [None]:
years, start_pos, end_pos

In [None]:
# Debug export: write one plain CSV per year (no metadata lines), with
# floats formatted to five decimals.
site_name = 'US-Ne2'
for year, start, end in zip(years, start_pos, end_pos):
    # file name format is like US-ARc_qcv_2005.csv
    file_name = '{}_qcv_{}.csv'.format(site_name, year)
    year_rows = src_data[header].iloc[start: end + 1]
    _data = year_rows.rename(columns=selected_cols)
    target_path = './ameri_sites/US-Ne2/' + file_name
    _data.to_csv(target_path, float_format='%.5f', index=False)

In [None]:
selected_cols

In [None]:
# Small sample for experiments: rows at positions 100..200 (inclusive) with
# the selected columns renamed to the accepted variable names.
_tmp = src_data[header].iloc[100: 200+1].rename(columns=selected_cols)
_tmp

In [None]:
# Overwrite TA and RH with random values in [0, 1) to exercise create_vpd.
# The length 101 must equal the number of rows in _tmp (positions 100..200),
# which assumes the source has at least 201 rows.
_tmp['TA'] = np.random.rand(101)
_tmp['RH'] = np.random.rand(101)

In [None]:
# NOTE: running this cell rebinds create_vpd to a variant that does NOT round
# its result.
def create_vpd(row):
    """Derive the vapour pressure deficit for one record, without rounding.

    Returns -9999 when ``row['TA']`` or ``row['RH']`` equals -9999; otherwise
    the difference between the saturation and the actual vapour pressure,
    divided by 100.
    """
    air_temp, rel_hum = row['TA'], row['RH']
    if air_temp == -9999 or rel_hum == -9999:
        return -9999

    saturation = 0.6108 * np.exp((17.27 * air_temp) / (air_temp + 237.3)) * 1000
    actual = saturation * rel_hum / 100.0
    return (saturation - actual) / 100.0


# Apply the function row by row to fill the VPD column of the sample.
_tmp['VPD'] = _tmp.apply(create_vpd, axis=1)

In [None]:
_tmp.columns