In [3]:
# Import
import os
import sys
sys.path.append(os.path.dirname(os.path.abspath(os.path.dirname('/home/user/workdir/main/src/'))))

import numpy as np
import pandas as pd
import netCDF4 as nc
from netCDF4 import Dataset

from sklearn.model_selection import train_test_split
import tensorflow as tf
from src.model.cmaqnet_cond_unet import build_model
from sklearn.metrics import mean_squared_error, r2_score

import geopandas as gpd
from shapely.geometry import Point
import matplotlib as mpl
import matplotlib.pyplot as plt

2025-02-10 17:27:36.597444: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-02-10 17:27:36.627663: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-02-10 17:27:36.627694: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-02-10 17:27:36.627713: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-10 17:27:36.633734: I tensorflow/core/platform/cpu_feature_g

In [4]:
# Map parameters used when rendering 2D map results.
# PROJ string of the Lambert Conformal Conic projection used for the grid.
proj = '+proj=lcc +lat_1=30 +lat_2=60 +lon_1=126 +lat_0=38 +lon_0=126 +ellps=GRS80 +units=m'

# Position in the region list used by get_region_pixel_indices()
# (0..16) -> single-letter region code.
atob = dict(enumerate('GFKJEDOCAQPBMLNIH'))

# Single-letter region code -> English region name.
region_columns = dict(
    A='Seoul City',
    B='Incheon City',
    C='Busan City',
    D='Daegu City',
    E='Gwangju City',
    F='Gyeonggi-do',
    G='Gangwon-do',
    H='Chungbuk-do',
    I='Chungnam-do',
    J='Gyeongbuk-do',
    K='Gyeongnam-do',
    L='Jeonbuk-do',
    M='Jeonnam-do',
    N='Jeju-do',
    O='Daejeon City',
    P='Ulsan City',
    Q='Sejong City',
)

def get_ctprvn_map(path: str = '/home/user/workdir/main/resources/geom/ctp_rvn.shp') -> gpd.GeoDataFrame:
    """Load the province boundary shapefile and tag it as EPSG:5179.

    Parameters
    ----------
    path : str, optional
        Location of the shapefile. Defaults to the path this notebook has
        always used, so existing zero-argument calls behave as before.

    Returns
    -------
    gpd.GeoDataFrame
        The shapefile contents, read with the cp949 encoding, with the CRS
        set to EPSG:5179.
    """
    ctprvn = gpd.read_file(path, encoding='cp949')
    # Only relabel the CRS (no coordinate transformation). `set_crs` with
    # allow_override=True works whether or not the file already declares a
    # CRS, whereas assigning to `.crs` to override an existing CRS is
    # deprecated in newer geopandas releases.
    return ctprvn.set_crs('EPSG:5179', allow_override=True)

def get_base_raster(ctprvn:gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """Build the point grid used to assign model pixels to regions.

    The grid has 67 x-positions and 82 y-positions spaced 9000 units apart,
    starting at (-180000, -585000), expressed in the module-level `proj` CRS.
    Points are generated x-major: point number ``k`` has x-step ``k // 82``
    and y-step ``k % 82``.

    Parameters
    ----------
    ctprvn : gpd.GeoDataFrame
        Kept for interface compatibility. The grid CRS is the constant
        `proj`, so the boundary layer is no longer reprojected just to read
        that CRS back.

    Returns
    -------
    gpd.GeoDataFrame
        Columns: ``geometry`` (Point), ``x_m``, ``y_m`` (point coordinates),
        ``value`` (initialised to 0) and ``index`` (copy of the row index).
    """
    x_origin, y_origin = -180000, -585000
    cell_size = 9000
    n_cols, n_rows = 67, 82

    points = [Point(x_origin + cell_size * i, y_origin + cell_size * j)
              for i in range(n_cols)
              for j in range(n_rows)]
    # Assign the projection directly instead of reprojecting `ctprvn` to
    # obtain the very same CRS object.
    grid_data = gpd.GeoDataFrame(geometry=points, crs=proj)
    grid_data['x_m'] = grid_data.geometry.x
    grid_data['y_m'] = grid_data.geometry.y
    grid_data['value'] = 0
    grid_data['index'] = grid_data.index
    return grid_data

def get_region_pixel_indices() -> list:
    """Return, for every region, the grid pixels that fall inside it.

    The province polygons are spatially joined (predicate ``contains``)
    against the base raster points, and each matched point number is decoded
    into a ``(point % 82, point // 82)`` tuple.

    Returns
    -------
    list
        One list per region, in the order of the local `cities` mapping
        (keys 0..16). Each inner list holds ``(y_step, x_step)`` tuples of
        the grid points contained in that region.
    """
    ctprvn = get_ctprvn_map()
    grid_data = get_base_raster(ctprvn)

    cities = {
        0: '강원도', 1: '경기도', 2: '경상남도', 3: '경상북도',
        4: '광주광역시', 5: '대구광역시', 6: '대전광역시', 7: '부산광역시',
        8: '서울특별시', 9: '세종특별자치시', 10: '울산광역시', 11: '인천광역시',
        12: '전라남도', 13: '전라북도', 14: '제주특별자치도', 15: '충청남도',
        16: '충청북도'
    }

    # NOTE(review): the joined column holding the grid row label is expected
    # to be called 'index_right0'; that name is produced by geopandas' sjoin
    # and may differ between geopandas versions -- confirm when upgrading.
    gdf_joined_loc = ['CTPRVN_CD', 'CTP_ENG_NM', 'CTP_KOR_NM', 'index_right0']
    gdf_joined = gpd.sjoin(ctprvn, grid_data.to_crs(5179), predicate='contains')

    # Left merge keeps every grid point; dropna() then removes the points
    # that matched no region.
    indices = gpd.GeoDataFrame(pd.merge(
        left=grid_data, right=gdf_joined.loc[:, gdf_joined_loc],
        how='left', left_on='index', right_on='index_right0'
    ), geometry='geometry').dropna()

    n_rows = 82  # number of y-positions in the base raster
    pixel_indices = []
    for region_name in cities.values():
        # Decode from the preserved 'index' column rather than the merged
        # frame's row label: the label only equals the grid point number
        # when the merge adds no duplicate rows (e.g. a point contained by
        # two overlapping polygons would shift every later label).
        in_region = indices.CTP_KOR_NM == region_name
        point_ids = indices.loc[in_region, 'index'].astype(int).tolist()
        pixel_indices.append(
            [(point_id % n_rows, point_id // n_rows) for point_id in point_ids]
        )
    return pixel_indices

ctprvn = get_ctprvn_map()
ctprvn_proj = ctprvn.to_crs(proj)

# National grid allocation provided by Konkuk University (Seoul National University).
grid_alloc = pd.read_csv('/home/user/workdir/main/resources/geom/grid_allocation.csv')
# Keep a single entry per (Row, Column): the one with the largest Ratio.
grid_alloc = grid_alloc.sort_values(by=['Row', 'Column', 'Ratio'], ascending=[True, True, False])
grid_alloc = grid_alloc.drop_duplicates(subset=['Row', 'Column'], keep='first')
grid_alloc = grid_alloc.reset_index(drop=True)

# National grid information derived from the government-distributed boundaries.
pixel_indices = get_region_pixel_indices()
total_index = []
for idx, grids in enumerate(pixel_indices):
    region_code = atob[idx]
    region_name = region_columns[region_code]
    total_index.extend(
        [grid[1], grid[0], 100.0, region_code, region_name] for grid in grids
    )
total_index = pd.DataFrame(total_index, columns=grid_alloc.columns)

# Each source leaves one or two pixels empty, so both are combined and the
# duplicated pixels removed to obtain the final national grid.
# NOTE(review): `total_index` coordinates come from 0-based pixel tuples while
# the `- 1` below is applied to both sources -- confirm both use the same base.
grid_alloc = pd.concat([
    grid_alloc.drop(columns=['Ratio', 'Region_Name']),
    total_index.drop(columns=['Ratio', 'Region_Name']),
])
grid_alloc = grid_alloc.sort_values(by=['Region_Code']).drop_duplicates().reset_index(drop=True)
grid_alloc[['Row', 'Column']] -= 1

row_indices, col_indices = zip(*grid_alloc[['Row', 'Column']].to_numpy())
offset_x, offset_y = 4500, 4500  # offsets used to align the map position

# Mask: 1 for pixels listed in grid_alloc, 0 elsewhere.
mask = np.zeros((82, 67))
mask[row_indices, col_indices] = 1

# 'jet' colormap with values below the colour range drawn in white.
cmap_white = mpl.colormaps['jet']
cmap_white.set_under('white')

In [13]:
def find_outliers_iqr(series, iqr_factor=1.5):
    """Flag values lying outside the IQR fences of a Series.

    Parameters
    ----------
    series : pd.Series
        Values to test.
    iqr_factor : float, optional
        Multiplier applied to the inter-quartile range when building the
        fences. Defaults to 1.5; a larger value such as 3.0 widens the
        fences, so fewer values are flagged.

    Returns
    -------
    pd.Series (dtype=bool)
        True where the value is below ``Q1 - iqr_factor * IQR`` or above
        ``Q3 + iqr_factor * IQR``.
    """
    first_quartile, third_quartile = series.quantile([0.25, 0.75])
    margin = iqr_factor * (third_quartile - first_quartile)

    too_low = series.lt(first_quartile - margin)
    too_high = series.gt(third_quartile + margin)
    return too_low | too_high

In [14]:
# Load the control-scenario table and keep a separate copy of the columns
# whose name contains "O3_ALL".
df = pd.read_csv('/home/user/workdir/main/resources/ctrl/precursor_control_2019_4input_scaled_o3.csv')
o3_cols = list(filter(lambda column: "O3_ALL" in column, df.columns))
df_o3 = df.loc[:, o3_cols].copy()

In [16]:
# Run the IQR test on every O3 column, then flag a row when at least one of
# its columns is an outlier. Building the mask with apply(...).any(axis=1)
# always yields a boolean Series aligned with df_o3's index, instead of an
# ndarray that is turned into a Series named after the first column by `|=`.
overall_outliers_mask = df_o3.apply(find_outliers_iqr, iqr_factor=1.5).any(axis=1)

# Rows where at least one O3 column falls outside its IQR fences.
df_outliers_o3 = df_o3[overall_outliers_mask]

print("O3 열 중 하나라도 outlier로 판정된 행 개수:", len(df_outliers_o3))
print(df_outliers_o3.head())


O3 열 중 하나라도 outlier로 판정된 행 개수: 4
     A_O3_ALL  B_O3_ALL  C_O3_ALL  D_O3_ALL  E_O3_ALL  F_O3_ALL  G_O3_ALL  \
0    1.000000  1.000000  1.000000  1.000000  1.000000  1.000000  1.000000   
8    0.694954  1.336226  1.048912  1.086317  1.191019  1.081665  1.333931   
94   0.547840  1.272830  0.959516  1.022844  1.053369  0.915072  1.267129   
115  0.692951  1.339874  1.022420  1.049107  1.170187  1.096962  1.345424   

     H_O3_ALL  I_O3_ALL  J_O3_ALL  K_O3_ALL  L_O3_ALL  M_O3_ALL  N_O3_ALL  \
0    1.000000  1.000000  1.000000  1.000000  1.000000  1.000000  1.000000   
8    1.226848  1.237194  1.288706  1.242697  1.332488  1.409358  1.470142   
94   1.098627  1.147726  1.225385  1.214498  1.275359  1.348776  1.474238   
115  1.238689  1.289676  1.303094  1.294918  1.374125  1.411715  1.488215   

     O_O3_ALL  P_O3_ALL  Q_O3_ALL  
0    1.000000  1.000000  1.000000  
8    1.155100  1.121079  1.083791  
94   1.025837  1.103844  0.946037  
115  1.158474  1.134437  1.098575  


In [20]:
# Display the combined mask (True = row flagged as an outlier in at least one O3 column).
overall_outliers_mask

0       True
1      False
2      False
3      False
4      False
       ...  
114    False
115     True
116    False
117    False
118    False
Name: A_O3_ALL, Length: 119, dtype: bool

In [19]:
# Keep only the rows that were not flagged as outliers (~ inverts the mask).
inlier_rows = ~overall_outliers_mask
df_inliers = df[inlier_rows]
print("이상치 제외 후 df_inliers 크기:", df_inliers.shape)

이상치 제외 후 df_inliers 크기: (115, 85)
