<a href="https://colab.research.google.com/github/namwootree/Portfolio/blob/main/Competition/Dacon/Lettuce/Modularize_Preprocessing_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setting

## Library

In [1]:
import pandas as pd
import numpy as np

import os
import glob
from tqdm.auto import tqdm

In [2]:
import warnings
# Suppress every warning for the rest of the session (no category or module filter is given).
warnings.filterwarnings(action='ignore') 

# Load Data

## Google Drive Mount

In [3]:
# Mount Google Drive at /content/drive so the archive stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Unzip File

In [4]:
# Extract the competition archive from Drive into /content (-qq: quiet, -d: destination directory).
!unzip -qq '/content/drive/MyDrive/머신러닝 엔지니어링/데이콘/상추의 생육 환경 생성/data/lettuce.zip' -d '/content'

## Load Train Data Set

In [4]:
# Folders holding the per-case CSVs. The trailing slash matters: the next cell
# builds its glob patterns by plain string concatenation.
path_train_input = '/content/train_input/'
path_train_target = '/content/train_target/'

In [6]:
# Sorted so the i-th input and i-th target line up when they are later paired with zip().
# Assumes both folders use the same file names for the same case -- TODO confirm.
all_input_list = sorted(glob.glob(path_train_input + '*.csv'))
all_target_list = sorted(glob.glob(path_train_target + '*.csv'))

In [7]:
# Show the first three paths of each list to eyeball that inputs and targets pair up.
all_input_list[:3], all_target_list[:3]

(['/content/train_input/CASE_01.csv',
  '/content/train_input/CASE_02.csv',
  '/content/train_input/CASE_03.csv'],
 ['/content/train_target/CASE_01.csv',
  '/content/train_target/CASE_02.csv',
  '/content/train_target/CASE_03.csv'])

In [8]:
# Load the first case's input CSV as a sample to inspect.
df_train = pd.read_csv(all_input_list[0])

In [9]:
# Load the first case's target CSV as a sample.
# NOTE: `df_taget` is a typo for "df_target"; the name is kept because the display cell below uses it.
df_taget = pd.read_csv(all_target_list[0])

In [10]:
# Render the first three rows of the sample input and target frames.
display(df_train.head(3), df_taget.head(3))

Unnamed: 0,DAT,obs_time,내부온도관측치,내부습도관측치,co2관측치,ec관측치,시간당분무량,일간누적분무량,시간당백색광량,일간누적백색광량,시간당적색광량,일간누적적색광량,시간당청색광량,일간누적청색광량,시간당총광량,일간누적총광량
0,0,00:00,25.3,81.835,536.016667,1.407439,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0
1,0,01:00,25.680357,81.264286,528.696429,1.409003,126.0,126.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0
2,0,02:00,25.273333,81.471666,532.833333,1.406913,0.0,126.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0


Unnamed: 0,DAT,predicted_weight_g
0,1,0.167719
1,2,0.181787
2,3,0.265921


# Preprocessing

## Merge Features / Target

In [11]:
!mkdir DATA

In [12]:
# Merge each case's input rows with its target rows and write one CSV per case.

# Defined before the loop so later cells can rely on it even when no file is processed.
path_save = '/content/DATA/'
os.makedirs(path_save, exist_ok=True)

# Materialise the pairs so tqdm knows the total and can show real progress.
case_paths = list(zip(all_input_list, all_target_list))

for path_X, path_y in tqdm(case_paths):

  df_X = pd.read_csv(path_X)
  df_y = pd.read_csv(path_y)

  # Shift the target's DAT down by one so it joins onto the input's DAT
  # (the sample previews show inputs starting at DAT 0 and targets at DAT 1).
  df_y['DAT'] = df_y['DAT'] - 1

  # Left join keeps every input row; the target value repeats on each row sharing a DAT.
  df_merge = pd.merge(df_X, df_y, how='left', on='DAT')

  CASE_FILE_NAME = os.path.basename(path_y)
  CASE_NUM = CASE_FILE_NAME.split('.')[0]

  df_merge['CASE_NUM'] = CASE_NUM

  # Move the just-added CASE_NUM column (currently last) to the front.
  col1 = df_merge.columns[-1:].to_list()
  col2 = df_merge.columns[:-1].to_list()
  df_merge = df_merge[col1 + col2]

  # The index is written too; the concat step drops it again as 'Unnamed: 0'.
  df_merge.to_csv(path_save+CASE_FILE_NAME)

print('DONE')

0it [00:00, ?it/s]

DONE


## Concat Case Data

In [21]:
# Names of the per-case files written by the merge step, in sorted order.
list_Data = sorted(os.listdir('/content/DATA'))

In [27]:
# Empty frame carrying the merged-case column layout; relies on `df_merge`
# still being defined from the merge loop above.
df_total = pd.DataFrame(columns=df_merge.columns)

In [29]:
# Read each per-case CSV once and concatenate in a single call, instead of
# growing df_total inside the loop (which re-copies every row on each pass).
case_frames = []
for case in list_Data:

  df = pd.read_csv(path_save+case)
  # 'Unnamed: 0' is the index column that to_csv wrote in the merge step.
  df = df.drop(['Unnamed: 0'], axis=1)
  case_frames.append(df)

# Start from df_total's empty column layout (iloc[0:0]) so that re-running this
# cell rebuilds the table rather than appending the same cases a second time.
df_total = pd.concat([df_total.iloc[0:0], *case_frames], axis=0)

In [33]:
# Persist the combined table; the (per-case, repeating) index is written as the first column.
df_total.to_csv('/content/total_case_train.csv')

In [58]:
# (rows, columns) of the combined table, as a quick sanity check.
df_total.shape

(18816, 18)

# Modularize

In [5]:
%%writefile lettuce_preprocessing.py 
import pandas as pd
import numpy as np

import os
import glob
from tqdm.auto import tqdm

import warnings
warnings.filterwarnings(action='ignore') 

class lettuce_train_set_preprocessing():
  """Build one training table from per-case input/target CSV files.

  Intended use is two steps: ``merge_X_y`` writes one merged CSV per case into
  a working directory, then ``concat_case_data`` stacks those per-case files
  into a single CSV.
  """

  # Column layout of a merged case file; used as the header of the combined
  # CSV when there are no case files to concatenate.
  TOTAL_COLUMNS = ['CASE_NUM', 'DAT', 'obs_time', '내부온도관측치', '내부습도관측치', 'co2관측치', 'ec관측치',
       '시간당분무량', '일간누적분무량', '시간당백색광량', '일간누적백색광량', '시간당적색광량', '일간누적적색광량',
       '시간당청색광량', '일간누적청색광량', '시간당총광량', '일간누적총광량', 'predicted_weight_g']

  def merge_X_y(self,
                path_train_input,
                path_train_target,
                path_save
                ):
    """Merge each case's input CSV with its target CSV and save the result.

    Files are paired by file name (not by position), so a file missing on one
    side cannot shift every later case onto the wrong partner. Files present
    on only one side are skipped and reported.

    Args:
      path_train_input: Directory containing the per-case input ``*.csv`` files.
      path_train_target: Directory containing the per-case target ``*.csv`` files.
      path_save: Directory the merged per-case CSVs are written to; created if
        it does not exist.

    Returns:
      None. One CSV per case, named like the source file, is written to
      ``path_save``.
    """
    input_by_name = {os.path.basename(path): path
                     for path in glob.glob(os.path.join(path_train_input, '*.csv'))}
    target_by_name = {os.path.basename(path): path
                      for path in glob.glob(os.path.join(path_train_target, '*.csv'))}

    matched_names = sorted(input_by_name.keys() & target_by_name.keys())
    unmatched_names = sorted(input_by_name.keys() ^ target_by_name.keys())

    if unmatched_names:
      print('Skipping files without a matching input/target:', unmatched_names)
    if not matched_names:
      print('No matching input/target CSV files found.')

    os.makedirs(path_save, exist_ok=True)

    for CASE_FILE_NAME in tqdm(matched_names):

      df_X = pd.read_csv(input_by_name[CASE_FILE_NAME])
      df_y = pd.read_csv(target_by_name[CASE_FILE_NAME])

      # Shift the target's DAT down by one before joining it onto the input's DAT.
      df_y['DAT'] = df_y['DAT'] - 1

      # Left join keeps every input row; the target repeats on rows sharing a DAT.
      df_merge = pd.merge(df_X, df_y, how='left', on='DAT')

      CASE_NUM = CASE_FILE_NAME.split('.')[0]
      df_merge['CASE_NUM'] = CASE_NUM

      # Put CASE_NUM first, keeping the remaining columns in their order.
      other_columns = [col for col in df_merge.columns if col != 'CASE_NUM']
      df_merge = df_merge[['CASE_NUM'] + other_columns]

      # The index is written as well (pandas default); concat_case_data drops
      # it again as 'Unnamed: 0'.
      df_merge.to_csv(os.path.join(path_save, CASE_FILE_NAME))

    print('DONE')

  
  def concat_case_data(self,
                       path_save,
                       path_total='/content/total_case_train.csv'):
    """Stack every per-case CSV in ``path_save`` into one CSV file.

    Only regular files ending in ``.csv`` are read, so stray directory entries
    (for example notebook checkpoint folders) are ignored.

    Args:
      path_save: Directory holding the per-case CSVs written by ``merge_X_y``.
      path_total: Destination of the combined CSV. Defaults to the location
        that was previously hard-coded.

    Returns:
      None. The combined table is written to ``path_total``.
    """
    case_files = sorted(
        name for name in os.listdir(path_save)
        if name.lower().endswith('.csv')
        and os.path.isfile(os.path.join(path_save, name))
    )

    case_frames = []
    for case in case_files:

      df = pd.read_csv(os.path.join(path_save, case))
      # 'Unnamed: 0' is the index column written by to_csv; tolerate files
      # that were saved without it.
      df = df.drop(columns=['Unnamed: 0'], errors='ignore')
      case_frames.append(df)

    if case_frames:
      # One concat call instead of growing the frame inside the loop.
      # Each case keeps its own row index, so index values repeat across cases.
      df_total = pd.concat(case_frames, axis=0)
    else:
      # pd.concat rejects an empty list; still emit a header-only file.
      df_total = pd.DataFrame(columns=self.TOTAL_COLUMNS)

    df_total.to_csv(path_total)

    print('DONE')


Overwriting lettuce_preprocessing.py


In [6]:
import lettuce_preprocessing as lp

In [7]:
# Build the instance from the module's real name. `lp.lettuce_train_set_preprocessing()`
# would rebind `lp` from the module to the instance, so running that line a second
# time fails; this form can be re-run safely.
import lettuce_preprocessing
lp = lettuce_preprocessing.lettuce_train_set_preprocessing()

In [8]:
# Write one merged CSV per case into /content/DATA.
lp.merge_X_y(path_train_input, path_train_target, path_save='/content/DATA')

0it [00:00, ?it/s]

DONE


In [9]:
# Stack the per-case CSVs in /content/DATA into the combined training CSV.
lp.concat_case_data(path_save='/content/DATA')

DONE
