# AWS상 MySQL에 업데이트 | pyarrow | parquet

---
### 1. 패키지 import 및 기본 사항 확인

In [1]:
# 기본 패키지
import pandas as pd
import numpy as np
import pyarrow

# 디렉토리 관련 패키지
import os
import glob
import natsort

# MySQL 관련 패키지
import MySQLdb
import mysql.connector

# SQL Alchemy 관련 패키지 1
import sqlalchemy
from sqlalchemy import create_engine
from sqlalchemy.dialects.mysql import *
from sqlalchemy.types import *

# SQL Alchemy 관련 패키지 2
from sqlalchemy.orm import sessionmaker
from sqlalchemy import Table, MetaData
from sqlalchemy import insert, update

---
### 2. 환경변수 CSV 처리

In [2]:
# 1. 폴더에 있는 파일들 중 csv 형식의 파일만 보여주기
# natsort.natsorted() : 파일을 순서대로 정렬하기 위해 사용
# 파일들을 불러오기 위한 기본 경로
var_path = r'C:\Users\admin\Desktop\FinalProject\chromate\chromate_data\variable\\'
varfilelist = natsort.natsorted(os.listdir(var_path))
varfilelist = [file for file in varfilelist if file.endswith('.csv')]
varfilelist

['kemp-abh-sensor-2021.09.06.csv',
 'kemp-abh-sensor-2021.09.07.csv',
 'kemp-abh-sensor-2021.09.08.csv',
 'kemp-abh-sensor-2021.09.09.csv',
 'kemp-abh-sensor-2021.09.10.csv',
 'kemp-abh-sensor-2021.09.13.csv',
 'kemp-abh-sensor-2021.09.14.csv',
 'kemp-abh-sensor-2021.09.15.csv',
 'kemp-abh-sensor-2021.09.16.csv',
 'kemp-abh-sensor-2021.09.17.csv',
 'kemp-abh-sensor-2021.09.23.csv',
 'kemp-abh-sensor-2021.09.24.csv',
 'kemp-abh-sensor-2021.09.27.csv',
 'kemp-abh-sensor-2021.09.28.csv',
 'kemp-abh-sensor-2021.09.29.csv',
 'kemp-abh-sensor-2021.09.30.csv',
 'kemp-abh-sensor-2021.10.01.csv',
 'kemp-abh-sensor-2021.10.05.csv',
 'kemp-abh-sensor-2021.10.06.csv',
 'kemp-abh-sensor-2021.10.07.csv',
 'kemp-abh-sensor-2021.10.08.csv',
 'kemp-abh-sensor-2021.10.12.csv',
 'kemp-abh-sensor-2021.10.13.csv',
 'kemp-abh-sensor-2021.10.14.csv',
 'kemp-abh-sensor-2021.10.15.csv',
 'kemp-abh-sensor-2021.10.18.csv',
 'kemp-abh-sensor-2021.10.19.csv',
 'kemp-abh-sensor-2021.10.20.csv',
 'kemp-abh-sensor-20

In [3]:
# 2. 조합해서 반복문으로 넣어줄 수 있게 코드 짜기
# 2-1. df들을 집어넣을 수 있게 빈 데이터프레임 틀 만들어주기
var_df = pd.DataFrame(columns=['Index', 'Date', 'Time', 'Lot', 'pH', 'Temp', 'Voltage'])
var_df

Unnamed: 0,Index,Date,Time,Lot,pH,Temp,Voltage


In [4]:
for file in varfilelist:
    # 2-2. csv의 데이터들을 dataframe으로 불러오기
    df = pd.read_csv(var_path + file, engine='pyarrow')

    # 2-3. Date 컬럼 생성
    date = file.split('-')[-1].replace('.','-')
    date = date.removesuffix('-csv')
    df['Date'] = date

    # 2-4. Time을 HH:MM:SS 형태로 조정
    adj_time = list()
    for time in df['Time']:
        tmp = time.split(':')
        # 2-4-1. 오전·오후 문자를 제거하고 오후일 경우 +12를 해준다
        if tmp[0].split(' ')[0] == '오후':
            tmp[0] = str(int(tmp[0].split(' ')[-1]) + 12)
        else:
            tmp[0] = tmp[0].split(' ')[-1]
        # 2-4-2. HH:MM:SS 형태로 합쳐주고 소수점 뒷부분은 제거한다
        tmp = ':'.join(tmp).split('.')[0]
        adj_time.append(tmp)
    # 2-4-3. Time 컬럼을 조정된 형태로 바꾼다
    df['Time'] = adj_time

    # 2-5. 컬럼 순서 조정
    df = df[['Index', 'Date', 'Time', 'Lot', 'pH', 'Temp', 'Voltage']]

    # 2-6. 각각의 파일들에서 나온 dataframe 전부 합치기
    var_df = pd.concat([var_df, df], axis=0)

# 2-7. 인덱스 정리 및 사용하지 않는 Index 컬럼 제거
var_df.reset_index(drop=True, inplace=True)
var_df.drop(columns=['Index'], inplace=True)
var_df

Unnamed: 0,Date,Time,Lot,pH,Temp,Voltage
0,2021-09-06,16:29:54,1,2.15,43.15,19.74
1,2021-09-06,16:29:59,1,2.08,40.13,18.01
2,2021-09-06,16:30:04,1,2.18,43.46,18.73
3,2021-09-06,16:30:09,1,1.99,41.72,16.75
4,2021-09-06,16:30:14,1,1.85,43.65,18.02
...,...,...,...,...,...,...
50089,2021-10-27,18:36:03,22,2.05,42.84,15.38
50090,2021-10-27,18:36:08,22,1.91,42.64,19.08
50091,2021-10-27,18:36:13,22,2.11,44.09,18.14
50092,2021-10-27,18:36:18,22,1.92,43.95,17.96


#### 2-1. MySQL 올리기 전 확인사항

In [5]:
# 1. 데이터들이 어떤 타입으로 저장되어 있는지 확인 필요
var_df.dtypes

Date        object
Time        object
Lot         object
pH         float64
Temp       float64
Voltage    float64
dtype: object

In [6]:
# Lot, pH, Temp, Voltage를 수치타입으로 변경하고 확인
var_df['Lot'] = pd.to_numeric(var_df['Lot'])
var_df['pH'] = pd.to_numeric(var_df['pH'])
var_df['Temp'] = pd.to_numeric(var_df['Temp'])
var_df['Voltage'] = pd.to_numeric(var_df['Voltage'])
var_df.dtypes

Date        object
Time        object
Lot          int64
pH         float64
Temp       float64
Voltage    float64
dtype: object

In [10]:
# 2. pH, Temp, Voltage 컬럼의 소수점자리가 몇 자리까지인지 확인
# pH 자리수 확인 -> max가 2이므로 모든 데이터가 소수점 두 자리까지인 것을 확인하였다
var_df['pH'] = var_df['pH'].astype(str)
ph_lenlist = []
for rows in var_df['pH']:
    a = len(rows.split('.')[-1])   # 소숫점 자리만 확인하기 위해서 '.'로 구분하여 뒷부분만 사용
    ph_lenlist.append(a)
print(max(ph_lenlist))

2


In [11]:
# Temp 자리수 확인 -> max가 2이므로 모든 데이터가 소수점 두 자리까지인 것을 확인하였다
var_df['Temp'] = var_df['Temp'].astype(str)
temp_lenlist = []
for rows in var_df['Temp']:
    a = len(rows.split('.')[-1])   # 소숫점 자리만 확인하기 위해서 '.'로 구분하여 뒷부분만 사용
    temp_lenlist.append(a)
print(max(temp_lenlist))

2


In [12]:
# Voltage 자리수 확인 -> max가 2이므로 모든 데이터가 소수점 두 자리까지인 것을 확인하였다
var_df['Voltage'] = var_df['Voltage'].astype(str)
vol_lenlist = []
for rows in var_df['Voltage']:
    a = len(rows.split('.')[-1])   # 소숫점 자리만 확인하기 위해서 '.'로 구분하여 뒷부분만 사용
    vol_lenlist.append(a)
print(max(vol_lenlist))

2


#### 2-2. MySQL 'variable' 테이블로 올리기

In [7]:
# 1. sixdogma 서버에 연결하기
# 형식: 'mysql://{0}:{1}@{2}:{3}/{4}'.format(user, pass, host, port, db)
url    = 'mysql://sixdogma:Poiu0987*@13.113.12.130:3306/Anay'
engine = sqlalchemy.create_engine(url, encoding='utf-8', echo=True)
conn   = engine.connect()

2022-11-09 11:13:53,820 INFO sqlalchemy.engine.Engine SELECT DATABASE()
2022-11-09 11:13:53,821 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-11-09 11:13:53,927 INFO sqlalchemy.engine.Engine SELECT @@sql_mode
2022-11-09 11:13:53,928 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-11-09 11:13:53,999 INFO sqlalchemy.engine.Engine SELECT @@lower_case_table_names
2022-11-09 11:13:54,000 INFO sqlalchemy.engine.Engine [raw sql] ()


In [15]:
# 2. 새로운 데이터베이스(SCHEMA) 만들기 : 데이터베이스 이름을 chromate_raw로 만들었습니다
# 이미 만들었으므로 주석처리
# engine.execute("CREATE DATABASE Anay")

2022-10-25 10:30:33,724 INFO sqlalchemy.engine.Engine CREATE DATABASE Anay
2022-10-25 10:30:33,724 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-10-25 10:30:33,769 INFO sqlalchemy.engine.Engine COMMIT


<sqlalchemy.engine.cursor.LegacyCursorResult at 0x2229ebc7640>

In [22]:
# 3. 만든 데이터베이스를 실행 : 1에서 DB 들어갈 수 있게 설정해뒀습니다
# engine.execute("USE Anay")

2022-10-25 16:17:06,649 INFO sqlalchemy.engine.Engine USE Anay
2022-10-25 16:17:06,650 INFO sqlalchemy.engine.Engine [raw sql] ()


<sqlalchemy.engine.cursor.LegacyCursorResult at 0x1ac13cddb40>

In [8]:
# 4. 'variable' 테이블 만들기
engine.execute("CREATE TABLE variable ( var_id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, Date DATE NOT NULL, Time TIME NOT NULL, Lot INT NOT NULL, pH FLOAT(3,2) NOT NULL, Temp FLOAT(4,2) NOT NULL, Voltage FLOAT(4,2) NOT NULL )")

2022-11-09 11:14:02,381 INFO sqlalchemy.engine.Engine CREATE TABLE variable ( var_id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, Date DATE NOT NULL, Time TIME NOT NULL, Lot INT NOT NULL, pH FLOAT(3,2) NOT NULL, Temp FLOAT(4,2) NOT NULL, Voltage FLOAT(4,2) NOT NULL )
2022-11-09 11:14:02,381 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-11-09 11:14:02,463 INFO sqlalchemy.engine.Engine COMMIT


<sqlalchemy.engine.cursor.LegacyCursorResult at 0x1d7d858a590>

In [9]:
# 5. 데이터프레임에 있는 데이터 insert하기
var_type = {'Date':sqlalchemy.types.DATE(),
            'Time':sqlalchemy.types.TIME(),
            'Lot':sqlalchemy.types.INT(),
            'pH':sqlalchemy.types.FLOAT(),
            'Temp':sqlalchemy.types.FLOAT(),
            'Voltage':sqlalchemy.types.FLOAT()
}
var_df.to_sql(name='variable', con=engine, if_exists='append', index=False, dtype=var_type)

2022-11-09 11:14:22,934 INFO sqlalchemy.engine.Engine SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = %s AND table_name = %s
2022-11-09 11:14:22,935 INFO sqlalchemy.engine.Engine [generated in 0.00109s] ('Anay', 'variable')
2022-11-09 11:14:23,033 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-11-09 11:14:23,195 INFO sqlalchemy.engine.Engine INSERT INTO variable (`Date`, `Time`, `Lot`, `pH`, `Temp`, `Voltage`) VALUES (%s, %s, %s, %s, %s, %s)
2022-11-09 11:14:23,196 INFO sqlalchemy.engine.Engine [generated in 0.12757s] (('2021-09-06', '16:29:54', 1, 2.15, 43.15, 19.74), ('2021-09-06', '16:29:59', 1, 2.08, 40.13, 18.01), ('2021-09-06', '16:30:04', 1, 2.18, 43.46, 18.73), ('2021-09-06', '16:30:09', 1, 1.99, 41.72, 16.75), ('2021-09-06', '16:30:14', 1, 1.85, 43.65, 18.02), ('2021-09-06', '16:30:19', 1, 1.94, 42.82, 19.27), ('2021-09-06', '16:30:24', 1, 1.94, 43.17, 17.4), ('2021-09-06', '16:30:29', 1, 2.06, 44.16, 18.69)  ... displaying 10 of 50094 total bound para

50094

In [30]:
# 6. SQL 상에 ID 컬럼을 추가
# MySQL 상에서 data type에 pk를 체크해주었다. → 이 과정 필요없어서 주석처리
# engine.execute("ALTER TABLE variable ADD var_id INT NOT NULL AUTO_INCREMENT, ADD INDEX (var_id);")

---
### 3. Error Lot list CSV 처리

In [10]:
# 1. 폴더에 있는 파일들 중 csv 형식의 파일만 보여주기
# natsort.natsorted() : 파일을 순서대로 정렬하기 위해 사용
# 파일들을 불러오기 위한 기본 경로
err_path = r'C:\Users\admin\Desktop\FinalProject\chromate\chromate_data\error\\'
errfilelist = natsort.natsorted(os.listdir(err_path))
errfilelist = [file for file in errfilelist if file.endswith('.csv')]
errfilelist

['Error Lot list.csv']

In [11]:
# 2. 조합해서 반복문으로 넣어줄 수 있게 코드 짜기
# 2-1. df들을 집어넣을 수 있게 빈 데이터프레임 틀 만들어주기
err_df = pd.DataFrame(columns=['Date', 'FailureLot1', 'FailureLot2'])
err_df

Unnamed: 0,Date,FailureLot1,FailureLot2


In [12]:
for file in errfilelist:
    # 2-2. csv의 데이터들을 dataframe으로 불러오기
    df = pd.read_csv(err_path + file, engine='pyarrow')
    # 2-3. 컬럼 이름 조정 필요
    df.rename(columns = {'0':'Date', '1':'FailureLot1', '2':'FailureLot2'}, inplace=True)

    # 2-4. FailureLot1, FailureLot2 INT로 바꿔주기
    # 결측치가 있으므로 0으로 채워주고 바꾼다 → NULL을 그대로 놔두고 싶으므로 주석처리
    # df['FailureLot1'] = df['FailureLot1'].fillna(0).astype(int)
    # df['FailureLot2'] = df['FailureLot2'].fillna(0).astype(int)

    # 2-5. 다시 결측치로 바꿔주기
    # np.NaN을 써버리면 그 열 전체가 float로 바뀌므로 일단 주석처리
    # df['FailureLot1'] = df['FailureLot1'].replace(0, np.NaN)
    # df['FailureLot2'] = df['FailureLot2'].replace(0, np.NaN)

    # 2-6. 각각의 파일들에서 나온 dataframe 전부 합치기
    err_df = pd.concat([err_df, df], axis=0)

err_df

Unnamed: 0,Date,FailureLot1,FailureLot2
0,2021-09-06,,
1,2021-09-07,,
2,2021-09-08,20.0,
3,2021-09-09,16.0,5.0
4,2021-09-10,,
5,2021-09-13,,
6,2021-09-14,,
7,2021-09-15,,
8,2021-09-16,4.0,
9,2021-09-17,,


#### 3-1. MySQL 올리기 전 확인사항

In [16]:
# 1. 데이터가 어떤 타입으로 저장되어 있는지 확인 필요
err_df.dtypes

Date            object
FailureLot1    float64
FailureLot2    float64
dtype: object

In [13]:
# FailureLot1, FailureLot2를 수치타입으로 변경
err_df['FailureLot1'] = pd.to_numeric(err_df['FailureLot1'])
err_df['FailureLot2'] = pd.to_numeric(err_df['FailureLot2'])
err_df.dtypes

Date            object
FailureLot1    float64
FailureLot2    float64
dtype: object

#### 3-2. MySQL 'error' 테이블로 올리기

In [14]:
# 1. 서버에 연결하기
# 형식: 'mysql://{0}:{1}@{2}:{3}/{4}'.format(user, pass, host, port, db)
url    = 'mysql://sixdogma:Poiu0987*@13.113.12.130:3306/Anay'
engine = sqlalchemy.create_engine(url, encoding='utf-8', echo=True)
conn   = engine.connect()

2022-11-09 11:14:49,432 INFO sqlalchemy.engine.Engine SELECT DATABASE()
2022-11-09 11:14:49,433 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-11-09 11:14:49,526 INFO sqlalchemy.engine.Engine SELECT @@sql_mode
2022-11-09 11:14:49,527 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-11-09 11:14:49,575 INFO sqlalchemy.engine.Engine SELECT @@lower_case_table_names
2022-11-09 11:14:49,577 INFO sqlalchemy.engine.Engine [raw sql] ()


In [39]:
# 2. 만든 데이터베이스를 실행
# 위에 써놨으므로 주석처리
# engine.execute("USE Anay")

In [15]:
# 3. MySQL에 'error' 테이블 만들기 → 테이블 만들고 집어넣으려니까 자꾸 에러가 나서, 일단 주석처리 하고 아래 코드로 실행
engine.execute("CREATE TABLE error ( err_id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, Date DATE NOT NULL, FailureLot1 INT, FailureLot2 INT )")

2022-11-09 11:14:52,918 INFO sqlalchemy.engine.Engine CREATE TABLE error ( err_id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, Date DATE NOT NULL, FailureLot1 INT, FailureLot2 INT )
2022-11-09 11:14:52,920 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-11-09 11:14:53,077 INFO sqlalchemy.engine.Engine COMMIT


<sqlalchemy.engine.cursor.LegacyCursorResult at 0x1d7d85d7400>

In [16]:
# 4. MySQL 'error' 테이블에 데이터들을 insert 하기
err_type = {'Date'    : sqlalchemy.types.DATE(),
            'FailureLot1' : sqlalchemy.types.INT(),
            'FailureLot2' : sqlalchemy.types.INT()
}
err_df.to_sql(name='error', con=engine, if_exists='append', index=False, dtype=err_type)

2022-11-09 11:15:18,988 INFO sqlalchemy.engine.Engine SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = %s AND table_name = %s
2022-11-09 11:15:18,989 INFO sqlalchemy.engine.Engine [generated in 0.00079s] ('Anay', 'error')
2022-11-09 11:15:19,090 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-11-09 11:15:19,092 INFO sqlalchemy.engine.Engine INSERT INTO error (`Date`, `FailureLot1`, `FailureLot2`) VALUES (%s, %s, %s)
2022-11-09 11:15:19,093 INFO sqlalchemy.engine.Engine [generated in 0.00110s] ((datetime.date(2021, 9, 6), None, None), (datetime.date(2021, 9, 7), None, None), (datetime.date(2021, 9, 8), 20.0, None), (datetime.date(2021, 9, 9), 16.0, 5.0), (datetime.date(2021, 9, 10), None, None), (datetime.date(2021, 9, 13), None, None), (datetime.date(2021, 9, 14), None, None), (datetime.date(2021, 9, 15), None, None)  ... displaying 10 of 33 total bound parameter sets ...  (datetime.date(2021, 10, 26), None, None), (datetime.date(2021, 10, 27), None, None))
2022-

33

In [61]:
# 6. SQL 상에 ID 컬럼을 추가
# MySQL 상에서 data type에 pk를 체크해주었다. → 이 과정 필요없어서 주석처리
# engine.execute("ALTER TABLE error ADD err_id INT NOT NULL AUTO_INCREMENT, ADD INDEX (err_id);")

---
### 4. 모델러가 원하는 input 만들기

#### 4-1. err_df 특정 날짜의 에러 Lot번호 → var_df의 특정 날짜의 에러 Lot번호들에 표시

In [17]:
# 확인용 셀
working_df = var_df.copy()
working_df

Unnamed: 0,Date,Time,Lot,pH,Temp,Voltage
0,2021-09-06,16:29:54,1,2.15,43.15,19.74
1,2021-09-06,16:29:59,1,2.08,40.13,18.01
2,2021-09-06,16:30:04,1,2.18,43.46,18.73
3,2021-09-06,16:30:09,1,1.99,41.72,16.75
4,2021-09-06,16:30:14,1,1.85,43.65,18.02
...,...,...,...,...,...,...
50089,2021-10-27,18:36:03,22,2.05,42.84,15.38
50090,2021-10-27,18:36:08,22,1.91,42.64,19.08
50091,2021-10-27,18:36:13,22,2.11,44.09,18.14
50092,2021-10-27,18:36:18,22,1.92,43.95,17.96


In [18]:
# 확인용 셀
working_df.dtypes

Date        object
Time        object
Lot          int64
pH         float64
Temp       float64
Voltage    float64
dtype: object

In [19]:
# 확인용 셀
err_df

Unnamed: 0,Date,FailureLot1,FailureLot2
0,2021-09-06,,
1,2021-09-07,,
2,2021-09-08,20.0,
3,2021-09-09,16.0,5.0
4,2021-09-10,,
5,2021-09-13,,
6,2021-09-14,,
7,2021-09-15,,
8,2021-09-16,4.0,
9,2021-09-17,,


In [21]:
# 확인용 셀
err_df.dtypes

Date            object
FailureLot1    float64
FailureLot2    float64
dtype: object

In [22]:
# 확인용 셀 → 컬럼이 갑자기 이상하게 변함
# datetime.date(2021, 9, 8) 이런식으로 나옴
err_df['Date'][2]

datetime.date(2021, 9, 8)

In [20]:
# 컬럼 고치기
import time
import datetime
from time import strftime

In [23]:
# 컬럼 고치기
err_df['Date'] = err_df['Date'].apply(lambda x: x.strftime('%Y-%m-%d'))
err_df

Unnamed: 0,Date,FailureLot1,FailureLot2
0,2021-09-06,,
1,2021-09-07,,
2,2021-09-08,20.0,
3,2021-09-09,16.0,5.0
4,2021-09-10,,
5,2021-09-13,,
6,2021-09-14,,
7,2021-09-15,,
8,2021-09-16,4.0,
9,2021-09-17,,


In [24]:
# 확인용 셀
working_df['Date'][10]

'2021-09-06'

In [25]:
# 확인용 셀
err_df['Date'][2]

'2021-09-08'

In [26]:
# 조건에 해당하는 열 추가하는 작업
sep = list()    # 정상 로트: 0 | 불량 로트: 1

for i in range(len(err_df)):
    for j in range(len(working_df)):
        if err_df['Date'][i] == working_df['Date'][j]:
            if err_df['FailureLot1'][i] == working_df['Lot'][j]:
                sep.append(1)
            elif err_df['FailureLot2'][i] == working_df['Lot'][j]:
                sep.append(1)
            else:
                sep.append(0)

working_df['sep'] = sep

In [27]:
working_df

Unnamed: 0,Date,Time,Lot,pH,Temp,Voltage,sep
0,2021-09-06,16:29:54,1,2.15,43.15,19.74,0
1,2021-09-06,16:29:59,1,2.08,40.13,18.01,0
2,2021-09-06,16:30:04,1,2.18,43.46,18.73,0
3,2021-09-06,16:30:09,1,1.99,41.72,16.75,0
4,2021-09-06,16:30:14,1,1.85,43.65,18.02,0
...,...,...,...,...,...,...,...
50089,2021-10-27,18:36:03,22,2.05,42.84,15.38,0
50090,2021-10-27,18:36:08,22,1.91,42.64,19.08,0
50091,2021-10-27,18:36:13,22,2.11,44.09,18.14,0
50092,2021-10-27,18:36:18,22,1.92,43.95,17.96,0


In [28]:
# 확인용 셀
working_df[(working_df['Date'] == '2021-09-08') & (working_df['Lot'] == 20)]

Unnamed: 0,Date,Time,Lot,pH,Temp,Voltage,sep
4347,2021-09-08,18:23:01,20,1.97,41.20,15.53,1
4348,2021-09-08,18:23:06,20,2.08,42.13,16.86,1
4349,2021-09-08,18:23:11,20,1.89,44.62,18.19,1
4350,2021-09-08,18:23:16,20,1.95,42.55,15.59,1
4351,2021-09-08,18:23:21,20,2.43,44.54,15.44,1
...,...,...,...,...,...,...,...
4411,2021-09-08,18:28:22,20,1.84,43.90,18.44,1
4412,2021-09-08,18:28:27,20,2.02,42.84,17.54,1
4413,2021-09-08,18:28:32,20,2.04,40.26,16.71,1
4414,2021-09-08,18:28:37,20,1.92,35.46,15.49,1


In [30]:
# 확인용 셀
working_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50094 entries, 0 to 50093
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Date     50094 non-null  object
 1   Time     50094 non-null  object
 2   Lot      50094 non-null  int64 
 3   pH       50094 non-null  object
 4   Temp     50094 non-null  object
 5   Voltage  50094 non-null  object
 6   sep      50094 non-null  int64 
dtypes: int64(2), object(5)
memory usage: 2.7+ MB


#### 4-2. 작업 내용 저장

In [29]:
# 작업한 데이터프레임을 csv 형식으로 저장하기
working_df.to_csv(r'C:\Users\admin\Desktop\FinalProject\chromate\working_df.csv')

In [30]:
# 작업한 데이터프레임을 parquet 형식으로 저장하기
working_df.to_parquet(r'C:\Users\admin\Desktop\FinalProject\chromate\working_df.parquet')

In [32]:
# 작업한 데이터프레임을 SQL에 저장하기
# 1. 서버에 연결하기
# 형식: 'mysql://{0}:{1}@{2}:{3}/{4}'.format(user, pass, host, port, db)
url    = 'mysql://sixdogma:Poiu0987*@13.113.12.130:3306/Model'
engine = sqlalchemy.create_engine(url, encoding='utf-8', echo=True)
conn   = engine.connect()

2022-11-09 11:17:00,943 INFO sqlalchemy.engine.Engine SELECT DATABASE()
2022-11-09 11:17:00,943 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-11-09 11:17:01,029 INFO sqlalchemy.engine.Engine SELECT @@sql_mode
2022-11-09 11:17:01,029 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-11-09 11:17:01,080 INFO sqlalchemy.engine.Engine SELECT @@lower_case_table_names
2022-11-09 11:17:01,081 INFO sqlalchemy.engine.Engine [raw sql] ()


In [109]:
# 2. 새로운 데이터베이스(SCHEMA) 만들기 : 데이터베이스 이름을 Model로 만들었습니다
# 이미 만들었으므로 주석처리
# engine.execute("CREATE DATABASE Model")

2022-10-25 17:16:45,573 INFO sqlalchemy.engine.Engine CREATE DATABASE Model
2022-10-25 17:16:45,574 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-10-25 17:16:45,628 INFO sqlalchemy.engine.Engine COMMIT


<sqlalchemy.engine.cursor.LegacyCursorResult at 0x1ac174027a0>

In [126]:
# 3. 만든 데이터베이스를 실행
# 위에서 실행하게 했으므로 주석처리
# engine.execute("USE Model")

2022-10-25 17:27:44,460 INFO sqlalchemy.engine.Engine USE Model
2022-10-25 17:27:44,461 INFO sqlalchemy.engine.Engine [raw sql] ()


<sqlalchemy.engine.cursor.LegacyCursorResult at 0x1ac1b6c97b0>

In [33]:
# 4. MySQL에 'model_var' 테이블 만들기
engine.execute("CREATE TABLE model_var ( modelvar_id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, Date DATE NOT NULL, Time TIME NOT NULL, Lot INT NOT NULL, pH FLOAT(3,2) NOT NULL, Temp FLOAT(4,2) NOT NULL, Voltage FLOAT(4,2) NOT NULL, sep INT NOT NULL )")

2022-11-09 11:17:06,848 INFO sqlalchemy.engine.Engine CREATE TABLE model_var ( modelvar_id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, Date DATE NOT NULL, Time TIME NOT NULL, Lot INT NOT NULL, pH FLOAT(3,2) NOT NULL, Temp FLOAT(4,2) NOT NULL, Voltage FLOAT(4,2) NOT NULL, sep INT NOT NULL )
2022-11-09 11:17:06,849 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-11-09 11:17:06,984 INFO sqlalchemy.engine.Engine COMMIT


<sqlalchemy.engine.cursor.LegacyCursorResult at 0x1d7d85d63e0>

In [34]:
# 5. DataFrame들 df.to_sql() 로 올리기
working_type = {'Date':sqlalchemy.types.DATE(),
                'Time':sqlalchemy.types.TIME(),
                'Lot':sqlalchemy.types.INT(),
                'pH':sqlalchemy.types.FLOAT(),
                'Temp':sqlalchemy.types.FLOAT(),
                'Voltage':sqlalchemy.types.FLOAT(),
                'sep':sqlalchemy.types.INT()
}

working_df.to_sql(name='model_var', con=engine, if_exists='append', index=False, dtype=working_type)

2022-11-09 11:17:10,087 INFO sqlalchemy.engine.Engine SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = %s AND table_name = %s
2022-11-09 11:17:10,088 INFO sqlalchemy.engine.Engine [generated in 0.00099s] ('Model', 'model_var')
2022-11-09 11:17:10,247 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-11-09 11:17:10,414 INFO sqlalchemy.engine.Engine INSERT INTO model_var (`Date`, `Time`, `Lot`, `pH`, `Temp`, `Voltage`, sep) VALUES (%s, %s, %s, %s, %s, %s, %s)
2022-11-09 11:17:10,415 INFO sqlalchemy.engine.Engine [generated in 0.13121s] (('2021-09-06', '16:29:54', 1, 2.15, 43.15, 19.74, 0), ('2021-09-06', '16:29:59', 1, 2.08, 40.13, 18.01, 0), ('2021-09-06', '16:30:04', 1, 2.18, 43.46, 18.73, 0), ('2021-09-06', '16:30:09', 1, 1.99, 41.72, 16.75, 0), ('2021-09-06', '16:30:14', 1, 1.85, 43.65, 18.02, 0), ('2021-09-06', '16:30:19', 1, 1.94, 42.82, 19.27, 0), ('2021-09-06', '16:30:24', 1, 1.94, 43.17, 17.4, 0), ('2021-09-06', '16:30:29', 1, 2.06, 44.16, 18.69, 0)  ... dis

50094

In [146]:
# 6. SQL 상에 ID 컬럼을 추가
# MySQL 상에서 data type에 pk를 체크해주었다. → 이 과정 필요없어서 주석처리
# engine.execute("ALTER TABLE var_model ADD varmodel_id INT NOT NULL AUTO_INCREMENT, ADD INDEX (varmodel_id);")