In [10]:
# Mount Google Drive at /content/drive. The `google.colab` package is only
# importable inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## EDA

In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load the raw measurements and preprocess them.
file_path = 'data.xlsx'
df = pd.read_excel(file_path, sheet_name='result_6')

# Rename the columns and drop the row that is not needed.
df.columns = [
    'DateTime', 'Power_Unit3', 'OutletSox_Unit3', 'Compliance_Unit3', 'Power_Unit4', 'OutletSox_Unit4',
    'Compliance_Unit4', 'Power_Unit5', 'OutletSox_Unit5', 'Compliance_Unit5', 'Power_Unit6', 'OutletSox_Unit6',
    'Compliance_Unit6', 'InletSox_Unit3', 'InletSox_Unit4', 'InletSox_Unit5', 'InletSox_Unit6',
    'Limestone_Unit3', 'Limestone_Unit4', 'Limestone_Unit5', 'Limestone_Unit6'
]
df = df.drop(0)  # drop the row labelled 0 (the first row under read_excel's default index)
df['DateTime'] = pd.to_datetime(df['DateTime'], errors='coerce')  # unparseable dates become NaT

# Convert every measurement column to a numeric dtype (unparseable cells become NaN).
# Assign by column label so the columns are *replaced* with numeric ones:
# `df.iloc[:, 1:] = ...` writes into the existing columns in place on pandas >= 2.0,
# which leaves them as object dtype and breaks the later describe()/std computations.
value_columns = df.columns[1:]
df[value_columns] = df[value_columns].apply(pd.to_numeric, errors='coerce')

# Replace NaN with the column mean. numeric_only=True keeps the DateTime column
# out of the mean computation, so NaT values in DateTime are left untouched.
df = df.fillna(df.mean(numeric_only=True))

# SOx removed by Unit 3: inlet concentration minus outlet concentration.
df['SoxDiff_Unit3'] = df['InletSox_Unit3'] - df['OutletSox_Unit3']

# Keep only rows with a non-negative SoxDiff_Unit3. `.copy()` makes the result an
# independent frame so later cells can add columns without SettingWithCopyWarning.
df = df[df['SoxDiff_Unit3'] >= 0].copy()

In [12]:
# NOTE(review): the calls to find_optimal_limestone_soxdiff visible in this notebook
# do not pass `threshold` / `increment`, so that function's own defaults are used
# instead of the values computed here — confirm this is intended.

# Summary statistics of SoxDiff_Unit3.
sox_diff_stats = df['SoxDiff_Unit3'].describe()
print(sox_diff_stats)

# Standard deviation of SoxDiff_Unit3.
std_sox_diff = sox_diff_stats['std']

# Threshold: 5% of the SoxDiff_Unit3 standard deviation (the factor is 0.05).
threshold = std_sox_diff * 0.05
print(f"Calculated threshold: {threshold}")

# Summary statistics of the Unit 3 limestone usage.
limestone_stats = df['Limestone_Unit3'].describe()
print(limestone_stats)

# Standard deviation of the limestone usage.
std_limestone = limestone_stats['std']

# Increment: 1% of the limestone standard deviation.
increment = std_limestone * 0.01
print(f"Calculated increment: {increment}")

count    49591.000000
mean       204.926885
std         54.463480
min          0.000000
25%        175.780789
50%        200.166190
75%        240.196111
max        396.452308
Name: SoxDiff_Unit3, dtype: float64
Calculated threshold: 2.7231739792085605
count    49591.000000
mean         8.046738
std          5.734882
min         -0.286702
25%          5.130902
50%          6.677245
75%          9.503023
max         72.261812
Name: Limestone_Unit3, dtype: float64
Calculated increment: 0.05734882092225625


In [13]:
# Bucket the Unit 3 inlet SOx values into ranges and compute the mean limestone usage per range.
bins = [0, 100, 150, 200, 250, 300, float('inf')]
labels = ['0-100', '100-150', '150-200', '200-250', '250-300', '300+']
# Intervals are right-closed, e.g. (100, 150]. include_lowest=True also closes the
# first interval on the left so that an inlet SOx of exactly 0 lands in '0-100'
# instead of being silently dropped as NaN.
df['InletSox_Range'] = pd.cut(df['InletSox_Unit3'], bins=bins, labels=labels, include_lowest=True)

# Mean limestone usage per range. observed=False keeps every label in the result
# (even an empty one) and makes the categorical-groupby behaviour explicit rather
# than relying on the pandas default.
avg_limestone_usage = (
    df.groupby('InletSox_Range', observed=False)['Limestone_Unit3']
    .mean()
    .reset_index()
)

# Print the result as a tab-separated table.
print("Inlet Sox 구간\t평균 석회석 사용량 (ton)")
for sox_range, mean_limestone in zip(avg_limestone_usage['InletSox_Range'],
                                     avg_limestone_usage['Limestone_Unit3']):
    print(f"{sox_range}\t{mean_limestone:.6f}")

Inlet Sox 구간	평균 석회석 사용량 (ton)
0-100	1.783524
100-150	4.996903
150-200	6.950159
200-250	8.337030
250-300	10.267330
300+	14.374816


## GBR 모델

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Feature matrix (power, inlet SOx, limestone) and target (SoxDiff) for Unit 3.
feature_columns = ['Power_Unit3', 'InletSox_Unit3', 'Limestone_Unit3']
X_soxdiff = df[feature_columns]
y_soxdiff = df['SoxDiff_Unit3']

# Hold out 20% of the rows for evaluation; the seed is fixed for reproducibility.
split_parts = train_test_split(X_soxdiff, y_soxdiff, test_size=0.2, random_state=42)
X_train_soxdiff, X_test_soxdiff, y_train_soxdiff, y_test_soxdiff = split_parts

# Hyperparameters for the regressor (same values as the "Best Parameters"
# reported by the grid-search cell of this notebook).
gbr_params = {
    'random_state': 42,
    'learning_rate': 0.1,
    'max_depth': 3,
    'min_samples_leaf': 2,
    'min_samples_split': 5,
    'n_estimators': 100,
}
model_soxdiff = GradientBoostingRegressor(**gbr_params)
model_soxdiff.fit(X_train_soxdiff, y_train_soxdiff)

# Predict on the held-out rows.
y_pred_soxdiff = model_soxdiff.predict(X_test_soxdiff)

# Evaluate with mean squared error and R^2 on the held-out rows.
mse_soxdiff = mean_squared_error(y_test_soxdiff, y_pred_soxdiff)
r2_soxdiff = r2_score(y_test_soxdiff, y_pred_soxdiff)

print(f'MSE: {mse_soxdiff}')
print(f'R2: {r2_soxdiff}')


MSE: 73.756663438496
R2: 0.9744931022955279


## 하이퍼 파라미터 튜닝(GridSearchCV)

In [19]:
# NOTE(review): this whole cell is commented out, so nothing here runs.
# `optimal_train_size` is not defined in any cell visible in this notebook —
# define it before re-enabling the cell.
# from sklearn.model_selection import GridSearchCV

# # Split the data using the optimal training-set size
# optimal_X_train, _, optimal_y_train, _ = train_test_split(
#     X_soxdiff, y_soxdiff, train_size=optimal_train_size, random_state=42
# )

# # Hyperparameter tuning
# param_grid = {
#     'n_estimators': [100, 200, 300],
#     'learning_rate': [0.01, 0.1, 0.2],
#     'max_depth': [3, 4, 5],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4]
# }

# grid_search = GridSearchCV(
#     estimator=GradientBoostingRegressor(random_state=42),
#     param_grid=param_grid,
#     cv=5,
#     scoring='r2',
#     n_jobs=-1
# )

# grid_search.fit(optimal_X_train, optimal_y_train)

# # Print the best hyperparameters
# print(f'Best Parameters: {grid_search.best_params_}')
# print(f'Best Cross-Validation R2 Score: {grid_search.best_score_}')

Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}
Best Cross-Validation R2 Score: 0.9695203451723536


## InletSox 구간별 평균 limestone 사용량보다 낮은 초기값을 반환하는 함수

In [21]:
# Inclusive upper bound of each inlet-SOx band, paired with the limestone value
# used for that band (per the section heading: the band's mean limestone usage).
# NOTE(review): several of these values differ from the per-range means printed by
# the EDA cell of this notebook (e.g. 1.83524 here vs 1.783524 printed for 0-100,
# 6.843625 here vs 6.950159 printed for 150-200) — confirm which run they came from.
_LIMESTONE_MEAN_BY_INLET_SOX = (
    (100, 1.83524),
    (150, 4.996510),
    (200, 6.843625),
    (250, 8.337030),
    (300, 10.207591),
    (float('inf'), 14.374816),
)

# Amount subtracted from the band value so the search starts below it.
_INITIAL_LIMESTONE_MARGIN = 0.5


def get_initial_limestone(inlet_sox):
    """Return the starting limestone amount for a given inlet SOx value.

    The value is looked up by band (<=100, <=150, <=200, <=250, <=300, above 300)
    and reduced by a fixed margin of 0.5. Anything at or below 100, including
    negative values, uses the first band.

    Args:
        inlet_sox: Inlet SOx value (int or float).

    Returns:
        float: Band value minus 0.5.

    Raises:
        ValueError: If ``inlet_sox`` is NaN. Every comparison with NaN is False, so
            without this check NaN would silently be treated as the "above 300" band.
    """
    import math

    if math.isnan(inlet_sox):
        raise ValueError("inlet_sox must be a number, got NaN")

    for upper_bound, band_value in _LIMESTONE_MEAN_BY_INLET_SOX:
        if inlet_sox <= upper_bound:
            return band_value - _INITIAL_LIMESTONE_MARGIN
    # Unreachable for non-NaN input because the last bound is +inf; kept as a safe default.
    return _LIMESTONE_MEAN_BY_INLET_SOX[-1][1] - _INITIAL_LIMESTONE_MARGIN

# Mean Unit 3 power over the preprocessed data. find_optimal_limestone_soxdiff
# reads this global as the power value when its `power_unit3` argument is None.
average_power_unit3 = df['Power_Unit3'].mean()

## limestone 최적량 도출 함수

In [22]:
def find_optimal_limestone_soxdiff(model, inlet_sox, power_unit3=None, threshold=0.01, increment=0.1, max_iterations=1000, verbose=True):
    """Step the limestone amount upward until the predicted SoxDiff stops improving.

    Starting from ``get_initial_limestone(inlet_sox)``, the limestone amount is
    raised by ``increment`` per iteration and ``model`` is asked for the SoxDiff
    at each step. The search stops when either

    * the prediction changes by less than ``threshold`` for three consecutive
      steps (plateau) — the limestone of the first step of that plateau is
      returned together with the latest prediction, or
    * the prediction drops below the previous step's prediction — the previous
      step's limestone and prediction are returned.

    Args:
        model: Object whose ``predict`` accepts a one-row DataFrame with the columns
            ``Power_Unit3``, ``InletSox_Unit3`` and ``Limestone_Unit3``.
        inlet_sox: Inlet SOx value to optimise for.
        power_unit3: Power value fed to the model; when None the module-level
            ``average_power_unit3`` is used.
        threshold: Maximum absolute change between consecutive predictions that
            still counts as "no change".
        increment: Limestone step added on every iteration.
        max_iterations: Upper bound on the number of steps.
        verbose: When True (default) print one progress line per iteration.

    Returns:
        tuple: ``(limestone, sox_diff)``. ``limestone`` is None when the iteration
        budget is exhausted without the last step being part of a plateau; both
        values are None when ``max_iterations`` is 0.
    """
    if power_unit3 is None:
        power_unit3 = average_power_unit3

    current_limestone = get_initial_limestone(inlet_sox)
    # Initialised up front so the "prediction decreased" branch can never hit an
    # unbound name.
    previous_limestone = current_limestone
    previous_sox_diff = 0
    current_sox_diff = None  # stays None only if the loop body never runs
    consecutive_count = 0  # number of consecutive steps inside the threshold
    first_optimal_limestone = None  # limestone at the first step of the current plateau

    for iteration in range(max_iterations):
        current_limestone += increment
        data = pd.DataFrame({
            'Power_Unit3': [power_unit3],
            'InletSox_Unit3': [inlet_sox],
            'Limestone_Unit3': [current_limestone]
        })
        current_sox_diff = model.predict(data)[0]

        if verbose:
            # Progress output for debugging.
            print(f"Iteration: {iteration}, Limestone: {current_limestone}, SoxDiff: {current_sox_diff}, Previous SoxDiff: {previous_sox_diff}")

        # The first step has no real predecessor (previous_sox_diff is just the
        # placeholder 0), so comparisons only start from the second step.
        if iteration > 0:
            if abs(current_sox_diff - previous_sox_diff) < threshold:
                consecutive_count += 1
                if consecutive_count == 1:
                    first_optimal_limestone = current_limestone
                if consecutive_count >= 3:
                    break
            else:
                # Streak broken: forget the candidate so a stale value is never returned.
                consecutive_count = 0
                first_optimal_limestone = None

            # If the prediction decreased, stop and report the previous step as the optimum.
            if current_sox_diff < previous_sox_diff:
                return previous_limestone, previous_sox_diff

        previous_sox_diff = current_sox_diff
        previous_limestone = current_limestone

    return first_optimal_limestone, current_sox_diff

In [26]:
# Example: run the search for a single inlet SOx value. No threshold/increment is
# passed, so the function's default arguments are used.
inlet_sox = 200 # This can be input by the user
optimal_limestone, final_sox_diff = find_optimal_limestone_soxdiff(model_soxdiff, inlet_sox)

print(f"Optimal Limestone: {optimal_limestone}")
print(f"Final SoxDiff: {final_sox_diff}")

Iteration: 0, Limestone: 6.443625, SoxDiff: 197.4168173205311, Previous SoxDiff: 0
Iteration: 1, Limestone: 6.543625, SoxDiff: 197.49874923752193, Previous SoxDiff: 197.4168173205311
Iteration: 2, Limestone: 6.643624999999999, SoxDiff: 197.57383441236135, Previous SoxDiff: 197.49874923752193
Iteration: 3, Limestone: 6.743624999999999, SoxDiff: 197.57383441236135, Previous SoxDiff: 197.57383441236135
Iteration: 4, Limestone: 6.8436249999999985, SoxDiff: 197.57383441236135, Previous SoxDiff: 197.57383441236135
Iteration: 5, Limestone: 6.943624999999998, SoxDiff: 197.57383441236135, Previous SoxDiff: 197.57383441236135
Optimal Limestone: 6.743624999999999
Final SoxDiff: 197.57383441236135


## inletSox 입력에 따른 석회석 최적치 출력 함수

In [32]:
def find_optimal_limestone():
    """Interactively look up the optimal limestone amount for user-entered inlet SOx values.

    Prompts repeatedly for an inlet SOx value, runs
    ``find_optimal_limestone_soxdiff`` with the module-level ``model_soxdiff`` and
    prints the result. Entering 0 ends the loop. Input that is not a finite number
    (unparseable text, ``nan``, ``inf``) is rejected with a message and the prompt
    is shown again.

    Only the parsing of the input is guarded: a ValueError raised by the model or
    the search itself propagates instead of being reported as invalid user input.
    """
    import math

    while True:
        raw_value = input("입력하신 inlet SOx 값을 입력하세요 (0을 입력하면 종료됩니다): ")
        try:
            inlet_sox = float(raw_value)
        except ValueError:
            inlet_sox = math.nan  # funnel unparseable text into the same rejection path

        # float() accepts "nan" and "inf"; neither is a usable SOx value.
        if not math.isfinite(inlet_sox):
            print("유효하지 않은 입력입니다. 숫자를 입력해 주세요.")
            continue

        if inlet_sox == 0:
            print("프로그램을 종료합니다...")
            break

        optimal_limestone, sox_diff = find_optimal_limestone_soxdiff(model_soxdiff, inlet_sox)

        print(f"\nInlet SOx 값 {inlet_sox}에 대한 최적의 석회석 양: {optimal_limestone}")
        print(f"해당 SoxDiff: {sox_diff}\n")

# Example usage: start the interactive prompt loop (enter 0 to quit).
find_optimal_limestone()

입력하신 inlet SOx 값을 입력하세요 (0을 입력하면 종료됩니다): 300
Iteration: 0, Limestone: 9.807591, SoxDiff: 293.71063859337335, Previous SoxDiff: 0
Iteration: 1, Limestone: 9.907591, SoxDiff: 293.71063859337335, Previous SoxDiff: 293.71063859337335
Iteration: 2, Limestone: 10.007591, SoxDiff: 293.71063859337335, Previous SoxDiff: 293.71063859337335
Iteration: 3, Limestone: 10.107591, SoxDiff: 293.71063859337335, Previous SoxDiff: 293.71063859337335

Inlet SOx 값 300.0에 대한 최적의 석회석 양: 9.907591
해당 SoxDiff: 293.71063859337335

입력하신 inlet SOx 값을 입력하세요 (0을 입력하면 종료됩니다): 100
Iteration: 0, Limestone: 1.43524, SoxDiff: 94.77943411037741, Previous SoxDiff: 0
Iteration: 1, Limestone: 1.5352400000000002, SoxDiff: 94.77943411037741, Previous SoxDiff: 94.77943411037741
Iteration: 2, Limestone: 1.6352400000000002, SoxDiff: 94.77943411037741, Previous SoxDiff: 94.77943411037741
Iteration: 3, Limestone: 1.7352400000000003, SoxDiff: 94.77943411037741, Previous SoxDiff: 94.77943411037741

Inlet SOx 값 100.0에 대한 최적의 석회석 양: 1.5