In [None]:
import os
# Walk the Kaggle input directory tree and print the full path of every file.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# import library

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# load data

In [None]:
# Read the competition's train and test splits.
# pd.read_csv already returns a DataFrame, so the frames are used directly
# instead of being wrapped in pd.DataFrame(...) a second time.
train = pd.read_csv('/kaggle/input/playground-series-s4e5/train.csv', encoding = 'utf-8')
test = pd.read_csv('/kaggle/input/playground-series-s4e5/test.csv', encoding = 'utf-8')

# Preview the first rows of the training frame.
train.head()

In [None]:
# Print the column labels and then the (rows, columns) shape, one per line.
print(train.columns, train.shape, sep = "\n")

# data check

In [None]:
# Print pandas' structural summary of the training frame (DataFrame.info).
train.info()

In [None]:
# Display pandas' descriptive statistics for the training frame.
train.describe()

In [None]:
# Draw one 30-bin histogram per column of the training frame,
# each on its own figure.
for col_name in train.columns:
    fig, ax = plt.subplots()
    ax.hist(train[col_name], bins = 30, color = 'pink', edgecolor = 'black')
    ax.set_title(f"histogram of {col_name}")
    ax.set_xlabel("value")
    ax.set_ylabel("frequency")
    plt.show()

In [None]:
# Pairwise correlation between all columns of the training frame.
correlation_matrix = train.corr()

# Boolean mask that hides the upper triangle (including the diagonal) so each
# pair is shown only once. To hide the lower triangle instead, use np.tril.
mask = np.triu(np.ones(correlation_matrix.shape, dtype = bool))

fig, ax = plt.subplots(figsize = (15, 15))
sns.heatmap(
    correlation_matrix,
    mask = mask,
    annot = True,
    fmt = ".2f",
    cmap = "coolwarm",
    vmin = -1,
    vmax = 1,
    ax = ax,
)
ax.set_title("correlation matrix")
plt.show()

# Missing values and outliers

In [None]:
# Draw a separate boxplot for every column of the training frame.
for col_name in train.columns:
    fig, ax = plt.subplots()
    train.boxplot(column = col_name, ax = ax)
    ax.set_title(f"Boxplot of {col_name}")
    plt.show()

In [None]:
def detect_outlier_iqr(column, whisker = 1.5):
    """Count the values of a column that fall outside the IQR fences.

    A value is counted as an outlier when it is strictly below
    ``Q1 - whisker * IQR`` or strictly above ``Q3 + whisker * IQR``.

    Parameters
    ----------
    column : pandas.Series
        Numeric values to inspect.
    whisker : float, default 1.5
        Multiplier applied to the inter-quartile range to place the fences.

    Returns
    -------
    int
        Number of values outside the fences (0 for an empty column, because
        comparisons against NaN quartiles are always False).
    """
    q1 = column.quantile(0.25)
    q3 = column.quantile(0.75)
    iqr = q3 - q1
    lower_boundary = q1 - iqr * whisker
    upper_boundary = q3 + iqr * whisker
    is_outlier = (column < lower_boundary) | (column > upper_boundary)
    # Use the vectorised Series.sum() rather than the builtin sum(), which
    # would iterate over the boolean Series one element at a time in Python.
    return int(is_outlier.sum())

In [None]:
# Run the IQR outlier counter on every column (axis 0 = column-wise)
# and print the resulting per-column counts.
outliersr_iqr = train.apply(detect_outlier_iqr, axis = 0)
print(outliersr_iqr)

In [None]:
def detect_outlier_zscore(column, threshold = 3):
    """Count the values of a column whose absolute z-score exceeds a threshold.

    The z-score is computed with the column's own mean and standard deviation
    (``Series.std()``).

    Parameters
    ----------
    column : pandas.Series
        Numeric values to inspect.
    threshold : float, default 3
        A value is an outlier when ``abs(zscore) > threshold``.

    Returns
    -------
    int
        Number of outliers. Returns 0 when the standard deviation is zero or
        undefined (constant, single-element or empty column), since no value
        can then be distinguished from the mean.
    """
    mean = column.mean()
    std_dev = column.std()
    # `not std_dev > 0` is True for both 0 and NaN, so this avoids dividing
    # by zero and relying on NaN comparison semantics further down.
    if not std_dev > 0:
        return 0
    zscore = (column - mean) / std_dev
    # Vectorised abs/sum instead of the Python builtins, which would iterate
    # over the Series element by element.
    return int((zscore.abs() > threshold).sum())

In [None]:
# Run the z-score outlier counter on every column (axis 0 = column-wise)
# and print the resulting per-column counts.
outliers_zscore = train.apply(detect_outlier_zscore, axis = 0)
print(outliers_zscore)

# Split features and target

In [None]:
# Target: the FloodProbability column of the training frame.
train_y = train.loc[:, "FloodProbability"]

# Features: everything except the target and the row identifier.
train_x = train.drop(["FloodProbability", "id"], axis = 1)

# The test frame has no target column here; only the identifier is removed.
test_x = test.drop('id', axis = 1)

In [None]:
# Sanity check: print the feature frame's shape and columns, then the target's
# shape and the test feature frame's shape.
print(f"{train_x.shape}, {train_x.columns} \n {train_y.shape} \n {test_x.shape}")

# 최종 데이터 완성
- train_x
- train_y

- test_x

# Modeling

- Linear Regression

# import library

In [None]:
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Hold out 20% of the training rows for validation; the fixed random_state
# makes the split reproducible.
x_train, x_val, y_train, y_val = train_test_split(
    train_x,
    train_y,
    test_size = 0.2,
    random_state = 2024,
)
print(*(part.shape for part in (x_train, x_val, y_train, y_val)))

In [None]:
# Fit a linear regression on the training split (fit() returns the estimator
# itself), then predict on the validation split.
LR = LinearRegression().fit(x_train, y_train)
y_pred = LR.predict(x_val)

In [None]:
# Score the validation predictions: mean squared error and R^2.
mse, r2 = mean_squared_error(y_val, y_pred), r2_score(y_val, y_pred)
print(f"MSE: {mse}, R2 score: {r2}")

In [None]:
# Predict on the test features and pair each prediction with its row id.
y_test_pred = LR.predict(test_x)
result_df = test[['id']].assign(FloodProbability = y_test_pred)
result_df

In [None]:
# DataFrame.to_csv does not create missing parent directories and raises
# OSError if the folder is absent, so make sure it exists before writing.
submission_path = "../result/submission.csv"
os.makedirs(os.path.dirname(submission_path), exist_ok = True)
result_df.to_csv(submission_path, index = False)