In [1]:
import pandas as pd

# Render matplotlib figures inline in the notebook
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import optuna
import lightgbm as lgbm

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, ExtraTreesClassifier, BaggingClassifier
from optuna.samplers import TPESampler
from optuna.pruners import SuccessiveHalvingPruner
from imblearn.over_sampling import SMOTE

warnings.filterwarnings(action='ignore')

In [2]:
# Read the training and test CSVs of the wine-quality classification data.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/wine-quality-classification/train.csv',
        './data/wine-quality-classification/test.csv',
    )
)

In [3]:
# Count missing values per column of the training frame.
train.isna().sum()

index                   0
quality                 0
fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
type                    0
dtype: int64

In [4]:
# Count missing values per column of the test frame.
test.isna().sum()

index                   0
fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
type                    0
dtype: int64

In [5]:
# Label encoder used below to turn the categorical 'type' column into integer codes.
enc = LabelEncoder()

In [6]:
# Learn the label-to-integer mapping from the training 'type' column only.
enc.fit(train['type'])

LabelEncoder()

In [7]:
# Replace the categorical 'type' column with the integer codes learned by `enc`.
# The numeric-dtype guard keeps this cell safe to re-run: the column is
# overwritten in place, and pushing already-encoded integers through
# enc.transform() again would raise because they are not known labels.
for frame in (train, test):
    if not pd.api.types.is_numeric_dtype(frame['type']):
        frame['type'] = enc.transform(frame['type'])

In [8]:
# Disabled alternative to label encoding.
# get_dummies(): creates independent dummy (one-hot) variables, converting
# categorical data into numeric data.
# train = pd.get_dummies(train)
# test = pd.get_dummies(test)

In [9]:
# Separate the target from the predictors and drop the columns that are not
# features ('index' in both frames, plus the 'quality' label in train).
train_y = train['quality']
train_x = train.drop(columns=['index', 'quality'])

test_x = test.drop(columns=['index'])

In [10]:
# Display the feature matrix to confirm 'index' and 'quality' were dropped.
train_x

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,type
0,5.6,0.695,0.06,6.8,0.042,9.0,84.0,0.99432,3.44,0.44,10.2,1
1,8.8,0.610,0.14,2.4,0.067,10.0,42.0,0.99690,3.19,0.59,9.5,0
2,7.9,0.210,0.39,2.0,0.057,21.0,138.0,0.99176,3.05,0.52,10.9,1
3,7.0,0.210,0.31,6.0,0.046,29.0,108.0,0.99390,3.26,0.50,10.8,1
4,7.8,0.400,0.26,9.5,0.059,32.0,178.0,0.99550,3.04,0.43,10.9,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5492,7.7,0.150,0.29,1.3,0.029,10.0,64.0,0.99320,3.35,0.39,10.1,1
5493,6.3,0.180,0.36,1.2,0.034,26.0,111.0,0.99074,3.16,0.51,11.0,1
5494,7.8,0.150,0.34,1.1,0.035,31.0,93.0,0.99096,3.07,0.72,11.3,1
5495,6.6,0.410,0.31,1.6,0.042,18.0,101.0,0.99195,3.13,0.41,10.5,1


In [11]:
# 5-fold splitter; shuffling with a fixed seed keeps the folds reproducible.
# NOTE(review): plain KFold ignores the labels passed to split(); consider
# StratifiedKFold if per-fold class balance of 'quality' matters.
Kf = KFold(n_splits=5, shuffle=True, random_state=0)

In [12]:
# Disabled experiments kept for reference: a hard-voting ensemble of a random
# forest and LightGBM, and min-max scaling of the features.
# NOTE(review): SVC is not among the imports visible at the top of the notebook,
# and the test set below gets its own freshly fitted MinMaxScaler; if this is
# re-enabled, fit one scaler on train_x and reuse it to transform test_x.

# lgbm_model = lgbm.LGBMClassifier()
# rnd_model = RandomForestClassifier()
# svm_model = SVC()

# model = VotingClassifier(
#     estimators=[('rf',rnd_model),('lgbm',lgbm_model)],
#     voting='hard'
# )

# train_x=MinMaxScaler().fit_transform(train_x)
# test_x=MinMaxScaler().fit_transform(test_x)

In [13]:
# Accumulators for the cross-validation loop, then the seeded baseline model.
val_scores, test_predictions = [], []

model = RandomForestClassifier(random_state=0)

In [17]:
# K-fold training loop: fit the model on each training split, score it on the
# held-out split, and collect one set of test predictions per fold.
#
# The accumulators are reset inside this cell so that re-running it starts from
# a clean slate instead of appending duplicates to lists left over in the kernel.
val_scores = []
test_predictions = []

for train_idx, valid_idx in Kf.split(train_x, train_y):
    X_tr, y_tr = train_x.iloc[train_idx], train_y.iloc[train_idx]
    X_val, y_val = train_x.iloc[valid_idx], train_y.iloc[valid_idx]

    model.fit(X_tr, y_tr)

    # Accuracy on the held-out fold, so the validation split is actually used.
    val_scores.append(model.score(X_val, y_val))

    # test_x is the test frame with 'index' already dropped; reuse it rather
    # than rebuilding the same frame on every iteration.
    test_predictions.append(model.predict(test_x))

print(f'fold accuracy: {[round(score, 4) for score in val_scores]}')
print(f'mean accuracy: {sum(val_scores) / len(val_scores):.4f}')

In [18]:
# Disabled alternative: a single fit on the full training set (no folds).
# model.fit(train_x, train_y)

In [19]:
# Disabled alternative: predict the test features with one fitted model.
# y_pred = model.predict(test_x)

In [20]:
# Majority vote across folds: one row per fold, one column per test sample.
# The stacked frame gets its own name so `test_predictions` stays a list and
# both this cell and the training loop remain safe to re-run.
fold_predictions = pd.DataFrame(test_predictions)

# mode() is computed column-wise and row 0 holds the first (smallest) mode of
# each sample. When any sample has tied modes, the extra rows are padded with
# NaN, which upcasts those columns to float; casting row 0 back to the label
# dtype keeps the submitted classes from turning into values such as 5.0.
test_prediction = fold_predictions.mode().iloc[0].astype(train_y.dtype).to_numpy()

In [21]:
# Load the sample submission file; its 'quality' column is filled in below.
submission = pd.read_csv('./data/wine-quality-classification/sample_submission.csv')

In [22]:
# Disabled alternative: submit the single-model predictions (y_pred).
# submission['quality'] = y_pred

In [23]:
# Write the per-sample vote across the fold predictions into the submission frame.
submission['quality'] = test_prediction

In [24]:
# Inspect the submission frame before saving it.
submission

Unnamed: 0,index,quality
0,0,5
1,1,5
2,2,6
3,3,5
4,4,6
...,...,...
995,995,6
996,996,6
997,997,5
998,998,6


In [25]:
# Save without the DataFrame index so only the submission columns are written.
submission.to_csv('submission_kfold.csv', index=False)