# Classification model

## 1. Import libraries and data

In [1]:
# Import libraries

# Basic
import numpy as np
import pandas as pd
import datetime
from pandas_profiling import ProfileReport
from imblearn.under_sampling import RandomUnderSampler

# Models and metrics
from sklearn.metrics import accuracy_score, precision_score
import lightgbm as lgb
from sklearn.model_selection import train_test_split

# Notebook settings
import warnings
# Suppress every warning for the rest of the session and lift the pandas
# limit on displayed columns so wide frames are rendered in full.
warnings.simplefilter('ignore')
pd.options.display.max_columns = None

In [2]:
# Read the labelled training set (path is relative to the working directory).
# The file's first column has no header, so pandas names it 'Unnamed: 0'.
df = pd.read_csv(filepath_or_buffer='IPA.csv')
df

Unnamed: 0,IsIPA,OG,FG,ABV,IBU,Color,BoilTime,BoilGravity,PitchRate,Efficiency,UserId
0,False,1.069,1.007,8.12,0.00,30.48,60,,,75.0,
1,False,1.064,1.012,6.80,9.36,9.85,60,1.132,0.50,35.0,
2,False,1.061,1.015,6.08,28.31,35.83,60,1.044,0.35,83.0,42087.0
3,False,1.053,1.012,5.44,46.48,5.77,60,1.033,,70.0,
4,False,1.053,1.017,4.64,42.29,4.22,90,1.039,0.50,77.0,14729.0
...,...,...,...,...,...,...,...,...,...,...,...
36995,False,1.069,1.012,7.39,71.03,3.95,75,1.051,,70.0,
36996,False,1.063,1.016,6.22,0.00,7.41,60,1.058,,70.0,
36997,True,1.056,1.015,5.50,38.05,20.59,60,1.089,0.75,70.0,
36998,True,1.068,1.019,6.40,65.08,9.90,60,1.050,,70.0,65316.0


In [3]:
# Optional exploratory step, currently disabled: build a ProfileReport for df
# and write it to an HTML file.
# profile = ProfileReport(df, title = "Profiling Report Classification")
# profile.to_file(output_file = "Classification_Report.html")

In [4]:
# Split the frame into the target (y) and the predictors (X).
# Only 'IsIPA' is removed, so every other loaded column -- including
# 'Unnamed: 0' and 'UserId' -- stays in X as a feature.
# NOTE(review): 'Unnamed: 0' looks like a saved row index -- confirm it is
# really meant to be used as a feature.
y = df['IsIPA']
X = df.drop(columns=['IsIPA'])

In [5]:
# Undersampling (disabled alternative to the plain split in the next cell):
# resample X and y with RandomUnderSampler first, then run the same 75/25
# train/test split on the resampled data.
# from imblearn.under_sampling import RandomUnderSampler
# rus = RandomUnderSampler(random_state=0)
# X_resampled, y_resampled = rus.fit_resample(X, y)
# X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=.25, random_state =123)

In [6]:
# Hold out 25% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=123,
)

In [7]:
# Baseline LightGBM model with default hyperparameters; only the importance
# type and the seed are set explicitly.
lgbm_c1 = lgb.LGBMClassifier(seed=123, importance_type='gain').fit(X_train, y_train)

# score() reports mean accuracy; the 'Val' figure is computed on the
# held-out X_test / y_test split.
print('Accuracy Train:', lgbm_c1.score(X_train, y_train))
print('Accuracy Val:', lgbm_c1.score(X_test, y_test))

Accuracy Train: 0.8872792792792793
Accuracy Val: 0.8654054054054054


In [8]:
# Hyperparameters for the tuned model, using LightGBM's native parameter names.
# NOTE(review): a tree limited to max_depth=5 can have at most 2**5 = 32
# leaves, so num_leaves=70 is probably never the binding limit -- confirm.
params1 = dict(
    boosting_type='gbdt',
    feature_fraction=0.8,   # share of columns sampled
    bagging_fraction=0.8,   # share of rows sampled ...
    bagging_freq=10,        # ... with bagging performed every 10 iterations
    max_depth=5,
    num_leaves=70,
    learning_rate=0.04,
    num_iterations=300,
)

In [9]:
# Second model: same seed and importance type as the baseline, plus the
# hand-picked hyperparameters from params1.
lgbm_c2 = lgb.LGBMClassifier(seed=123, importance_type='gain', **params1).fit(X_train, y_train)

# Same accuracy report as for the baseline, for a side-by-side comparison.
print('Accuracy Train:', lgbm_c2.score(X_train, y_train))
print('Accuracy Val:', lgbm_c2.score(X_test, y_test))

Accuracy Train: 0.8831351351351351
Accuracy Val: 0.8668108108108108


In [10]:
# Read the unlabelled set to be scored; it has the same columns as the
# training file except for the 'IsIPA' target.
test = pd.read_csv(filepath_or_buffer='IPA_test.csv')
test

Unnamed: 0,OG,FG,ABV,IBU,Color,BoilTime,BoilGravity,PitchRate,Efficiency,UserId
0,1.045,1.008,4.78,27.81,4.65,60,1.037,0.50,76.0,
1,1.052,1.010,5.56,35.98,12.90,60,1.041,0.50,80.0,56565.0
2,1.079,1.021,7.64,64.83,41.03,75,1.058,1.00,70.0,15163.0
3,1.060,1.018,5.47,16.45,16.55,60,1.044,0.35,70.0,
4,1.052,1.013,5.03,37.03,44.77,60,1.038,,70.0,14759.0
...,...,...,...,...,...,...,...,...,...,...
4995,1.080,1.019,7.99,172.04,7.85,60,1.061,,70.0,
4996,1.056,1.015,5.46,32.03,12.01,60,1.044,1.25,76.0,
4997,1.096,1.024,9.41,54.34,50.00,60,1.070,,75.0,
4998,1.057,1.014,5.63,46.67,32.77,60,1.042,,70.0,19734.0


In [11]:
# Add prediction
# Feed the model exactly the columns it was trained on, in training order.
# Passing the whole frame only works on the first run: once 'prediction' has
# been added to `test`, re-running this cell would hand the model an extra
# column. Selecting X_train.columns keeps the cell idempotent.
test['prediction'] = lgbm_c2.predict(test[X_train.columns])

In [12]:
# Display the test frame again, now including the 'prediction' column.
test

Unnamed: 0,OG,FG,ABV,IBU,Color,BoilTime,BoilGravity,PitchRate,Efficiency,UserId,prediction
0,1.045,1.008,4.78,27.81,4.65,60,1.037,0.50,76.0,,False
1,1.052,1.010,5.56,35.98,12.90,60,1.041,0.50,80.0,56565.0,False
2,1.079,1.021,7.64,64.83,41.03,75,1.058,1.00,70.0,15163.0,False
3,1.060,1.018,5.47,16.45,16.55,60,1.044,0.35,70.0,,False
4,1.052,1.013,5.03,37.03,44.77,60,1.038,,70.0,14759.0,False
...,...,...,...,...,...,...,...,...,...,...,...
4995,1.080,1.019,7.99,172.04,7.85,60,1.061,,70.0,,True
4996,1.056,1.015,5.46,32.03,12.01,60,1.044,1.25,76.0,,False
4997,1.096,1.024,9.41,54.34,50.00,60,1.070,,75.0,,False
4998,1.057,1.014,5.63,46.67,32.77,60,1.042,,70.0,19734.0,False


In [13]:
# Count how many rows were assigned to each predicted class.
test['prediction'].value_counts()

False    3484
True     1516
Name: prediction, dtype: int64

In [15]:
# Write only the prediction column to the submission file, omitting the index.
test['prediction'].to_csv('MLWizards_IPA_prediction.csv', index=False)