### This notebook shares the code used to train logistic regression and random forest models to predict the TAPPI value. Please read the article and the README.md file for more information. Do not hesitate to reach out if you think something looks strange :)

In [None]:
# IMPORT THE NECESSARY LIBRARIES

import joblib
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, classification_report, roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, RepeatedStratifiedKFold, GridSearchCV

In [None]:
# Read the dataset from an Excel workbook into a DataFrame.
# Replace the placeholder below with the path to your own file.

data = pd.read_excel(io='name_of_the_excel_sheet_with_dataset')

In [None]:
# Select the input (feature) columns as a DataFrame.
# Replace the placeholder with the names of the input columns in your dataset.

X = data.loc[:, ['name_of_the_input_columns']]

In [None]:
# Select the output (target) column as a 1-D Series.
# Single brackets are used on purpose: double brackets would produce a
# single-column DataFrame, and scikit-learn classifiers expect a 1-D target
# (a column-vector target makes them emit a conversion warning on every fit).
# Replace the placeholder with the name of the output column in your dataset.

y = data['name_of_the_output_column']

In [None]:
# Split X (inputs) and y (output) into a train set (80%) and a test set (20%),
# stratified on y so that each class keeps the same proportion in both sets.

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.20, random_state=33)

# n_splits=1, so the splitter yields exactly one (train, test) pair of
# positional row indices; take it directly instead of looping.
train_index, test_index = next(sss.split(X, y))
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# Report the split sizes and preview the training inputs rather than
# dumping every row index and the whole frame.
print("TRAIN:", len(train_index), "TEST:", len(test_index))
X_train.head()

In [None]:
# Grid-search random forest hyperparameters and report the cross-validated
# accuracy of every candidate.

# max_depth is capped at 6 to limit overfitting.
model = RandomForestClassifier(random_state=40, max_depth=6)

# Candidate values: 100 to 119 trees, and two settings for max_features.
n_estimators = range(100, 120)
max_features = ['sqrt', 'log2']
grid = {'n_estimators': n_estimators, 'max_features': max_features}

# Repeated stratified k-fold: 10 folds (n_splits), repeated 3 times
# (n_repeats) with a different randomization in each repetition. The folds
# preserve the percentage of samples of each class.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    error_score=0,
)
grid_result = grid_search.fit(X_train, y_train)

# Best candidate first, then mean (std) accuracy for every candidate.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means, stds, params = (
    grid_result.cv_results_[key]
    for key in ('mean_test_score', 'std_test_score', 'params')
)
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
# Retrieve the best random forest found by the grid search.
# The search above was created without a `refit` argument, so GridSearchCV
# uses its default and has already refitted the best configuration on the
# whole training set; fitting it again here would only repeat that work.

print("Best parameters:", grid_search.best_params_)
model = grid_search.best_estimator_
model

In [None]:
# Predict on both the training and the test inputs; comparing the two sets of
# predictions shows whether the random forest model overfits or not.

train_pred, y_pred = model.predict(X_train), model.predict(X_test)

In [None]:
# Plot the confusion matrix of the random forest on the test data as an
# annotated heatmap, then print the test accuracy.

cm = confusion_matrix(y_test, y_pred)
ax = sns.heatmap(cm, cmap='RdYlBu', annot=True, fmt=".0f", annot_kws={"size": 13.5})

# NOTE(review): the tick labels assume six classes labelled 1-6 -- confirm
# against the dataset.
for axis in (ax.xaxis, ax.yaxis):
    axis.set_ticklabels([1,2,3,4,5,6])

ax.set_xlabel('Predictions', fontsize=17)
ax.set_ylabel('True values', fontsize=17)
ax.set_title('Confusion matrix', fontsize=17)
plt.show()

print('test acc:', model.score(X_test, y_test))

In [None]:
# Plot the confusion matrix of the random forest on the training data as an
# annotated heatmap, then print the training accuracy.

cm = confusion_matrix(y_train, train_pred)
ax = sns.heatmap(cm, cmap='RdYlBu', annot=True, fmt=".0f", annot_kws={"size": 13.5})

# NOTE(review): the tick labels assume six classes labelled 1-6 -- confirm
# against the dataset.
for axis in (ax.xaxis, ax.yaxis):
    axis.set_ticklabels([1,2,3,4,5,6])

ax.set_xlabel('Predictions', fontsize=17)
ax.set_ylabel('True values', fontsize=17)
ax.set_title('Confusion matrix', fontsize=17)
plt.show()

print('train acc:', model.score(X_train, y_train))

In [None]:
# Write the random forest model to disk with joblib for future use.
# Replace the placeholder with the file name you want.

filename = 'name_of_the_random_forest_model.sav'
joblib.dump(value=model, filename=filename)

In [None]:
# Grid-search logistic regression hyperparameters and report the
# cross-validated accuracy of every candidate.

# NOTE(review): recent scikit-learn releases deprecate the `multi_class`
# argument -- confirm against the installed version.
model_2 = LogisticRegression(multi_class='multinomial')

solvers = ['newton-cg', 'lbfgs', 'sag', 'saga']
penalty = ['l2','l1', 'elasticnet']
c_values = [100, 10, 1.0, 0.1, 0.01]

grid = dict(solver=solvers,penalty=penalty,C=c_values)

# Repeated stratified k-fold: 10 folds (n_splits), repeated 3 times
# (n_repeats) with a different randomization in each repetition. The folds
# preserve the percentage of samples of each class.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=44)

# The estimator must be the logistic regression (model_2): the grid above
# holds logistic-regression parameters (solver, penalty, C), which the
# random forest stored in `model` does not have.
# error_score=0 scores a candidate whose fit raises as 0 instead of aborting
# the search (presumably because not every solver/penalty pair is valid).
grid_search = GridSearchCV(estimator=model_2, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X_train, y_train)

# Best candidate first, then mean (std) accuracy for every candidate.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']

for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
# Retrieve the best model found by the most recent grid search.
# The search was created without a `refit` argument, so GridSearchCV uses its
# default and has already refitted the best configuration on the whole
# training set; fitting it again here would only repeat that work.

print("Best parameters:", grid_search.best_params_)
model_2 = grid_search.best_estimator_
model_2

In [None]:
# Predict on both the training and the test inputs with model_2; comparing the
# two sets of predictions shows whether the logistic regression model
# overfits or not.

train_pred_2, y_pred_2 = model_2.predict(X_train), model_2.predict(X_test)

In [None]:
# Plot the confusion matrix of model_2 on the test data as an annotated
# heatmap, then print the test accuracy.

cm = confusion_matrix(y_test, y_pred_2)
ax = sns.heatmap(cm, cmap='RdYlBu', annot=True, fmt=".0f", annot_kws={"size": 13.5})

# NOTE(review): the tick labels assume six classes labelled 1-6 -- confirm
# against the dataset.
for axis in (ax.xaxis, ax.yaxis):
    axis.set_ticklabels([1,2,3,4,5,6])

ax.set_xlabel('Predictions', fontsize=17)
ax.set_ylabel('True values', fontsize=17)
ax.set_title('Confusion matrix', fontsize=17)
plt.show()

print('test acc:', model_2.score(X_test, y_test))

In [None]:
# Plot the confusion matrix of model_2 on the training data as an annotated
# heatmap, then print the training accuracy.

cm = confusion_matrix(y_train, train_pred_2)
ax = sns.heatmap(cm, cmap='RdYlBu', annot=True, fmt=".0f", annot_kws={"size": 13.5})

# NOTE(review): the tick labels assume six classes labelled 1-6 -- confirm
# against the dataset.
for axis in (ax.xaxis, ax.yaxis):
    axis.set_ticklabels([1,2,3,4,5,6])

ax.set_xlabel('Predictions', fontsize=17)
ax.set_ylabel('True values', fontsize=17)
ax.set_title('Confusion matrix', fontsize=17)
plt.show()

print('train acc:', model_2.score(X_train, y_train))

In [None]:
# Write the logistic regression model (model_2) to disk with joblib for
# future use. Replace the placeholder with the file name you want.

filename = 'name_of_the_logistic_regression_model.sav'
joblib.dump(value=model_2, filename=filename)