# Inisiasi Kebutuhan library dan functions

In [47]:
# Import library

# library kebutuhan
import os
import datetime
import zipfile
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import r2_score

# library algoritma
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

# Macam macam algoritma

# komen paket yang tidak digunakan
algorithm_pack = [[GaussianNB(), 'Naive Bayes', 'NB'],
                [KNeighborsClassifier(), 'K Nearest Neighbors', 'KNN'],
                [SVC(), 'Support Vector Machine', 'SVM'],
                [GradientBoostingClassifier(random_state=100,), 'Gradient Boosting', 'GB'],
                [DecisionTreeClassifier(random_state=100), 'Decision Tree', 'DT'],
                [RandomForestClassifier(random_state=100), 'Random Forest','RF'],
                [MLPClassifier(max_iter=1000), 'Multi-layer Perceptron', 'MLP']]

# Fungsi untuk menampilkan opsi algoritma apa yang ingin dipakai
def algorithm_option_string():
    string = ''
    for i in range(len(algorithm_pack)):
        string += f"{i}. {algorithm_pack[i][1]}"
    string += '\n['
    for i in range(len(algorithm_pack)):
        if(i == len(algorithm_pack) - 1):
            string+= f"{i}]: "
        else:
            string+= f"{i}/"
    return string

# Fungsi untuk print error dengan warna merah dan diberi jarak
def print_error(string):
    print("\n\033[91m"+string+"\033[0m\n")

# Fungsi untuk validasi input opsi integer
def get_integer_option(min_value, max_value, prompt):
    while True:
        try:
            value = int(input(prompt))
            if min_value <= value <= max_value:
                return value
            else:
                print_error(f"Option must be between {min_value} and {max_value}")
        except ValueError:
            print_error(f"Invalid option. Please enter a valid integer value between {min_value} and {max_value}")

def get_current_time(format=None):
    """Gets the current time in Python and optionally formats it.

    Args:
        format (str, optional): The desired format for the time string.
            Defaults to None, which returns the time in ISO 8601 format (YYYY-MM-DD HH:MM:SS).
            Common format codes include:
                - %Y: Year (e.g., 2024)
                - %m: Month (e.g., 02)
                - %d: Day of the month (e.g., 27)
                - %H: Hour in 24-hour format (e.g., 16)
                - %M: Minute (e.g., 58)
                - %S: Second (e.g., 24)
                - %f: Microseconds (e.g., 406569)

    Returns:
        str: The formatted current time string.
    """

    current_time = datetime.datetime.now()

    if format:
        return current_time.strftime(format)
    else:
        return current_time.isoformat()
    
current_time = get_current_time("%d-%m-%y_%H=%M=%S")
    
def remove_file(filename):
    if os.path.exists(filename):
        os.remove(filename)

# -- Baca data dan menyimpan data original

In [48]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
origin_test_data = test_data

# Melihat Info dari train data

In [25]:
train_data.head()

Unnamed: 0,id,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,...,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data,outcome
0,0,yes,adult,530001,38.1,132.0,24.0,cool,reduced,dark_cyanotic,...,57.0,8.5,serosanguious,3.4,yes,2209,0,0,no,died
1,1,yes,adult,533836,37.5,88.0,12.0,cool,normal,pale_cyanotic,...,33.0,64.0,serosanguious,2.0,yes,2208,0,0,no,euthanized
2,2,yes,adult,529812,38.3,120.0,28.0,cool,reduced,pale_pink,...,37.0,6.4,serosanguious,3.4,yes,5124,0,0,no,lived
3,3,yes,adult,5262541,37.1,72.0,30.0,cold,reduced,pale_pink,...,53.0,7.0,cloudy,3.9,yes,2208,0,0,yes,lived
4,4,no,adult,5299629,38.0,52.0,48.0,normal,normal,normal_pink,...,47.0,7.3,cloudy,2.6,no,0,0,0,yes,lived


In [26]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1235 entries, 0 to 1234
Data columns (total 29 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     1235 non-null   int64  
 1   surgery                1235 non-null   object 
 2   age                    1235 non-null   object 
 3   hospital_number        1235 non-null   int64  
 4   rectal_temp            1235 non-null   float64
 5   pulse                  1235 non-null   float64
 6   respiratory_rate       1235 non-null   float64
 7   temp_of_extremities    1196 non-null   object 
 8   peripheral_pulse       1175 non-null   object 
 9   mucous_membrane        1214 non-null   object 
 10  capillary_refill_time  1229 non-null   object 
 11  pain                   1191 non-null   object 
 12  peristalsis            1215 non-null   object 
 13  abdominal_distention   1212 non-null   object 
 14  nasogastric_tube       1155 non-null   object 
 15  naso

In [27]:
train_data.describe()

Unnamed: 0,id,hospital_number,rectal_temp,pulse,respiratory_rate,nasogastric_reflux_ph,packed_cell_volume,total_protein,abdomo_protein,lesion_1,lesion_2,lesion_3
count,1235.0,1235.0,1235.0,1235.0,1235.0,1235.0,1235.0,1235.0,1235.0,1235.0,1235.0,1235.0
mean,617.0,954500.4,38.202186,79.574089,30.054251,4.382591,49.602429,21.388016,3.290931,3832.496356,14.612146,3.577328
std,356.6581,1356403.0,0.788668,29.108638,16.452066,1.937357,10.5358,26.676453,1.589195,5436.733774,193.705735,88.858953
min,0.0,521399.0,35.4,30.0,8.0,1.0,23.0,3.5,0.1,0.0,0.0,0.0
25%,308.5,528800.0,37.8,53.0,18.0,2.0,43.0,6.6,2.0,2205.0,0.0,0.0
50%,617.0,529777.0,38.2,76.0,28.0,4.5,48.0,7.5,3.0,2209.0,0.0,0.0
75%,925.5,534145.0,38.6,100.0,36.0,6.0,57.0,9.1,4.3,3205.0,0.0,0.0
max,1234.0,5305129.0,40.8,184.0,96.0,7.5,75.0,89.0,10.1,41110.0,3112.0,2209.0


# -- Drop kolom yang tidak relevan dan set kolom yang ingin diprediksi

In [49]:
# Ganti isi dengan fitur yang sesuai dengan data
dropped_columns = ['id', 'hospital_number', 'lesion_2', 'lesion_3']
target_column = 'outcome'

categorical_features = train_data.drop(columns=dropped_columns + [target_column]).select_dtypes(include=['object']).columns.tolist()
numerical_features = train_data.drop(columns=dropped_columns + [target_column]).select_dtypes(exclude=['object']).columns.tolist()

train_data = train_data.drop(columns=dropped_columns)
test_data = test_data.drop(columns=dropped_columns)

# Melihat kembali apakah fitur kategorikal dan fitur numerikal sudah benar

In [29]:
print(categorical_features)
print(numerical_features)

['surgery', 'age', 'temp_of_extremities', 'peripheral_pulse', 'mucous_membrane', 'capillary_refill_time', 'pain', 'peristalsis', 'abdominal_distention', 'nasogastric_tube', 'nasogastric_reflux', 'rectal_exam_feces', 'abdomen', 'abdomo_appearance', 'surgical_lesion', 'cp_data']
['rectal_temp', 'pulse', 'respiratory_rate', 'nasogastric_reflux_ph', 'packed_cell_volume', 'total_protein', 'abdomo_protein', 'lesion_1']


# Melakukan Preprocessing sesuai kebutuhan pada data sebelum di latih

In [50]:
def null_to_none(s):
    if isinstance(s, str):
        return s
    else:
        return "none"

In [51]:
for categories in categorical_features:
    train_data[categories] =train_data[categories].apply(null_to_none)
    test_data[categories] =test_data[categories].apply(null_to_none)

In [32]:
train_data.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1225,1226,1227,1228,1229,1230,1231,1232,1233,1234
surgery,yes,yes,yes,yes,no,no,yes,no,no,yes,...,yes,yes,yes,yes,yes,yes,yes,yes,yes,yes
age,adult,adult,adult,adult,adult,adult,adult,adult,adult,adult,...,adult,young,adult,adult,adult,adult,adult,young,adult,adult
rectal_temp,38.1,37.5,38.3,37.1,38.0,38.1,38.3,39.2,37.4,38.3,...,39.3,36.1,38.5,38.3,38.0,38.5,37.5,37.5,38.1,38.1
pulse,132.0,88.0,120.0,72.0,52.0,56.0,36.0,114.0,48.0,129.0,...,54.0,60.0,48.0,48.0,136.0,129.0,60.0,84.0,70.0,54.0
respiratory_rate,24.0,12.0,28.0,30.0,48.0,32.0,16.0,24.0,12.0,48.0,...,51.0,20.0,16.0,20.0,20.0,48.0,50.0,40.0,16.0,36.0
temp_of_extremities,cool,cool,cool,cold,normal,normal,cool,cool,cool,cool,...,cool,cool,normal,normal,cool,cool,cool,normal,normal,normal
peripheral_pulse,reduced,normal,reduced,reduced,normal,normal,reduced,reduced,reduced,reduced,...,normal,reduced,normal,normal,reduced,reduced,reduced,reduced,reduced,normal
mucous_membrane,dark_cyanotic,pale_cyanotic,pale_pink,pale_pink,normal_pink,bright_pink,normal_pink,pale_cyanotic,normal_pink,pale_pink,...,bright_red,normal_pink,normal_pink,bright_pink,bright_red,pale_pink,pale_cyanotic,normal_pink,bright_red,pale_pink
capillary_refill_time,more_3_sec,more_3_sec,less_3_sec,more_3_sec,less_3_sec,less_3_sec,less_3_sec,more_3_sec,less_3_sec,less_3_sec,...,less_3_sec,less_3_sec,less_3_sec,more_3_sec,less_3_sec,more_3_sec,less_3_sec,less_3_sec,less_3_sec,less_3_sec
pain,depressed,mild_pain,extreme_pain,mild_pain,alert,depressed,severe_pain,mild_pain,alert,depressed,...,depressed,mild_pain,alert,depressed,depressed,depressed,mild_pain,mild_pain,mild_pain,mild_pain


In [33]:
test_data.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,814,815,816,817,818,819,820,821,822,823
surgery,no,yes,yes,no,yes,yes,yes,no,yes,no,...,no,yes,yes,yes,no,no,yes,yes,no,yes
age,adult,adult,adult,adult,adult,adult,adult,young,adult,adult,...,adult,adult,adult,adult,adult,adult,adult,adult,adult,adult
rectal_temp,38.6,38.2,37.7,37.1,38.3,38.5,40.8,40.3,38.3,38.3,...,38.6,38.1,37.7,37.2,39.6,40.3,37.2,39.2,38.3,38.1
pulse,40.0,112.0,66.0,88.0,50.0,104.0,114.0,114.0,66.0,64.0,...,130.0,84.0,88.0,60.0,128.0,114.0,100.0,132.0,54.0,66.0
respiratory_rate,20.0,48.0,12.0,20.0,12.0,36.0,36.0,36.0,12.0,22.0,...,60.0,16.0,14.0,44.0,51.0,36.0,20.0,12.0,66.0,12.0
temp_of_extremities,normal,cool,cool,cool,none,cool,cold,cool,cool,normal,...,cool,cold,cold,cold,cool,cool,cool,cool,normal,cold
peripheral_pulse,normal,reduced,normal,reduced,normal,normal,reduced,reduced,reduced,normal,...,reduced,reduced,absent,reduced,reduced,reduced,reduced,reduced,normal,normal
mucous_membrane,normal_pink,bright_pink,bright_red,pale_cyanotic,bright_pink,bright_red,dark_cyanotic,pale_cyanotic,pale_pink,bright_pink,...,pale_pink,dark_cyanotic,pale_cyanotic,pale_cyanotic,dark_cyanotic,normal_pink,pale_cyanotic,dark_cyanotic,normal_pink,normal_pink
capillary_refill_time,less_3_sec,more_3_sec,less_3_sec,less_3_sec,less_3_sec,more_3_sec,more_3_sec,more_3_sec,less_3_sec,less_3_sec,...,less_3_sec,less_3_sec,more_3_sec,more_3_sec,more_3_sec,more_3_sec,more_3_sec,more_3_sec,less_3_sec,less_3_sec
pain,mild_pain,depressed,mild_pain,depressed,mild_pain,severe_pain,depressed,depressed,mild_pain,depressed,...,depressed,depressed,depressed,depressed,depressed,depressed,extreme_pain,depressed,mild_pain,mild_pain


In [52]:
ordinal_categories = ['temp_of_extremities', 'peripheral_pulse', 'capillary_refill_time', 'pain', 'abdominal_distention', 'nasogastric_tube', 'nasogastric_reflux', 'rectal_exam_feces', 'abdomen']
non_ordinal_categories = [element for element in categorical_features if element not in ordinal_categories]

In [35]:
print(non_ordinal_categories)

['surgery', 'age', 'mucous_membrane', 'peristalsis', 'abdomo_appearance', 'surgical_lesion', 'cp_data']


In [53]:
for cols in ordinal_categories:
    print(cols + ": ")
    print(train_data[cols].unique())

temp_of_extremities: 
['cool' 'cold' 'normal' 'warm' 'none']
peripheral_pulse: 
['reduced' 'normal' 'none' 'absent' 'increased']
capillary_refill_time: 
['more_3_sec' 'less_3_sec' 'none' '3']
pain: 
['depressed' 'mild_pain' 'extreme_pain' 'alert' 'severe_pain' 'none'
 'slight']
abdominal_distention: 
['slight' 'moderate' 'none' 'severe']
nasogastric_tube: 
['slight' 'none' 'significant']
nasogastric_reflux: 
['less_1_liter' 'more_1_liter' 'none' 'slight']
rectal_exam_feces: 
['decreased' 'absent' 'none' 'normal' 'increased' 'serosanguious']
abdomen: 
['distend_small' 'distend_large' 'normal' 'firm' 'none' 'other']


In [54]:
ordinal_orders = [
    ['none', 'warm', 'normal', 'cool', 'cold'], # Kolom temp_of_extremities
    ['none', 'absent', 'reduced', 'normal', 'increased'], # Kolom peripheral_pulse
    ['none', 'less_3_sec', '3', 'more_3_sec'], # Kolom capillary_refill_time
    ['none', 'slight', 'mild_pain', 'alert', 'depressed', 'severe_pain', 'extreme_pain'], # Kolom pain
    ['none', 'slight', 'moderate', 'severe'], # Kolom abdominal_distention
    ['none', 'slight', 'significant'], # Kolom nasogastric_tube
    ['none', 'slight', 'less_1_liter', 'more_1_liter'], # Kolom nasogastric_reflux
    ['none', 'absent', 'decreased', 'normal', 'increased', 'serosanguious'], # Kolom rectal_exam_feces
    ['none', 'normal', 'firm', 'distend_small', 'distend_large', 'other'] # Kolom abdomen
]

In [55]:
# Ini adalah fungsi transformasi yang digunakan untuk fitur kategorikal non ordinal
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(sparse_output=False))
])  # OneHotEncoder mengubah kategorikal menjadi biner (numerikal)

# Ini adalah fungsi transformasi yang digunakan untuk fitur kategorikal ordinal
ordinal_transformer = Pipeline(steps=[
    ('ordinal', OrdinalEncoder(categories=ordinal_orders))
])

# Ini adalah fungsi transformasi yang digunakan untuk fitur numerikal non ordinal
numerical_imputer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),   # Mengisi null value dengan data median
    ('scaler', StandardScaler())                    # Melakukan standarisasi skalar
])

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, non_ordinal_categories),
        ('ord', ordinal_transformer, ordinal_categories),
        ('num', numerical_imputer, numerical_features)
    ], remainder='passthrough')

pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# -- Mulai memisah menjadi x_train, y_train, dan x_test (tidak perlu y_test karena yang diminta hasil prediksi bukan akurasi)

In [68]:
x_train_visualize = train_data
x_train = train_data.drop(columns=[target_column])
y_train = train_data[target_column]
y_test = test_data
x_test = test_data

In [69]:
from sklearn.model_selection import train_test_split
X_train, X_validation, y_train, y_validation = train_test_split(x_train, y_train, test_size=0.2, random_state=1, shuffle=True)

In [41]:
x_train.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1225,1226,1227,1228,1229,1230,1231,1232,1233,1234
surgery,yes,yes,yes,yes,no,no,yes,no,no,yes,...,yes,yes,yes,yes,yes,yes,yes,yes,yes,yes
age,adult,adult,adult,adult,adult,adult,adult,adult,adult,adult,...,adult,young,adult,adult,adult,adult,adult,young,adult,adult
rectal_temp,38.1,37.5,38.3,37.1,38.0,38.1,38.3,39.2,37.4,38.3,...,39.3,36.1,38.5,38.3,38.0,38.5,37.5,37.5,38.1,38.1
pulse,132.0,88.0,120.0,72.0,52.0,56.0,36.0,114.0,48.0,129.0,...,54.0,60.0,48.0,48.0,136.0,129.0,60.0,84.0,70.0,54.0
respiratory_rate,24.0,12.0,28.0,30.0,48.0,32.0,16.0,24.0,12.0,48.0,...,51.0,20.0,16.0,20.0,20.0,48.0,50.0,40.0,16.0,36.0
temp_of_extremities,cool,cool,cool,cold,normal,normal,cool,cool,cool,cool,...,cool,cool,normal,normal,cool,cool,cool,normal,normal,normal
peripheral_pulse,reduced,normal,reduced,reduced,normal,normal,reduced,reduced,reduced,reduced,...,normal,reduced,normal,normal,reduced,reduced,reduced,reduced,reduced,normal
mucous_membrane,dark_cyanotic,pale_cyanotic,pale_pink,pale_pink,normal_pink,bright_pink,normal_pink,pale_cyanotic,normal_pink,pale_pink,...,bright_red,normal_pink,normal_pink,bright_pink,bright_red,pale_pink,pale_cyanotic,normal_pink,bright_red,pale_pink
capillary_refill_time,more_3_sec,more_3_sec,less_3_sec,more_3_sec,less_3_sec,less_3_sec,less_3_sec,more_3_sec,less_3_sec,less_3_sec,...,less_3_sec,less_3_sec,less_3_sec,more_3_sec,less_3_sec,more_3_sec,less_3_sec,less_3_sec,less_3_sec,less_3_sec
pain,depressed,mild_pain,extreme_pain,mild_pain,alert,depressed,severe_pain,mild_pain,alert,depressed,...,depressed,mild_pain,alert,depressed,depressed,depressed,mild_pain,mild_pain,mild_pain,mild_pain


In [42]:
x_test.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,814,815,816,817,818,819,820,821,822,823
surgery,no,yes,yes,no,yes,yes,yes,no,yes,no,...,no,yes,yes,yes,no,no,yes,yes,no,yes
age,adult,adult,adult,adult,adult,adult,adult,young,adult,adult,...,adult,adult,adult,adult,adult,adult,adult,adult,adult,adult
rectal_temp,38.6,38.2,37.7,37.1,38.3,38.5,40.8,40.3,38.3,38.3,...,38.6,38.1,37.7,37.2,39.6,40.3,37.2,39.2,38.3,38.1
pulse,40.0,112.0,66.0,88.0,50.0,104.0,114.0,114.0,66.0,64.0,...,130.0,84.0,88.0,60.0,128.0,114.0,100.0,132.0,54.0,66.0
respiratory_rate,20.0,48.0,12.0,20.0,12.0,36.0,36.0,36.0,12.0,22.0,...,60.0,16.0,14.0,44.0,51.0,36.0,20.0,12.0,66.0,12.0
temp_of_extremities,normal,cool,cool,cool,none,cool,cold,cool,cool,normal,...,cool,cold,cold,cold,cool,cool,cool,cool,normal,cold
peripheral_pulse,normal,reduced,normal,reduced,normal,normal,reduced,reduced,reduced,normal,...,reduced,reduced,absent,reduced,reduced,reduced,reduced,reduced,normal,normal
mucous_membrane,normal_pink,bright_pink,bright_red,pale_cyanotic,bright_pink,bright_red,dark_cyanotic,pale_cyanotic,pale_pink,bright_pink,...,pale_pink,dark_cyanotic,pale_cyanotic,pale_cyanotic,dark_cyanotic,normal_pink,pale_cyanotic,dark_cyanotic,normal_pink,normal_pink
capillary_refill_time,less_3_sec,more_3_sec,less_3_sec,less_3_sec,less_3_sec,more_3_sec,more_3_sec,more_3_sec,less_3_sec,less_3_sec,...,less_3_sec,less_3_sec,more_3_sec,more_3_sec,more_3_sec,more_3_sec,more_3_sec,more_3_sec,less_3_sec,less_3_sec
pain,mild_pain,depressed,mild_pain,depressed,mild_pain,severe_pain,depressed,depressed,mild_pain,depressed,...,depressed,depressed,depressed,depressed,depressed,depressed,extreme_pain,depressed,mild_pain,mild_pain


In [70]:
x_train_visualize = pipeline.fit_transform(x_train_visualize)
x_train = pipeline.fit_transform(x_train)
x_test = pipeline.transform(x_test)
y_test = pipeline.transform(y_test)

all_column_names = preprocessor.get_feature_names_out()
all_column_names = [name[name.find("__") + 2:] for name in all_column_names]

In [None]:
predictions = algorithm_pack[0][0].predict(x_test) #NB
predictions = algorithm_pack[1][0].predict(x_test) #KNN
predictions = algorithm_pack[4][0].predict(x_test) #GB
predictions = algorithm_pack[5][0].predict(x_test) #DT

In [71]:
algorithm_pack[0][0].fit(X_train, y_train)
predictions = algorithm_pack[0][0].predict(x_test) #NB

ValueError: could not convert string to float: 'no'

In [44]:
# algorithm_pack_2 = [[GaussianNB(), 'Naive Bayes', 'NB'],
#                 [KNeighborsClassifier(), 'K Nearest Neighbors', 'KNN'],
#                 # [SVC(), 'Support Vector Machine', 'SVM'], 
#                 [GradientBoostingClassifier(random_state=100, learning_rate=0.15), 'Gradient Boosting', 'GB'],
#                 [DecisionTreeClassifier(random_state=100), 'Decision Tree', 'DT']]

# results=[]
# for i in range (len(algorithm_pack_2)):
#     kfold = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
#     cv_results = cross_val_score(algorithm_pack_2[i][0], x_train, y_train, cv=kfold, scoring='accuracy')
#     results.append(cv_results)
#     print('%s: %f (%f)' % (algorithm_pack_2[i][2], cv_results.mean(), cv_results.std()))

ValueError: Found input variables with inconsistent numbers of samples: [1235, 988]

In [None]:
algorithm_pack_2 = [[GaussianNB(), 'Naive Bayes', 'NB'],
                [KNeighborsClassifier(), 'K Nearest Neighbors', 'KNN'],
                # [SVC(), 'Support Vector Machine', 'SVM'], 
                [GradientBoostingClassifier(random_state=100, learning_rate=0.15), 'Gradient Boosting', 'GB'],
                [DecisionTreeClassifier(random_state=100), 'Decision Tree', 'DT']]


: 

In [None]:
# from sklearn.model_selection import cross_val_score
# from sklearn.model_selection import StratifiedKFold
# from sklearn.metrics import classification_report
# import numpy as np

# algorithm_pack_2 = [[GaussianNB(), 'Naive Bayes', 'NB'],
#                 [KNeighborsClassifier(), 'K Nearest Neighbors', 'KNN'],
#                 # [SVC(), 'Support Vector Machine', 'SVM'], 
#                 [GradientBoostingClassifier(random_state=100, learning_rate=0.15), 'Gradient Boosting', 'GB'],
#                 [DecisionTreeClassifier(random_state=100), 'Decision Tree', 'DT']]



# results = []

# for i in range(len(algorithm_pack_2)):
#     kfold = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
#     cv_results = cross_val_score(algorithm_pack_2[i][0], X_train, y_train, cv=kfold, scoring='accuracy')
#     results.append(cv_results)
#     print('%s: %f (%f)' % (algorithm_pack_2[i][2], cv_results.mean(), cv_results.std()))

# best_model_index = np.argmax(np.mean(results, axis=1))
# best_model_name = algorithm_pack_2[best_model_index][2]
# best_model = algorithm_pack_2[best_model_index][0]
# best_model.fit(X_train, y_train)
# predictions = best_model.predict(X_validation)
# print(f"\nClassification report for the best model which is {best_model_name}:")
# print(classification_report(y_validation, predictions))

: 

In [None]:

# results=[]
# for i in range (len(algorithm_pack_2)):
#     kfold = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
#     cv_results = cross_val_score(algorithm_pack_2[i][0], x_train, y_train, cv=kfold, scoring='accuracy')
#     results.append(cv_results)
#     print('%s: %f (%f)' % (algorithm_pack_2[i][2], cv_results.mean(), cv_results.std()))


: 

In [None]:
print(y_test)

: 

In [None]:
# # algorithm_pack_2[1][0].fit(x_train, y_train)
# # predictions = algorithm_pack_2[1][0].predict(x_test)
# # r2_score(y_test, predictions, force_finite=False)
# for i in range (len(algorithm_pack_2)):
#     algorithm_pack_2[i][0].fit(x_train, y_train)  
#     predictions = algorithm_pack_2[i][0].predict(x_test)
#     print(classification_report(y_test, predictions))

: 

In [None]:
# algorithm_pack[2][0].fit(x_train, y_train)
# predictions = algorithm_pack[2][0].predict(x_test).round()
# print(classification_report(y_test, predictions))

: 

: 

# -- Fungsi untuk output csv, diganti berdasarkan kebutuhan

## Fungsi untuk prediksi bersifat non probabilitas

In [None]:
# def output_one_csv(i):
#     # Algoritma untuk output prediksi kedalam csv
#     algorithm_pack[i][0].fit(x_train, y_train)
#     predictions = algorithm_pack[i][0].predict(x_test)
#     ids = origin_test_data['id']
#     output_df = pd.DataFrame({'id': ids, 'outcome': predictions})
#     output_csv_name = f"predictions_{algorithm_pack[i][2]}_{current_time}.csv"
#     output_df.to_csv(output_csv_name, index=False)
    
#     # Algoritma untuk output data yang dipakai untuk melatih model
#     clean_data_df = pd.DataFrame(x_train)
#     clean_data_df.columns = all_column_names
#     clean_data_csv_name = f"clean_data_{algorithm_pack[i][2]}_{current_time}.csv"
#     clean_data_df.to_csv(clean_data_csv_name, index=False)

#     # Algoritma untuk output dokumentasi seperti permasalahan yang diselesaikan, waktu run code, dll.
#     documentation_txt_name = f"documentation_{algorithm_pack[i][2]}_{current_time}.txt"
#     current_ipynb_file = "PHOoH.ipynb"
#     string_to_write = ""
#     with open(documentation_txt_name, "w") as f:
#         # Write your documentation content to the file
#         string_to_write += f"Problem to solve\t\t: {os.path.basename(os.getcwd())}\n"
#         string_to_write += f"Date and Time file created\t: {get_current_time()}\n"
#         string_to_write += f"Algorithm title\t\t\t: {algorithm_pack[i][1]}\n\n"
#         string_to_write += f"Training data info:\n{train_data.dtypes}\n\n"
#         string_to_write += f"Test data info:\n{test_data.dtypes}\n\n"
#         string_to_write += f"Categorical Features\t: {categorical_features}\n"
#         string_to_write += f"Numerical Features\t: {numerical_features}\n\n"
#         string_to_write += f"For more data details see {clean_data_csv_name} (data that are beeing trained)\n"
#         string_to_write += f"For data preprocessing details see {current_ipynb_file} or self\n"
#         f.write(string_to_write)

#     with zipfile.ZipFile(f"complete_data_{algorithm_pack[i][2]}_{current_time}.zip", "w") as zip_file:
#         # Add the current code into the zip
#         zip_file.write(current_ipynb_file, arcname=current_ipynb_file)
#         # Add the CSV file
#         zip_file.write(output_csv_name, arcname=output_csv_name)  # Specify archive name for the CSV
#         zip_file.write(clean_data_csv_name, arcname=clean_data_csv_name)  # Specify archive name for the CSV
#         # Add the text file
#         zip_file.write(documentation_txt_name, arcname=documentation_txt_name)  # Specify archive name for the text file
#         remove_file(clean_data_csv_name)
#         remove_file(documentation_txt_name)

: 

# -- Fungsi untuk output semua model yang dipakai

In [None]:
# def output_all_csv():
#     for i in range(len(algorithm_pack)):
#         output_one_csv(i)

# def output_mandatory_csv():
#      for i in range(3):
#         output_one_csv(i)

: 

# -- Memudahkan untuk outputnya

In [None]:
# choose = get_integer_option(0, 2, "What do you want to do with the csv?\n0. Train the csv data\n1. Get the info of the csv data\n2. Get the csv of clean preprocessed data\n[0/1/2]: ")

# if(choose == 0):
#     choose = get_integer_option(0, 2, "Use all model?\n0. No\n1. Yes\n2. Mandatory\n[0/1/2]: ")
#     if(choose == 0):
#         choose = get_integer_option(0, len(algorithm_pack) - 1, f"Which model to use?"+algorithm_option_string())
#         output_one_csv(choose)
#     elif(choose == 1):
#         output_all_csv()
#     elif(choose == 2):
#         output_mandatory_csv()
# elif(choose == 1):
#     print("Training data info:\n",train_data.dtypes,"\n")
#     print("Test data info:\n",test_data.dtypes)
#     print("Categorical Features: ",categorical_features)
#     print("Numerical Features: ",numerical_features)
# elif(choose == 2):
#     clean_data_df = pd.DataFrame(x_train)
#     clean_data_df.columns = all_column_names
#     clean_data_df.to_csv('clean_data.csv', index=False)

: 

In [None]:
# Run All

: 