In [2]:
# Import library
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

# -- General-purpose helpers shared by every problem
# TODO -- Tune the GB arguments on the training data so it does not overfit
# Registry of candidate classifiers. Each entry is
# [estimator instance, display name, short code]; the short code is used in
# the names of the prediction CSV files.
# The tree-based estimators are all seeded with random_state=100 (the same
# seed algorithm_pack_2 uses) so that repeated runs are reproducible.
# NOTE(review): max_features=11 presumes the preprocessed data has at least
# 11 columns -- confirm when reusing this on another dataset.
algorithm_pack = [[GaussianNB(), 'Naive Bayes', 'NB'],
                [KNeighborsClassifier(), 'K Nearest Neighbors', 'KNN'],
                [LogisticRegression(), 'Logistic Regression', 'LR'],
                [GradientBoostingClassifier(random_state=100,
                                            n_estimators=100,
                                            learning_rate=0.15,
                                            max_features=11), 'Gradient Boosting', 'GB'],
                [DecisionTreeClassifier(random_state=100), 'Decision Tree', 'DT'],
                [RandomForestClassifier(random_state=100), 'Random Forest','RF']]

# Build the prompt text that lists which algorithms can be selected
def algorithm_option_string(algorithms=None):
    """Return a numbered menu of algorithms followed by an index hint.

    Args:
        algorithms: Optional sequence of entries whose element at index 1 is
            the display name. When omitted, the module-level
            ``algorithm_pack`` is used.

    Returns:
        A string made of one newline-prefixed ``<index>. <name>`` line per
        entry, then a newline and the valid indices joined by ``/`` inside
        square brackets, ending with a colon and a space
        (for two entries the hint reads ``[0/1]: ``).
    """
    if algorithms is None:
        algorithms = algorithm_pack
    menu_lines = [f"\n{index}. {entry[1]}" for index, entry in enumerate(algorithms)]
    # join() also closes the bracket correctly when the list is empty
    index_hint = "/".join(str(index) for index in range(len(algorithms)))
    return "".join(menu_lines) + f"\n[{index_hint}]: "

# Print an error message in red, padded with blank lines
def print_error(string):
    """Print ``string`` wrapped in the ANSI red / reset codes, with a newline before and after."""
    red_prefix = "\n\033[91m"
    reset_suffix = "\033[0m\n"
    print("".join((red_prefix, string, reset_suffix)))

# Prompt until the user enters an integer option inside the allowed range
def get_integer_option(min_value, max_value, prompt):
    """Ask for an integer with ``input(prompt)`` until a valid one is given.

    Args:
        min_value: Smallest accepted value (inclusive).
        max_value: Largest accepted value (inclusive).
        prompt: Text passed to ``input``.

    Returns:
        The first entered integer that lies between ``min_value`` and
        ``max_value``. Out-of-range or non-integer entries are reported with
        ``print_error`` and the prompt is shown again.
    """
    while True:
        try:
            choice = int(input(prompt))
            if not (min_value <= choice <= max_value):
                print_error(f"Option must be between {min_value} and {max_value}")
                continue
            return choice
        except ValueError:
            print_error(f"Invalid option. Please enter a valid integer value between {min_value} and {max_value}")

# Load the training and test sets
train_data, test_data = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

# -- Shared settings, adjusted per problem: the column to predict and the columns to discard
target_column = 'smoking'
dropped_columns = ['id']

# Split the remaining feature columns by dtype so each group can be preprocessed on its own:
# object-typed columns are treated as categorical, every other dtype as numerical
_feature_frame = train_data.drop(columns=[*dropped_columns, target_column])
categorical_features = _feature_frame.select_dtypes(include=['object']).columns.tolist()
numerical_features = _feature_frame.select_dtypes(exclude=['object']).columns.tolist()

# Transformation applied to the non-ordinal categorical features
categorical_transformer = Pipeline(steps=[
    # Fill missing values with the mode (most frequent value) of the column.
    # If this imputer step is removed, missing ("unknown") values are used as a feature of their own.
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # After imputation, one-hot encode the categories into binary (numerical) columns.
    # handle_unknown='ignore' encodes a category that was not seen during fit
    # (e.g. one that only occurs in test.csv) as all zeros instead of raising in transform().
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore').set_output(transform='pandas'))
])

# Transformation applied to the non-ordinal numerical features
numerical_imputer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))               # Fill missing values with the column median
])

# Combine both transformations into a single step
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),
        ('num', numerical_imputer, numerical_features)
    ])

# Wrap the preprocessing in one pipeline so train and test data go through exactly the same fitted steps
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# -- Problem-specific functions



# Clean the data for training
from sklearn.model_selection import train_test_split

# Separate the label from the training features, discarding the columns the team judged irrelevant to the task (train.csv)
y_train = train_data[target_column]
x_train = train_data.drop(columns=[*dropped_columns, target_column])

# Discard the irrelevant columns from the test set as well (test.csv)
x_test = test_data.drop(columns=dropped_columns)

# -- Problem-specific: any further cleaning of x_train / x_test that is considered correct goes here



# Fit the preprocessing pipeline on the raw training features and convert them into the data to train on (train.csv)
transformed_X_train = pipeline.fit_transform(x_train)

# Apply the fitted pipeline to the raw test features to get the data to predict on (test.csv)
transformed_X_test = pipeline.transform(x_test)

# -- General helpers, but they must be adapted to the output format the task asks for
# The functions below only exist to make producing the output easier

def output_one_csv(i):
    """Fit the i-th algorithm and write its test-set predictions to a CSV file.

    The estimator at ``algorithm_pack[i]`` is fitted on
    ``transformed_X_train`` / ``y_train``. The column at index 1 of its
    ``predict_proba`` output for ``transformed_X_test`` is rounded to one
    decimal and saved, next to the test ids, as
    ``predictions_<short code>.csv`` with columns ``id`` and ``smoking``.

    Args:
        i: Position of the algorithm inside ``algorithm_pack``.
    """
    entry = algorithm_pack[i]
    model = entry[0]
    model.fit(transformed_X_train, y_train)

    # -- Probability-style prediction: keep only the column at index 1
    class_one_probabilities = model.predict_proba(transformed_X_test)[:, 1]
    rounded = [round(probability, 1) for probability in class_one_probabilities]

    # Pair the id (unique value) of every test row with its rounded probability,
    # in the layout the task asks for
    submission = pd.DataFrame({'id': test_data['id'], 'smoking': rounded})
    # One CSV per algorithm, named after the algorithm's short code
    submission.to_csv(f"predictions_{entry[2]}.csv", index=False)

def output_all_csv():
    """Call ``output_one_csv`` for every entry of ``algorithm_pack``, in order."""
    for position, _ in enumerate(algorithm_pack):
        output_one_csv(position)

# Choose what you want to do
# choose = get_integer_option(0, 2, "What do you want to do with the csv?\n0. Train the csv data\n1. Get the info of the csv data\n2. Get the csv of clean preprocessed data\n[0/1/2]: ")

# if(choose == 0):
#     choose = get_integer_option(0, 1, "Use all model?\n0. No\n1. Yes\n[0/1]: ")
#     if(choose == 0):
#         choose = get_integer_option(0, len(algorithm_pack) - 1, f"Which model to use?"+algorithm_option_string())
#         output_one_csv(choose)
#     elif(choose == 1):
#         output_all_csv()
# elif(choose == 1):
#     print("Training data info:\n",train_data.dtypes,"\n")
#     print("Test data info:\n",test_data.dtypes)
#     print("Categorical Features: ",categorical_features)
#     print("Numerical Features: ",numerical_features)
# elif(choose == 2):
#     if (len(categorical_features) > 0):
#         encoded_categorical_columns = pipeline.named_steps['preprocessor'].transformers_[0][1].named_steps['onehot'].get_feature_names_out(categorical_features)
#         all_column_names = list(encoded_categorical_columns) + numerical_features
#     else:
#         all_column_names = numerical_features

#     clean_data_df = pd.DataFrame(transformed_X_train)
#     clean_data_df.columns = all_column_names
#     clean_data_df.to_csv('clean_data.csv', index=False)


In [2]:
# Hold out 20% of the training rows for validation.
# The already preprocessed features (transformed_X_train) are split directly, so
# x_train is not overwritten and the pipeline is not re-fitted; re-running this
# cell therefore gives the same result.
# The split labels get their own name (y_train_split) so that the full-length
# y_train stays aligned with transformed_X_train for code that fits on the
# whole training set.
X_train, X_validation, y_train_split, y_validation = train_test_split(
    transformed_X_train, y_train, test_size=0.2, random_state=1, shuffle=True)

In [3]:
# Smaller set of candidates; each entry is [estimator instance, display name, short code]
algorithm_pack_2 = [
    [GaussianNB(), 'Naive Bayes', 'NB'],
    [KNeighborsClassifier(), 'K Nearest Neighbors', 'KNN'],
    # [SVC(), 'Support Vector Machine', 'SVM'],
    [GradientBoostingClassifier(learning_rate=0.15, random_state=100), 'Gradient Boosting', 'GB'],
    [DecisionTreeClassifier(random_state=100), 'Decision Tree', 'DT'],
]

# results=[]
# for i in range (len(algorithm_pack_2)):
#     kfold = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
#     cv_results = cross_val_score(algorithm_pack_2[i][0], transformed_X_train, y_train, cv=kfold, scoring='accuracy')
#     results.append(cv_results)
#     print('%s: %f (%f)' % (algorithm_pack_2[i][2], cv_results.mean(), cv_results.std()))

In [None]:
# Fit NB, KNN, GB and DT on the full training set and predict labels for the
# preprocessed test features (the estimators were trained on pipeline output,
# so they must be given transformed_X_test, not the raw x_test).
# Models are looked up by short code rather than by position, because
# positions 4 and 5 of algorithm_pack hold DT and RF, not GB and DT.
# Labels are read straight from train_data so there is always exactly one
# label per row of transformed_X_train.
models_by_code = {entry[2]: entry[0] for entry in algorithm_pack}
full_train_labels = train_data[target_column]
predictions_by_code = {}
for code in ('NB', 'KNN', 'GB', 'DT'):
    fitted_model = models_by_code[code].fit(transformed_X_train, full_train_labels)
    predictions_by_code[code] = fitted_model.predict(transformed_X_test)
# `predictions` keeps holding the result of the last model in the list (DT)
predictions = predictions_by_code['DT']

In [6]:
# Train the model at position 5 of algorithm_pack (Random Forest) on the full
# training set, then predict on the preprocessed test features.
# Labels are read from train_data so they always match the row count of
# transformed_X_train; the prediction input is transformed_X_test because the
# model is fitted on pipeline output, not on the raw x_test frame.
rf_model = algorithm_pack[5][0]
rf_model.fit(transformed_X_train, train_data[target_column])
predictions = rf_model.predict(transformed_X_test)  # RF



In [4]:
# from sklearn.model_selection import cross_val_score
# from sklearn.model_selection import StratifiedKFold
# from sklearn.metrics import classification_report
# import numpy as np

# results = []

# for i in range(len(algorithm_pack_2)):
#     kfold = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
#     cv_results = cross_val_score(algorithm_pack_2[i][0], X_train, y_train, cv=kfold, scoring='accuracy')
#     results.append(cv_results)
#     print('%s: %f (%f)' % (algorithm_pack_2[i][2], cv_results.mean(), cv_results.std()))

# best_model_index = np.argmax(np.mean(results, axis=1))
# best_model_name = algorithm_pack_2[best_model_index][2]
# best_model = algorithm_pack_2[best_model_index][0]
# best_model.fit(X_train, y_train)
# predictions = best_model.predict(X_validation)
# print(f"\nClassification report for the best model which is {best_model_name}:")
# print(classification_report(y_validation, predictions))

NB: 0.722403 (0.004312)


KeyboardInterrupt: 