# Import OptiAlgo

In [1]:
import pandas as pd
import numpy as np
from optialgo.classification import Classification
from optialgo.regression import Regression

# Klasifikasi

In [2]:
# Load the drug dataset used for the classification walkthrough and preview it.
df = pd.read_csv(
    'dataset_ex/drug200.csv',
)
df

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,DrugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,DrugY
...,...,...,...,...,...,...
195,56,F,LOW,HIGH,11.567,drugC
196,16,M,LOW,HIGH,12.006,drugC
197,52,M,NORMAL,HIGH,9.894,drugX
198,23,M,NORMAL,NORMAL,14.020,drugX


## Inisiasi Fitur dan Target

In [3]:
# Label column, followed by the predictor columns handed to the classifier.
target = 'Drug'
features = [
    'Age',
    'Sex',
    'BP',
    'Cholesterol',
    'Na_to_K',
]

## Inisiasi OptiAlgo

In [4]:
# Create the OptiAlgo classification helper; data is attached later via fit().
clf = Classification()

In [5]:
# Attach the dataset to the classifier helper. As the cell output shows, fit()
# prints a class-imbalance warning for this data and returns the helper itself.
clf.fit(
    data=df,
    target=target,
    features=features,
)


            The ['drugC', 'drugB'] class has an imbalance of [0.08 0.08]
            
Consider handling class imbalance. 
            


<optialgo.classification.Classification at 0x7fbea44b9f10>

## Membagi data menjadi data latih dan data uji

In [6]:
# Hold out 30% of the rows for evaluation (train_size=0.7).
X_train, X_test, y_train, y_test = clf.split_data(
    train_size=0.7,
)

## Mencari Algoritma Terbaik

In [7]:
# Score every available algorithm on the explicit split above. With
# train_val=True the returned table carries both train and validation metrics,
# which makes overfitting (perfect train score, weaker validation) easy to spot.
clf.compare_model(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    output='dataframe',
    train_val=True,
)

Naive Bayes is run ...
K-Nearest Neighbor is run ...
SVM is run ...
Logistic Regression is run ...
Random Forest is run ...
Decision Tree Classifier is run ...
XGBoost is run ...
Gradient Boosting is run ...


Unnamed: 0,accuracy_train,acc_val,precision_train,precision_val,recall_train,recall_val,f1_train,f1_val
Naive Bayes,0.457143,0.45,0.20898,0.2025,0.457143,0.45,0.286835,0.27931
K-Nearest Neighbor,0.742857,0.6,0.750427,0.554598,0.742857,0.6,0.743124,0.576296
SVM,0.707143,0.716667,0.569949,0.539062,0.707143,0.716667,0.617174,0.608595
Logistic Regression,0.678571,0.65,0.519546,0.467509,0.678571,0.65,0.58435,0.54379
Random Forest,1.0,0.666667,1.0,0.614726,1.0,0.666667,1.0,0.62973
Decision Tree Classifier,1.0,0.733333,1.0,0.699484,1.0,0.733333,1.0,0.711263
XGBoost,1.0,0.683333,1.0,0.662573,1.0,0.683333,1.0,0.671612
Gradient Boosting,1.0,0.65,1.0,0.638929,1.0,0.65,1.0,0.6436


### Menggunakan Cross Validation

In [8]:
# No split is passed here; per the section heading this variant evaluates with
# cross-validation (the result table gains fit_time / score_time columns).
clf.compare_model(output='dataframe')

Naive Bayes is run ...
K-Nearest Neighbor is run ...
SVM is run ...
Logistic Regression is run ...
Random Forest is run ...
Decision Tree Classifier is run ...
XGBoost is run ...
Gradient Boosting is run ...


Unnamed: 0,fit_time,score_time,accuracy,recall,precision,f1
Naive Bayes,0.001344,0.000776,0.46,0.46,0.46,0.46
K-Nearest Neighbor,0.000521,0.002943,0.575,0.595,0.595,0.595
SVM,0.001629,0.000738,0.705,0.7,0.7,0.7
Logistic Regression,0.004531,0.000507,0.675,0.67,0.67,0.67
Random Forest,0.09847,0.004207,0.66,0.625,0.64,0.635
Decision Tree Classifier,0.00065,0.000512,0.63,0.62,0.625,0.625
XGBoost,0.074847,0.00184,0.62,0.615,0.615,0.615
Gradient Boosting,0.388461,0.001623,0.615,0.61,0.61,0.61


### Hanya Menampilkan Akurasi

In [9]:
# Same comparison, but only the accuracy per algorithm is returned, as a dict.
clf.compare_model(output='only_accuracy')

Naive Bayes is run ...
K-Nearest Neighbor is run ...
SVM is run ...
Logistic Regression is run ...
Random Forest is run ...
Decision Tree Classifier is run ...
XGBoost is run ...
Gradient Boosting is run ...


{'Naive Bayes': 0.46,
 'K-Nearest Neighbor': 0.57,
 'SVM': 0.7,
 'Logistic Regression': 0.68,
 'Random Forest': 0.64,
 'Decision Tree Classifier': 0.63,
 'XGBoost': 0.62,
 'Gradient Boosting': 0.61}

## Membangun Model

### Menampilkan Algoritma yang tersedia

In [10]:
# Accessed without call parentheses; displays the names accepted by
# set_model(algo_name=...).
clf.get_list_models

['Naive Bayes',
 'K-Nearest Neighbor',
 'SVM',
 'Logistic Regression',
 'Random Forest',
 'Decision Tree Classifier',
 'XGBoost',
 'Gradient Boosting']

### Set Model

In [11]:
# Select K-Nearest Neighbor as the working model, using the training split.
clf.set_model(
    X_train=X_train,
    y_train=y_train,
    algo_name="K-Nearest Neighbor",
)

<optialgo.classification.Classification at 0x7fbea44b9f10>

## HyperParameter

### Mencari Parameter terbaik

In [12]:
# Hyperparameters of the currently selected model. Nothing has been tuned yet,
# so these are the defaults (e.g. n_neighbors=5 in the output below).
clf.get_params_from_model

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [13]:
# Candidate neighbour counts 1..7 for the hyperparameter search.
param_grid = {'n_neighbors': list(range(1, 8))}
clf.find_best_params(param_grid)

{'best_params': {'n_neighbors': 3}, 'score': 0.6050000000000001}

### Tuning

In [14]:
# Apply the winning value reported by the search above (n_neighbors=3).
param_tuned = dict(n_neighbors=3)
clf.tuning(param_tuned)

<optialgo.classification.Classification at 0x7fbea44b9f10>

In [15]:
# Re-inspect the model parameters to confirm the tuned value took effect.
clf.get_params_from_model

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 3,
 'p': 2,
 'weights': 'uniform'}

## Prediksi

In [16]:
# Predict on the held-out rows; with output='dataframe' the result shows the
# feature columns next to a `pred` column.
clf.predict(
    X_test,
    output='dataframe',
)

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,pred
0,0.694915,1.0,0.0,0.0,0.598099,DrugY
1,0.983051,1.0,0.0,0.0,0.377728,DrugY
2,0.050847,1.0,0.0,0.0,0.077585,drugX
3,0.966102,1.0,0.0,0.0,0.261836,drugX
4,0.254237,1.0,0.0,0.0,0.130527,drugX
5,0.186441,1.0,0.0,0.0,0.246763,drugX
6,0.389831,1.0,0.0,0.0,0.738195,DrugY
7,0.881356,0.0,1.0,1.0,0.101476,drugB
8,0.779661,0.0,1.0,1.0,0.099256,drugX
9,0.050847,1.0,0.0,0.0,0.966883,DrugY


# Regresi

In [17]:
# Load the housing dataset for the regression walkthrough and preview it.
# NOTE: this rebinds `df`, which previously held the drug dataset.
df = pd.read_csv(
    'dataset_ex/Housing.csv',
)
df

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,,2,3,yes,no,no,no,yes,2,yes,
1,12250000,8960,4.0,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3.0,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4.0,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4.0,1,2,yes,yes,yes,no,yes,2,no,furnished
...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000,2.0,1,1,yes,no,yes,no,no,2,no,unfurnished
541,1767150,2400,3.0,1,1,no,no,no,no,no,0,no,semi-furnished
542,1750000,3620,2.0,1,1,yes,no,no,no,no,0,no,unfurnished
543,1750000,2910,3.0,1,1,no,no,no,no,no,0,no,furnished


## Inisiasi Fitur dan Target

In [18]:
# Regression target, then every remaining column as a predictor.
# Selecting by name (instead of slicing off position 0) keeps the feature list
# correct even if the column order of the CSV changes.
target = 'price'
features = [column for column in df.columns if column != target]

## Inisiasi OptiAlgo

In [19]:
# Create the OptiAlgo regression helper and try to attach the raw data.
reg = Regression()

# fit() rejects data containing missing values by raising ValueError. The error
# is the point of this cell, so catch it and display it instead of letting it
# abort a "Restart Kernel -> Run All" execution of the notebook.
try:
    reg.fit(data=df, features=features, target=target)
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: Missing Value in {'bedrooms': 1, 'furnishingstatus': 1}

Di dalam OptiAlgo, data tidak boleh mengandung missing value. Namun, jangan khawatir!
OptiAlgo menyediakan fitur untuk menangani missing values.

In [20]:
# Let OptiAlgo fill the missing cells and keep the result under a new name so
# the raw frame stays available. In the output below, row 0 now has
# bedrooms=3.0 and furnishingstatus='semi-furnished'; the fill strategy itself
# is internal to optialgo.
df_clean = reg.handling_missing_values(df)
df_clean

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,3.0,2,3,yes,no,no,no,yes,2,yes,semi-furnished
1,12250000,8960,4.0,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3.0,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4.0,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4.0,1,2,yes,yes,yes,no,yes,2,no,furnished
...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000,2.0,1,1,yes,no,yes,no,no,2,no,unfurnished
541,1767150,2400,3.0,1,1,no,no,no,no,no,0,no,semi-furnished
542,1750000,3620,2.0,1,1,yes,no,no,no,no,0,no,unfurnished
543,1750000,2910,3.0,1,1,no,no,no,no,no,0,no,furnished


Sekarang data Anda sudah bersih dari missing values!

In [21]:
# Retry fit() with the cleaned frame; this time it succeeds.
reg.fit(
    data=df_clean,
    features=features,
    target=target,
)

<optialgo.regression.Regression at 0x7fbea44c3cd0>

## Membagi data menjadi data latih dan data uji

In [22]:
# Hold out 25% of the rows for evaluation (train_size=0.75).
# NOTE: this rebinds the split variables used in the classification section.
X_train, X_test, y_train, y_test = reg.split_data(
    train_size=0.75,
)

## Mencari Algoritma Terbaik

In [23]:
# Score every available regressor on the explicit split. With train_val=True
# the table reports train and validation errors side by side, plus a
# difference_mape column.
reg.compare_model(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    output='dataframe',
    train_val=True,
)

Linear Regression is run ...
SVR is run ...
K-Neighbors Regressor is run ...
Random Forest Regressor is run ...
Decision Tree Regressor is run ...
XGBoost Regressor is run ...
GradientBoosting Regressor is run ...


Unnamed: 0,mae_train,mae_val,mse_train,mse_val,mape_train,mape_val,difference_mape
Linear Regression,845654.6,1052865.0,1325097000000.0,2005149000000.0,0.189011,0.252891,-6.38801
SVR,1312762.0,1659450.0,3285492000000.0,4831772000000.0,0.276244,0.3733,-9.705597
K-Neighbors Regressor,742785.4,1068630.0,1092181000000.0,2236335000000.0,0.164096,0.249603,-8.550697
Random Forest Regressor,354808.0,1164328.0,240872700000.0,2757883000000.0,0.079921,0.273015,-19.309363
Decision Tree Regressor,54284.31,1442901.0,61170620000.0,4597268000000.0,0.012968,0.32401,-31.104195
XGBoost Regressor,130761.1,1234672.0,75292850000.0,2933075000000.0,0.031377,0.283606,-25.222893
GradientBoosting Regressor,597161.1,1133305.0,627418300000.0,2413456000000.0,0.137897,0.268301,-13.040421


### Menggunakan Cross Validation

In [24]:
# No split is passed here; per the section heading this variant evaluates with
# cross-validation (the result table gains fit_time / score_time columns).
reg.compare_model(output='dataframe')

Linear Regression is run ...
SVR is run ...
K-Neighbors Regressor is run ...
Random Forest Regressor is run ...
Decision Tree Regressor is run ...
XGBoost Regressor is run ...
GradientBoosting Regressor is run ...


Unnamed: 0,fit_time,score_time,mean_absolute_percentage_error,mean_squared_error,mean_absolute_error,root_mean_squared_error
Linear Regression,0.001236,0.000722,0.206469,2612947000000.0,1205205.0,1455557.0
SVR,0.011594,0.002956,0.301818,4903557000000.0,1736836.0,1821774.0
K-Neighbors Regressor,0.000728,0.001529,0.212477,3254028000000.0,1375879.0,1630304.0
Random Forest Regressor,0.154156,0.005904,0.218378,3288054000000.0,1419913.0,1686784.0
Decision Tree Regressor,0.001456,0.000492,0.269392,4486609000000.0,1674975.0,2008750.0
XGBoost Regressor,0.10717,0.005522,0.225517,3493089000000.0,1449336.0,1751516.0
GradientBoosting Regressor,0.06003,0.000691,0.206892,2972908000000.0,1313886.0,1541976.0


### Hanya Menampilkan Mean Absolute Percentage Error (MAPE)

In [25]:
# Return only the MAPE (mean absolute percentage error) per algorithm, as a dict.
reg.compare_model(output='only_mape')

Linear Regression is run ...
SVR is run ...
K-Neighbors Regressor is run ...
Random Forest Regressor is run ...
Decision Tree Regressor is run ...
XGBoost Regressor is run ...
GradientBoosting Regressor is run ...


{'Linear Regression': 0.21,
 'SVR': 0.3,
 'K-Neighbors Regressor': 0.21,
 'Random Forest Regressor': 0.22,
 'Decision Tree Regressor': 0.26,
 'XGBoost Regressor': 0.23,
 'GradientBoosting Regressor': 0.21}

## Membangun Model

### Menampilkan Algoritma yang tersedia

In [26]:
# Accessed without call parentheses; displays the names accepted by
# set_model(algo_name=...).
reg.get_list_models

['Linear Regression',
 'SVR',
 'K-Neighbors Regressor',
 'Random Forest Regressor',
 'Decision Tree Regressor',
 'XGBoost Regressor',
 'GradientBoosting Regressor']

### Set Model

In [27]:
# Select the K-Neighbors regressor as the working model, using the training split.
reg.set_model(
    X_train=X_train,
    y_train=y_train,
    algo_name="K-Neighbors Regressor",
)

<optialgo.regression.Regression at 0x7fbea44c3cd0>

## HyperParameter

### Mencari Parameter terbaik

In [28]:
# Hyperparameters of the currently selected model. Nothing has been tuned yet,
# so these are the defaults (e.g. n_neighbors=5 in the output below).
reg.get_params_from_model

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [29]:
# Search neighbour counts 1..7 and keep the result so the tuning step can
# reuse it instead of hard-coding the winner.
param_grid = {'n_neighbors': list(range(1, 8))}
best_params = reg.find_best_params(
    param_grid=param_grid,
)
best_params

{'best_params': {'n_neighbors': 6}, 'score': 0.2973024722660725}

### Tuning

In [30]:
# Apply the best parameter set found by the search; 'best_params' is the key of
# the dict returned by find_best_params (the other key is 'score').
reg.tuning(best_params['best_params'])

<optialgo.regression.Regression at 0x7fbea44c3cd0>

## Prediksi

In [31]:
# Predict prices for the held-out rows; without an `output` argument the result
# is displayed as a plain NumPy array.
reg.predict(X_test)

array([4361000.        , 6427166.66666667, 3477833.33333333,
       4362166.66666667, 4043083.33333333, 5026000.        ,
       5265166.66666667, 4754166.66666667, 2852500.        ,
       2794166.66666667, 7961823.33333333, 3553666.66666667,
       3336666.66666667, 3249166.66666667, 3714666.66666667,
       4466000.        , 3970166.66666667, 4965333.33333333,
       4662000.        , 3739166.66666667, 5576666.66666667,
       5026000.        , 2934166.66666667, 4532500.        ,
       4922166.66666667, 6084166.66666667, 2758000.        ,
       4165000.        , 6179833.33333333, 3970166.66666667,
       4165000.        , 3948000.        , 6816833.33333333,
       5057500.        , 4405333.33333333, 7385000.        ,
       5146166.66666667, 3736250.        , 3120833.33333333,
       4165000.        , 5231333.33333333, 2934166.66666667,
       6427166.66666667, 4532500.        , 3739166.66666667,
       3958500.        , 7133000.        , 4368000.        ,
       3995833.33333333,

# Preprocessing Data

In [32]:
# Load the baldness dataset and derive a binary label from the probability
# column: 1 when botak_prob is above 0.5, otherwise 0.
df_ = pd.read_csv('dataset_ex/Data Kebotakan.csv')

# Vectorized comparison instead of a per-row Python lambda; same result, int64.
# NOTE(review): a missing botak_prob compares as False and is therefore
# labeled 0 (the original lambda behaved the same way). Confirm that rows with
# an unknown probability should really count as "not bald" rather than be dropped.
df_['is_botak'] = (df_['botak_prob'] > 0.5).astype('int64')
df_

Unnamed: 0,umur,jenis_kelamin,pekerjaan,provinsi,gaji,is_menikah,is_keturunan,berat,tinggi,sampo,is_merokok,pendidikan,stress,botak_prob,is_botak
0,27.0,Perempuan,PNS,Bengkulu,7.957453e+06,1.0,0.0,54.315053,170.428542,Pantone,1.0,S1,5.0,0.605974,1
1,53.0,Perempuan,PNS,Bandung,7.633003e+06,1.0,0.0,72.873404,165.530097,Pantone,0.0,S1,7.0,0.532860,1
2,37.0,Perempuan,Pegawai swasta,Bandung,6.637625e+06,1.0,0.0,46.321533,154.599388,Moonsilk,0.0,S1,4.0,0.418442,0
3,36.0,Perempuan,Pengangguran,Palu,3.624871e+06,1.0,0.0,51.539781,167.340481,Deadbuoy,1.0,SD,9.0,0.804050,1
4,38.0,Laki-laki,Freelance,Palangkaraya,6.031808e+06,1.0,0.0,60.726909,165.514773,Merpati,1.0,S2,1.0,0.368371,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7912,32.0,Laki-laki,Pegawai swasta,Yogyakarta,6.024409e+06,1.0,0.0,44.432438,154.578859,Deadbuoy,0.0,SMA,9.0,0.471229,0
7913,34.0,Laki-laki,Pegawai swasta,Manado,1.007043e+07,1.0,0.0,49.389914,158.782726,Shoulder & Head,1.0,S1,9.0,0.543821,1
7914,58.0,Laki-laki,Freelance,Mamuju,8.010815e+06,1.0,0.0,54.351968,154.478087,Deadbuoy,1.0,S2,6.0,0.643453,1
7915,30.0,,PNS,Palu,9.059906e+06,1.0,0.0,57.646930,163.377717,Deadbuoy,0.0,S1,10.0,0.540056,1


In [33]:
# Label column, then every column except the last two as predictors. The last
# two are botak_prob (the label is derived from it) and is_botak (the label).
target = 'is_botak'
features = df_.columns[:-2].tolist()

## Handling Missing Values

In [34]:
# Count missing values per column before cleaning.
df_.isna().sum()

umur              85
jenis_kelamin     77
pekerjaan        157
provinsi          86
gaji              74
is_menikah        72
is_keturunan      89
berat             56
tinggi            74
sampo             59
is_merokok        70
pendidikan        70
stress            64
botak_prob        79
is_botak           0
dtype: int64

In [35]:
# Start from a fresh classification helper (rebinds `clf` from the first section).
clf = Classification()
# Fill the missing cells; the result replaces the `df_clean` from the
# regression section. The fill strategy is internal to optialgo.
df_clean = clf.handling_missing_values(df_)
df_clean

Unnamed: 0,umur,jenis_kelamin,pekerjaan,provinsi,gaji,is_menikah,is_keturunan,berat,tinggi,sampo,is_merokok,pendidikan,stress,botak_prob,is_botak
0,27.0,Perempuan,PNS,Bengkulu,7.957453e+06,1.0,0.0,54.315053,170.428542,Pantone,1.0,S1,5.0,0.605974,1
1,53.0,Perempuan,PNS,Bandung,7.633003e+06,1.0,0.0,72.873404,165.530097,Pantone,0.0,S1,7.0,0.532860,1
2,37.0,Perempuan,Pegawai swasta,Bandung,6.637625e+06,1.0,0.0,46.321533,154.599388,Moonsilk,0.0,S1,4.0,0.418442,0
3,36.0,Perempuan,Pengangguran,Palu,3.624871e+06,1.0,0.0,51.539781,167.340481,Deadbuoy,1.0,SD,9.0,0.804050,1
4,38.0,Laki-laki,Freelance,Palangkaraya,6.031808e+06,1.0,0.0,60.726909,165.514773,Merpati,1.0,S2,1.0,0.368371,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7912,32.0,Laki-laki,Pegawai swasta,Yogyakarta,6.024409e+06,1.0,0.0,44.432438,154.578859,Deadbuoy,0.0,SMA,9.0,0.471229,0
7913,34.0,Laki-laki,Pegawai swasta,Manado,1.007043e+07,1.0,0.0,49.389914,158.782726,Shoulder & Head,1.0,S1,9.0,0.543821,1
7914,58.0,Laki-laki,Freelance,Mamuju,8.010815e+06,1.0,0.0,54.351968,154.478087,Deadbuoy,1.0,S2,6.0,0.643453,1
7915,30.0,Laki-laki,PNS,Palu,9.059906e+06,1.0,0.0,57.646930,163.377717,Deadbuoy,0.0,S1,10.0,0.540056,1


In [36]:
# Verify the cleaning step: every column should now report zero missing values.
df_clean.isna().sum()

umur             0
jenis_kelamin    0
pekerjaan        0
provinsi         0
gaji             0
is_menikah       0
is_keturunan     0
berat            0
tinggi           0
sampo            0
is_merokok       0
pendidikan       0
stress           0
botak_prob       0
is_botak         0
dtype: int64

## Mengatasi Data yang tidak seimbang

In [37]:
# Attach the cleaned data; fit() prints a warning when it detects class
# imbalance, as it does for this dataset.
clf.fit(
    data=df_clean,
    features=features,
    target=target,
)


            The [0] class has an imbalance of [0.35531136]
            
Consider handling class imbalance. 
            


<optialgo.classification.Classification at 0x7fbe9e53aa10>

Dapat dilihat pada output di atas, data yang dimasukkan tidak seimbang. Di OptiAlgo ada fitur untuk menyeimbangkan data menggunakan metode sampling.

In [38]:
# Split before resampling so that only the training portion is oversampled.
# NOTE(review): train_size=0.2 keeps just 20% of the rows for training, unlike
# the 0.7 / 0.75 used in the earlier sections. Confirm this is intended and not
# a mix-up with a test-size fraction.
X_train,X_test,y_train,y_test = clf.split_data(train_size=0.2)

In [39]:
# Rebalance the training split with SMOTE; the test split is left untouched.
X_train, y_train = clf.sampling(
    method='SMOTE',
    X=X_train,
    y=y_train,
)

In [40]:
# Count the samples per class after resampling; equal counts mean the training
# set is balanced.
np.unique(y_train,return_counts=True)

(array([0, 1]), array([1021, 1021]))

Nah, data latih sudah memiliki kelas yang seimbang dan siap digunakan!