In [1]:
#import pandas sebagai library pengolahan data
import pandas as pd

# load berbagai jenis algoritma untuk kasus regresi 
from sklearn.linear_model import LinearRegression, PassiveAggressiveRegressor, Perceptron
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor

# load fungsi split dataset
from sklearn.model_selection import train_test_split

#import metric yang digunakan pada kaggle
from sklearn.metrics import mean_absolute_error

## Exploratory Data

In [2]:
# Load the provided training and test datasets (fetched over HTTP from GitHub).
train_df, test_df = (
    pd.read_csv(csv_url)
    for csv_url in (
        'https://github.com/audit-ti/pjj-pengolahan-data-python/raw/main/asset/train.csv',
        'https://github.com/audit-ti/pjj-pengolahan-data-python/raw/main/asset/test.csv',
    )
)

# Preview the first rows of the training dataset.
train_df.head(n=5)

Unnamed: 0,id,bond_id,trade_price,weight,current_coupon,time_to_maturity,is_callable,reporting_delay,trade_size,trade_type,...,received_time_diff_last9,trade_price_last9,trade_size_last9,trade_type_last9,curve_based_price_last9,received_time_diff_last10,trade_price_last10,trade_size_last10,trade_type_last10,curve_based_price_last10
0,1,62,108.692,0.478393,4.5,2.365027,0,2.007,75000,3,...,,,,,,,,,,
1,2,62,108.266,3.659075,4.5,2.362512,0,5.017,35000,2,...,,,,,,,,,,
2,3,62,108.266,0.012991,4.5,2.362512,0,5.123,35000,4,...,,,,,,,,,,
3,4,62,108.266,0.050315,4.5,2.362511,0,19.889,35000,4,...,,,,,,,,,,
4,5,62,107.902,3.495358,4.5,2.360216,0,7.109,25000,2,...,,,,,,,,,,


In [3]:
# Preview the first rows of the test dataset.
test_df.head(n=5)

Unnamed: 0,id,weight,current_coupon,time_to_maturity,is_callable,reporting_delay,trade_size,trade_type,curve_based_price,received_time_diff_last1,...,received_time_diff_last9,trade_price_last9,trade_size_last9,trade_type_last9,curve_based_price_last9,received_time_diff_last10,trade_price_last10,trade_size_last10,trade_type_last10,curve_based_price_last10
0,108408,0.321914,6.0,3.521154,0,22.425,100000,3,100.913156,613,...,2365.0,101.254,240000.0,3.0,100.882173,2365.0,101.004,240000.0,4.0,100.882173
1,108409,0.012991,3.2,3.241684,0,9.337,5000,4,105.798081,0,...,78903.0,106.827,25000.0,3.0,105.703122,80701.0,105.848,100000.0,4.0,105.669891
2,108410,0.50583,5.5,9.195348,1,14.742,12000,3,93.658147,1515,...,2275.0,94.069,7000.0,4.0,93.698056,2290.0,95.762,7000.0,3.0,93.698056
3,108411,0.181879,4.95,1.148871,0,149.71,80000,2,104.318609,195,...,4973.0,103.181,5000.0,2.0,104.388942,7129.0,104.516,20000.0,4.0,104.387958
4,108412,3.451701,4.75,8.132487,0,3.612,20000,3,108.196294,70591,...,1020365.0,106.29,15000.0,4.0,108.481265,1020365.0,106.29,15000.0,3.0,108.481265


## Data Preprocessing

In [4]:
# Impute every missing value with 0 in both the training and the test frame.
train_df, test_df = train_df.fillna(value=0), test_df.fillna(value=0)

In [5]:
# Select predictors and target by column NAME rather than by position.
# The train and test files have different leading columns
# (train: id, bond_id, trade_price, weight, ...; test: id, weight, ...),
# so positional slicing silently misaligns as soon as a column is added.

# Columns of the training file that are identifiers or the label, not predictors.
non_feature_cols = ['id', 'bond_id', 'trade_price']

# Training predictors; 'weight' is intentionally kept here because the
# splitting cell extracts it from x_train afterwards.
x_train = train_df.drop(columns=non_feature_cols)

# Training label.
y_train = train_df['trade_price']

# Predictors for the test dataset: exactly the model features (training
# predictors minus 'weight'), in the same order. Any extra column present in
# test_df is ignored, and a missing feature raises a KeyError.
feature_cols = x_train.columns.drop('weight')
x_test = test_df[feature_cols]

### Splitting Dataset
Pendekatan yang digunakan adalah hold-out. Jika menggunakan cross-validation (CV), pemisahan ini tidak diperlukan karena CV sudah membagi data evaluasi di dalamnya.

In [6]:
# Hold-out split of predictors, label and sample weights in a single call.
# train_test_split applies the same row partition to every array passed in,
# so weight_train / weight_eval line up with x_train / x_eval row by row.
# The 'weight' column is only used as sample_weight when scoring predictions,
# so it is removed from the predictors.
# All inputs are read before any name is reassigned: if this cell is re-run
# after 'weight' has already been dropped, it raises a KeyError without
# touching the existing split.
x_train, x_eval, y_train, y_eval, weight_train, weight_eval = train_test_split(
    x_train.drop(columns=['weight']),
    y_train,
    x_train['weight'],
    test_size=0.15,
    random_state=42,
)

In [7]:
# Check the dimensions of the training, evaluation and test predictors.
print(*(frame.shape for frame in (x_train, x_eval, x_test)))

(17340, 57) (3061, 57) (2060, 57)


## Model Building
### Fitting / Training Model

In [8]:
# As an example, use the simplest algorithm (linear regression) and train it
# on the training split; fit() returns the fitted estimator itself.
model = LinearRegression().fit(x_train, y_train)

### Evaluating Model

In [9]:
# Score the model on the training data itself (not recommended: misleading).
train_predict = model.predict(x_train)
mean_absolute_error(
    y_true=y_train,
    y_pred=train_predict,
    sample_weight=weight_train,
)

1.0547711317110642

In [10]:
# Score the model on the held-out evaluation data (hold-out evaluation method).
eval_predict = model.predict(x_eval)
mean_absolute_error(
    y_true=y_eval,
    y_pred=eval_predict,
    sample_weight=weight_eval,
)

1.1432710379506117

## Finishing
### Make Prediction

In [11]:
# Predict the target for the test predictors with the fitted model.
submit_predict = model.predict(X=x_test)

### Export File for Submission

In [12]:
# Build the submission frame (id + predicted trade_price) as a NEW frame.
# test_df is deliberately left untouched: adding the prediction column to it
# would change its column layout and break a re-run of the predictor
# selection, which slices test_df to obtain the model inputs.
submit_df = test_df[['id']].assign(trade_price=submit_predict)

# Write the submission file without the index column.
submit_df.to_csv('submit.csv', sep=",", index=False)