In [1]:
import pandas as pd
import numpy as np
import os
import warnings
warnings.filterwarnings('ignore')


### IMPORT DATA

In [2]:
df_menengah_20 = pd.read_csv('./fix_dataset/dataset_InflasiPendidikan/Inflasi Pendidikan 2020-Menengah.csv')
df_menengah_21 = pd.read_csv('./fix_dataset/dataset_InflasiPendidikan/Inflasi Pendidikan 2021-Menengah.csv')
df_menengah_22 = pd.read_csv('./fix_dataset/dataset_InflasiPendidikan/Inflasi Pendidikan 2022-Menengah.csv')
df_menengah_23 = pd.read_csv('./fix_dataset/dataset_InflasiPendidikan/Inflasi Pendidikan 2023-Menengah.csv')

### DATA CLEANING

* Merge Data
* Check Null, Unique, Object, etc from Data

In [3]:
df_menengah_20.shape

(91, 13)

In [4]:
temp1 = pd.merge(df_menengah_20, df_menengah_21, on='Kota')
temp2 = pd.merge(df_menengah_22, df_menengah_23, on='Kota')
df_menengah = pd.merge(temp1, temp2, on='Kota')
df_menengah.head()

Unnamed: 0,Kota,01/01/2020,01/02/2020,01/03/2020,01/04/2020,01/05/2020,01/06/2020,01/07/2020,01/08/2020,01/09/2020,...,01/03/2023,01/04/2023,01/05/2023,01/06/2023,01/07/2023,01/08/2023,01/09/2023,01/10/2023,01/11/2023,01/12/2023
0,KOTA MEULABOH,0.0,0.0,0.0,0.0,0.0,0.0,4.23,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
1,KOTA BANDA ACEH,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.08,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
2,KOTA LHOKSEUMAWE,0.0,0.0,0.0,0.0,0.0,0.0,0.29,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,,
3,KOTA SIBOLGA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,...,0.0,0.0,0.0,0.0,0.0,3.16,0.0,0.0,,
4,KOTA PEMATANG SIANTAR,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.32,0.0,...,0.0,0.0,0.0,0.0,0.0,0.92,0.0,0.0,,


In [5]:
temp = df_menengah.melt(id_vars=['Kota'], var_name='Date', value_name='InflationRate')
temp['Date'] = pd.to_datetime(temp['Date'], format='%d/%m/%Y')
temp['Year'] = temp['Date'].dt.year
temp['Month'] = temp['Date'].dt.month
temp.drop('Date', axis=1, inplace=True)
df_menengah = temp[['Kota', 'Year', 'Month', 'InflationRate']]

In [6]:
df_menengah['Kota'].unique()

array(['KOTA MEULABOH', 'KOTA BANDA ACEH', 'KOTA LHOKSEUMAWE',
       'KOTA SIBOLGA', 'KOTA PEMATANG SIANTAR', 'KOTA MEDAN',
       'KOTA PADANGSIDIMPUAN', 'KOTA GUNUNGSITOLI', 'KOTA PADANG',
       'KOTA BUKITTINGGI', 'TEMBILAHAN', 'KOTA PEKANBARU', 'KOTA DUMAI',
       'BUNGO', 'KOTA JAMBI', 'KOTA PALEMBANG', 'KOTA LUBUKLINGGAU',
       'KOTA BENGKULU', 'KOTA BANDAR LAMPUNG', 'KOTA METRO',
       'TANJUNG PANDAN', 'KOTA PANGKAL PINANG', 'KOTA BATAM',
       'KOTA TANJUNG PINANG', 'DKI JAKARTA', 'KOTA BOGOR',
       'KOTA SUKABUMI', 'KOTA BANDUNG', 'KOTA CIREBON', 'KOTA BEKASI',
       'KOTA DEPOK', 'KOTA TASIKMALAYA', 'CILACAP', 'PURWOKERTO', 'KUDUS',
       'KOTA SURAKARTA', 'KOTA SEMARANG', 'KOTA TEGAL', 'KOTA YOGYAKARTA',
       'JEMBER', 'BANYUWANGI', 'SUMENEP', 'KOTA KEDIRI', 'KOTA MALANG',
       'KOTA PROBOLINGGO', 'KOTA MADIUN', 'KOTA SURABAYA',
       'KOTA TANGERANG', 'KOTA CILEGON', 'KOTA SERANG', 'SINGARAJA',
       'KOTA DENPASAR', 'KOTA MATARAM', 'KOTA BIMA', 'WAINGAPU'

In [7]:
df_menengah.isnull().sum()

Kota               0
Year               0
Month              0
InflationRate    182
dtype: int64

In [8]:
df_menengah['InflationRate'].fillna(0, inplace=True)

In [9]:
df_menengah.isnull().sum()

Kota             0
Year             0
Month            0
InflationRate    0
dtype: int64

In [10]:
df_menengah["Category"] = "Pendidikan"
df_menengah.head()

Unnamed: 0,Kota,Year,Month,InflationRate,Category
0,KOTA MEULABOH,2020,1,0.0,Pendidikan
1,KOTA BANDA ACEH,2020,1,0.0,Pendidikan
2,KOTA LHOKSEUMAWE,2020,1,0.0,Pendidikan
3,KOTA SIBOLGA,2020,1,0.0,Pendidikan
4,KOTA PEMATANG SIANTAR,2020,1,0.0,Pendidikan


### PRE-PROCESSING

* Divide Label and Features
* Splitting Data
* Standard Scaler

In [11]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [12]:
labelEncoder = LabelEncoder()
df_menengah['Kota'] = labelEncoder.fit_transform(df_menengah['Kota'])
df_menengah['Category'] = labelEncoder.fit_transform(df_menengah['Category'])

In [13]:
X = df_menengah.drop('InflationRate', axis=1)
y = df_menengah['InflationRate']

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [15]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

### DATA MODELLING

* Model = (Support Vector Regression) SVR
* Model = Random Forest Regression

In [16]:
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

In [17]:
model_svr = SVR(kernel='linear')
model_rfr = RandomForestRegressor(n_estimators=10, random_state=42)
model_lr = LinearRegression()
model_svr.fit(X_train, y_train)
model_rfr.fit(X_train, y_train)
model_lr.fit(X_train, y_train)

### CHECKING RESULT

* Make Y Predict
* Checking MSE, MAE, R2 Score

In [18]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [19]:
Y_pred_svr = model_svr.predict(X_train)
Y_pred_rfr = model_rfr.predict(X_train)
Y_pred_lr = model_lr.predict(X_train)

In [20]:
#SVR
print('SVR')
print('Mean Squared Error:', mean_squared_error(y_train, Y_pred_svr))
print('Mean Absolute Error:', mean_absolute_error(y_train, Y_pred_svr))
print('R^2 Score:', r2_score(y_train, Y_pred_svr))

SVR
Mean Squared Error: 1.1188904848464913
Mean Absolute Error: 0.24674359334339827
R^2 Score: -0.0003510090634195606


In [21]:
#Random Forest Regressor
print('Random Forest Regressor')
print('Mean Squared Error:', mean_squared_error(y_train, Y_pred_rfr))
print('Mean Absolute Error:', mean_absolute_error(y_train, Y_pred_rfr))
print('R^2 Score:', r2_score(y_train, Y_pred_rfr))


Random Forest Regressor
Mean Squared Error: 0.2933883122495707
Mean Absolute Error: 0.1028400114481969
R^2 Score: 0.7376943515195432


In [22]:
#Linear Regression
print('Linear Regression')
print('Mean Squared Error:', mean_squared_error(y_train, Y_pred_lr))
print('Mean Absolute Error:', mean_absolute_error(y_train, Y_pred_lr))
print('R^2 Score:', r2_score(y_train, Y_pred_lr))

Linear Regression
Mean Squared Error: 1.1106977937235145
Mean Absolute Error: 0.2442924039684098
R^2 Score: 0.00697371747846276


Dari ketiga hasil evaluasi tersebut, kita dapat mempertimbangkan untuk menilai performa model berdasarkan kriteria tertentu:

- **Mean Squared Error (MSE):**
  - Random Forest Regressor: 0.2934
  - Linear Regression: 1.1107
  - SVR (Support Vector Regressor): 1.1189

  **Kesimpulan:** Nilai MSE yang lebih rendah menunjukkan kinerja yang lebih baik. Dengan demikian, Random Forest Regressor memiliki MSE yang paling baik.

- **Mean Absolute Error (MAE):**
  - Random Forest Regressor: 0.1028
  - Linear Regression: 0.2443
  - SVR: 0.2467

  **Kesimpulan:** Nilai MAE yang lebih rendah menunjukkan kinerja yang lebih baik. Random Forest Regressor juga memiliki MAE yang lebih rendah.

- **R^2 Score:**
  - Random Forest Regressor: 0.7377
  - Linear Regression: 0.0070
  - SVR: -0.0004
  
  **Kesimpulan:** Nilai R^2 yang lebih tinggi menunjukkan model yang lebih baik. Random Forest Regressor memiliki R^2 yang paling tinggi, menunjukkan kemampuan model dalam menjelaskan variasi dalam data.

Berdasarkan kriteria ini, secara keseluruhan, **Random Forest Regressor** tampaknya menjadi model yang paling baik di antara ketiganya.

### SAVE MODEL FOR : DF_MENENGAH

In [23]:
import joblib

joblib.dump(model_rfr, 'model_menengah.pkl')

['model_menengah.pkl']