# Tugas Regression

Tugas kali ini adalah menghitung hasil evaluasi MAE, MSE, RMSE, dan R2 Score dari model regresi ketika data memiliki kolom State yang bertipe string.

In [7]:
# Import the packages used in this project
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [8]:
# Load the dataset to analyse and preview its first five rows
csv_path = '50_Startups.csv'
df = pd.read_csv(csv_path)

df.head(5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [9]:
# Split the frame into features and target.
# Columns 0-3 are the features (State is the last of them, at position 3);
# the final column of the frame is the target.
xm = df.iloc[:, 0:4].values
ym = df.iloc[:, -1].values

# Reshape the target into a single-column 2-D array
ym = ym.reshape(-1, 1)
# Sanity check replacing the former bare `ym.shape` expression, which was never
# displayed because it was not the last expression of the cell.
assert ym.shape == (len(df), 1)

# Assignment: include State in the model.
# Note: State has to be converted to numeric values (encoded); One Hot Encoder works,
# and it can be applied directly to a column through ColumnTransformer.

# Replace the string-valued State column (index 3) with one-hot columns; the other
# columns pass through unchanged and are placed after the encoded ones.
# sparse_threshold=0 makes the transformer always return a dense array, so the
# np.array(...) wrapper below never receives a SciPy sparse matrix.
tf = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [3])],
    remainder='passthrough',
    sparse_threshold=0,
)

xm = np.array(tf.fit_transform(xm))

xm

array([[0.0, 0.0, 1.0, 165349.2, 136897.8, 471784.1],
       [1.0, 0.0, 0.0, 162597.7, 151377.59, 443898.53],
       [0.0, 1.0, 0.0, 153441.51, 101145.55, 407934.54],
       [0.0, 0.0, 1.0, 144372.41, 118671.85, 383199.62],
       [0.0, 1.0, 0.0, 142107.34, 91391.77, 366168.42],
       [0.0, 0.0, 1.0, 131876.9, 99814.71, 362861.36],
       [1.0, 0.0, 0.0, 134615.46, 147198.87, 127716.82],
       [0.0, 1.0, 0.0, 130298.13, 145530.06, 323876.68],
       [0.0, 0.0, 1.0, 120542.52, 148718.95, 311613.29],
       [1.0, 0.0, 0.0, 123334.88, 108679.17, 304981.62],
       [0.0, 1.0, 0.0, 101913.08, 110594.11, 229160.95],
       [1.0, 0.0, 0.0, 100671.96, 91790.61, 249744.55],
       [0.0, 1.0, 0.0, 93863.75, 127320.38, 249839.44],
       [1.0, 0.0, 0.0, 91992.39, 135495.07, 252664.93],
       [0.0, 1.0, 0.0, 119943.24, 156547.42, 256512.92],
       [0.0, 0.0, 1.0, 114523.61, 122616.84, 261776.23],
       [1.0, 0.0, 0.0, 78013.11, 121597.55, 264346.06],
       [0.0, 0.0, 1.0, 94657.16, 145077.58

In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
Xm_train, Xm_test, ym_train, ym_test = train_test_split(
    xm,
    ym,
    test_size=0.2,
    random_state=30,
)

# Report how many rows ended up in each split
for label, subset in (("Jumlah X Train :", Xm_train), ("Jumlah X Test :", Xm_test)):
    print(label, len(subset))

Jumlah X Train : 40
Jumlah X Test : 10


In [11]:
# Train a linear regression on the training split (fit returns the estimator itself)
mlr = LinearRegression().fit(Xm_train, ym_train)

# Predictions for the held-out test features
ym_pred = mlr.predict(Xm_test)

# Side-by-side table: actual value in the first column, predicted value in the second
comp = np.hstack((ym_test, ym_pred))

comp

array([[ 77798.83      ,  74204.24896079],
       [118474.03      , 116566.94205365],
       [ 97427.84      ,  96481.75206871],
       [ 69758.98      ,  57440.67122773],
       [146121.95      , 134531.20435249],
       [101004.64      ,  98963.15266496],
       [ 96778.92      ,  97302.53943633],
       [107404.34      , 100426.50977313],
       [ 96712.8       ,  87823.65877003],
       [122776.86      , 111618.76401953]])

In [12]:
# Model Evaluation

from sklearn.metrics import mean_absolute_error, mean_squared_error,r2_score

# Regression metrics computed on the held-out test split
mae = mean_absolute_error(ym_test, ym_pred)
mse = mean_squared_error(ym_test, ym_pred)
# RMSE is derived from the MSE directly instead of passing `squared=False`:
# that keyword was deprecated in scikit-learn 1.4 and removed in 1.6, so it raises
# TypeError on current releases. For this single-output target the value is identical.
rmse = np.sqrt(mse)
r2 = r2_score(ym_test, ym_pred)

print(f'Hasil Evaluasi MAE = {mae}')
print(f'Hasil Evaluasi MSE = {mse}')
print(f'Hasil Evaluasi RMSE = {rmse}')
print(f'Hasil Evaluasi R2 = {r2}')

Hasil Evaluasi MAE = 5994.698554530379
Hasil Evaluasi MSE = 56019109.55744183
Hasil Evaluasi RMSE = 7484.591475654622
Hasil Evaluasi R2 = 0.8715447367657808
