# 1. Baseline (mean prediction)
# 2. Data cleaning & preparation
# 3. Linear Regression Model
# 4. Feature Engineering
# 5. Tuning
# 6. Cross-validation
# 7. Residual Analysis

In [33]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [34]:
# Toy student dataset: nine rows with three candidate predictors and the
# FinalMarks column that later cells use as the regression target.
data = dict(
    StudyHours=[2, 3, 4, 5, 6, 7, 8, 9, 10],
    Attendance=[60, 65, 70, 72, 75, 80, 85, 88, 90],
    PrevScore=[40, 45, 50, 55, 60, 65, 70, 75, 80],
    FinalMarks=[50, 55, 60, 65, 70, 75, 78, 85, 88],
)

df = pd.DataFrame(data)
df  # last expression of the cell, so the frame is rendered as a table

Unnamed: 0,StudyHours,Attendance,PrevScore,FinalMarks
0,2,60,40,50
1,3,65,45,55
2,4,70,50,60
3,5,72,55,65
4,6,75,60,70
5,7,80,65,75
6,8,85,70,78
7,9,88,75,85
8,10,90,80,88


In [35]:
# Target vector: the marks column, used as ground truth when scoring the
# mean-prediction baseline in the following cells.
target_column = 'FinalMarks'
y = df[target_column]
y

0    50
1    55
2    60
3    65
4    70
5    75
6    78
7    85
8    88
Name: FinalMarks, dtype: int64

In [36]:
# Baseline "model": predict the mean of the target for every single row.
# NOTE(review): the mean is taken over the same rows the baseline is later
# scored on, whereas the regression further down is scored on a held-out
# split, so the two sets of metrics are not measured on the same data.
n_rows = len(y)
y_pred_baseline = np.full(n_rows, y.mean())

In [37]:
# Mean absolute error of the constant-mean baseline against the true marks.
mae = mean_absolute_error(y_true=y, y_pred=y_pred_baseline)
print(mae)

10.716049382716049


In [38]:
# Mean squared error of the constant-mean baseline against the true marks.
mse = mean_squared_error(y_true=y, y_pred=y_pred_baseline)
print(mse)

154.02469135802468


In [39]:
# Root mean squared error of the baseline: the square root of the MSE above,
# which expresses the error in the same units as the target.
rmse=np.sqrt(mse)

In [40]:
# Show the baseline RMSE.
print(rmse)

12.410668449282847


In [41]:
# R-squared of the constant-mean prediction, followed by a one-line summary
# of all three baseline metrics rounded to two decimals.
r2 = r2_score(y_true=y, y_pred=y_pred_baseline)

baseline_report = f"Baseline MAE={mae:.2f}, RMSE={rmse:.2f}, R-Squared={r2:.2f}"
print(baseline_report)

Baseline MAE=10.72, RMSE=12.41, R-Squared=0.00


In [42]:
# Data cleaning, step 1: count the missing values in every column.
null_counts = df.isna().sum()
print(null_counts)

StudyHours    0
Attendance    0
PrevScore     0
FinalMarks    0
dtype: int64


In [43]:
# Data cleaning, step 2: replace any missing entry with the mean of its column.
column_means = df.mean()
df = df.fillna(value=column_means)

In [44]:
# Display the frame again after the fillna step.
df

Unnamed: 0,StudyHours,Attendance,PrevScore,FinalMarks
0,2,60,40,50
1,3,65,45,55
2,4,70,50,60
3,5,72,55,65
4,6,75,60,70
5,7,80,65,75
6,8,85,70,78
7,9,88,75,85
8,10,90,80,88


In [45]:
# Data prep: separate the predictor columns (X) from the target (y), then
# hold out 30% of the rows for testing. random_state pins the shuffle so the
# split is the same on every run.
feature_columns = ['StudyHours', 'Attendance', 'PrevScore']
X = df[feature_columns]
y = df['FinalMarks']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [46]:
# Model creation: a scikit-learn LinearRegression estimator constructed with
# its default settings (no arguments are passed).
model=LinearRegression()

In [47]:
# Fit the regressor on the training split only; the test rows are not used here.
model.fit(X_train,y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [48]:
# Predict final marks for the held-out rows and show the raw predictions.
y_pred = model.predict(X=X_test)
print(y_pred)

[83.23783784 55.16756757 73.92972973]


In [49]:
# Mean absolute error of the fitted model on the held-out rows.
# Note: this reuses the name `mae`, replacing the baseline value stored earlier.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print(mae)

0.9999999999999858


In [53]:
# Mean squared error on the held-out rows: measures the difference between
# the test targets and the model's predictions.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(mse)

1.4262576089602703


In [51]:
# RMSE of the fitted model: square root of the test-set MSE, which puts the
# error back in the same units as the target.
rmse=np.sqrt(mse)
print(rmse)

1.1942602768912103


In [52]:
# R-squared of the fitted model on the held-out rows.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(r2)

0.9908312010852554


In [55]:
# R-squared indicates fit quality: the closer the value is to 1, the better
# the predictions.
# The label says "Linear Regression" because mae/rmse/r2 at this point were
# computed from y_test and y_pred (the fitted model), not from the
# mean-prediction baseline.
print(f"Linear Regression MAE={mae:.2f}, RMSE={rmse:.2f}, R-Squared={r2:.2f}")

Baseline MAE=1.00, RMSE=1.19, R-Squared=0.99


# 1. Baseline mean prediction
2. Data cleaning and preparation
3. Linear Regression Model
4. Find out MAE, RMSE, R-Squared
5. Data set as follows

In [58]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [59]:
# Toy housing dataset: nine rows, three predictors and the price target.
# The None in PriceIn100k becomes NaN once the frame is built.
# NOTE(review): Area contains 16000 and PriceIn100k contains 70, both far from
# the other values in their columns -- possibly typos or deliberate outliers
# for the cleaning exercise; confirm before modelling.
data = dict(
    Area=[850, 900, 1000, 1100, 1200, 1500, 16000, 1800, 2000],
    Bedrooms=[1, 2, 2, 2, 3, 3, 2, 4, 4],
    Age=[1, 1, 3, 2, 1, 2, 2, 1, 2],
    PriceIn100k=[5, 6, 7, 7, 70, 9, 9, 11, None],
)

df = pd.DataFrame(data)
df  # last expression of the cell, so the frame is rendered as a table

Unnamed: 0,Area,Bedrooms,Age,PriceIn100k
0,850,1,1,5.0
1,900,2,1,6.0
2,1000,2,3,7.0
3,1100,2,2,7.0
4,1200,3,1,70.0
5,1500,3,2,9.0
6,16000,2,2,9.0
7,1800,4,1,11.0
8,2000,4,2,


In [60]:
# Data cleaning, step 1: count the missing values in every column.
null_counts = df.isna().sum()
print(null_counts)

Area           0
Bedrooms       0
Age            0
PriceIn100k    1
dtype: int64


In [61]:
# Data cleaning, step 2: replace each missing entry with the mean of its column.
# NOTE(review): the only gap is in PriceIn100k, which a later cell uses as the
# regression target, so this imputes a label rather than a feature. The column
# mean also includes the 70 entry, which pulls the filled value up to 15.5.
# Consider dropping the row or treating the outlier first -- confirm intent.
column_means = df.mean()
df = df.fillna(value=column_means)

In [62]:
# Display the frame again after the fillna step.
df

Unnamed: 0,Area,Bedrooms,Age,PriceIn100k
0,850,1,1,5.0
1,900,2,1,6.0
2,1000,2,3,7.0
3,1100,2,2,7.0
4,1200,3,1,70.0
5,1500,3,2,9.0
6,16000,2,2,9.0
7,1800,4,1,11.0
8,2000,4,2,15.5


In [81]:
# Data prep: separate the predictor columns (X) from the price target (y),
# then hold out 20% of the rows for testing. random_state pins the shuffle so
# the split is the same on every run.
feature_columns = ['Area', 'Bedrooms', 'Age']
X = df[feature_columns]
y = df['PriceIn100k']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [82]:
# Model creation: a scikit-learn LinearRegression estimator constructed with
# its default settings (no arguments are passed).
model=LinearRegression()

In [92]:
# Fit the regressor on the training split only; the test rows are not used here.
model.fit(X_train,y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [94]:
# Predict prices for the held-out rows and show the raw predictions.
y_pred = model.predict(X=X_test)
print(y_pred)

[ 63.78691003 -42.84839412]


In [95]:
# Mean absolute error of the fitted model on the held-out rows.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print(mae)

51.317652071520314


In [96]:
# Mean squared error on the held-out rows: measures the difference between
# the test targets and the model's predictions.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(mse)

2635.660133071676


In [97]:
# RMSE of the fitted model: square root of the test-set MSE, which puts the
# error back in the same units as the target.
rmse=np.sqrt(mse)
print(rmse)

51.338680671319125


In [98]:
# R-squared of the fitted model on the held-out rows. A negative value means
# the predictions score worse than a constant equal to the mean of y_test.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(r2)

-657.915033267919


In [99]:
# One-line summary of the fitted model's test-set metrics, rounded to two
# decimals. The label says "Linear Regression" because mae/rmse/r2 at this
# point were computed from y_test and y_pred, not from a mean-prediction
# baseline (this section of the notebook never computes one).
print(f"Linear Regression MAE={mae:.2f}, RMSE={rmse:.2f}, R-Squared={r2:.2f}")

Baseline MAE=51.32, RMSE=51.34, R-Squared=-657.92
