# 1 Baseline (mean prediction)
# 2 Data Cleaning & Preparation
# 3 Linear Regression Model
# 4 Feature Engineering
# 5 Tuning
# 6 Cross Validation
# 7 Residual Analysis

In [91]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [92]:
# Toy student dataset: three predictor columns plus the FinalMarks target,
# one list entry per student (nine rows in total).
data = dict(
    StudyHours=[2, 3, 4, 5, 6, 7, 8, 9, 10],
    Attendance=[60, 65, 70, 72, 75, 80, 85, 88, 90],
    PrevScore=[40, 45, 50, 55, 60, 65, 70, 75, 80],
    FinalMarks=[50, 55, 60, 65, 70, 75, 78, 85, 88],
)
df = pd.DataFrame(data)
df

Unnamed: 0,StudyHours,Attendance,PrevScore,FinalMarks
0,2,60,40,50
1,3,65,45,55
2,4,70,50,60
3,5,72,55,65
4,6,75,60,70
5,7,80,65,75
6,8,85,70,78
7,9,88,75,85
8,10,90,80,88


In [93]:
# Select the FinalMarks column as the target series and display it.
y=df['FinalMarks']
y

0    50
1    55
2    60
3    65
4    70
5    75
6    78
7    85
8    88
Name: FinalMarks, dtype: int64

In [94]:
# Baseline predictor: the same constant (the mean of y over all rows) for every
# sample, giving an array with one entry per element of y.
y_pred_baseline = np.full(len(y), y.mean())

In [95]:
# Mean absolute error of the constant-mean baseline against the true targets.
mae = mean_absolute_error(y_true=y, y_pred=y_pred_baseline)
print(mae)

10.716049382716049


In [96]:
# Mean squared error of the constant-mean baseline against the true targets.
mse = mean_squared_error(y_true=y, y_pred=y_pred_baseline)
print(mse)

154.02469135802468


In [97]:
# RMSE of the baseline: square root of the baseline MSE from the previous cell.
rmse=np.sqrt(mse)

In [98]:
# R-squared of the constant-mean baseline, followed by a one-line summary of
# all three baseline metrics rounded to two decimals.
r2 = r2_score(y_true=y, y_pred=y_pred_baseline)
print(f"Baseline MAE={mae:.2f}, RMSE={rmse:.2f}, R-Squared={r2:.2f}")

Baseline MAE=10.72, RMSE=12.41, R-Squared=0.00


In [99]:
# Data cleaning, step 1: count the missing values in each column.
print(df.isna().sum())

StudyHours    0
Attendance    0
PrevScore     0
FinalMarks    0
dtype: int64


In [100]:
# Replace any missing value with the mean of its column. The null check in the
# previous cell reports zero missing values, so on this data nothing changes.
df=df.fillna(df.mean())

In [101]:
# Display the frame after the fill step.
df

Unnamed: 0,StudyHours,Attendance,PrevScore,FinalMarks
0,2,60,40,50
1,3,65,45,55
2,4,70,50,60
3,5,72,55,65
4,6,75,60,70
5,7,80,65,75
6,8,85,70,78
7,9,88,75,85
8,10,90,80,88


In [102]:
# Data prep, find input, output
X = df[['StudyHours', 'Attendance', 'PrevScore']]
y = df['FinalMarks']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [103]:
# Model creation: a LinearRegression estimator with its default settings
# (not yet fitted).
model=LinearRegression()

In [104]:
# Fit the model on the training split only; the test rows are kept for evaluation.
model.fit(X_train, y_train)

In [105]:
# Predict the target for the held-out test rows and print the raw predictions.
y_pred = model.predict(X_test)
print(y_pred)

[83.23783784 55.16756757 73.92972973]


In [106]:
# Mean absolute error of the fitted model on the held-out test rows.
# Note: this overwrites the baseline value stored in `mae` earlier.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print(mae)

0.9999999999999977


In [107]:
# Mean squared error of the fitted model on the held-out test rows.
# Note: this overwrites the baseline value stored in `mse` earlier.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(mse)

1.4262576089603043


In [108]:
# RMSE on the test split: square root of the test MSE from the previous cell.
rmse=np.sqrt(mse)
print(rmse)

1.1942602768912245


In [109]:
# R-squared of the fitted model on the held-out test rows.
r2 = r2_score(y_true=y_test, y_pred=y_pred)

In [110]:
# At this point mae / rmse / r2 hold the fitted LinearRegression's scores on the
# held-out test split (they were reassigned after the baseline section), so the
# summary is labelled as model metrics rather than "Baseline". Re-run the cell
# to refresh the stored output.
print(f"Linear Regression MAE={mae:.2f}, RMSE={rmse:.2f}, R-Squared={r2:.2f}")

Baseline MAE=1.00, RMSE=1.19, R-Squared=0.99


# 1. Baseline (mean prediction)
2. Data Cleaning & Preparation
3. Linear Regression Model
4. Find out MAE, RMSE, R-squared
5. Data Set as follows
------------------------------------------
data = {
    'Area':[850,900,1000,1100,1200,1500,1600,1800,2000],
    'Bedrooms':[1,2,2,2,3,3,2,4,4],
    'Age': [1,1,3,2,1,2,2,1,2],
    'PriceIn100K': [5,6,7,7,7,9,9,11,None]
    }

In [65]:
# Toy housing dataset: three predictor columns plus the PriceIn100K target.
# The last row has no price (None), which pandas stores as NaN.
data = dict(
    Area=[850, 900, 1000, 1100, 1200, 1500, 1600, 1800, 2000],
    Bedrooms=[1, 2, 2, 2, 3, 3, 2, 4, 4],
    Age=[1, 1, 3, 2, 1, 2, 2, 1, 2],
    PriceIn100K=[5, 6, 7, 7, 7, 9, 9, 11, None],
)
df = pd.DataFrame(data)
df

Unnamed: 0,Area,Bedrooms,Age,PriceIn100K
0,850,1,1,5.0
1,900,2,1,6.0
2,1000,2,3,7.0
3,1100,2,2,7.0
4,1200,3,1,7.0
5,1500,3,2,9.0
6,1600,2,2,9.0
7,1800,4,1,11.0
8,2000,4,2,


In [70]:
# Drop only the rows whose target (PriceIn100K) is missing: a row without a
# price cannot be used for training or scoring. Restricting the drop to the
# target column keeps rows that merely lack a feature value, so the later
# mean-imputation step can still fill those in. On this dataset the result is
# the same as a bare dropna(): only the last row is removed.
df = df.dropna(subset=['PriceIn100K'])

In [71]:
# Select the PriceIn100K column as the target series and display it.
y=df['PriceIn100K']
y

0     5.0
1     6.0
2     7.0
3     7.0
4     7.0
5     9.0
6     9.0
7    11.0
Name: PriceIn100K, dtype: float64

In [72]:
# Baseline predictor: the same constant (the mean of y over all rows) for every
# sample, giving an array with one entry per element of y.
y_pred_baseline = np.full(len(y), y.mean())

In [73]:
# Mean absolute error of the constant-mean baseline against the true prices.
mae = mean_absolute_error(y_true=y, y_pred=y_pred_baseline)
print(mae)

1.53125


In [74]:
# Mean squared error of the constant-mean baseline against the true prices.
mse = mean_squared_error(y_true=y, y_pred=y_pred_baseline)
print(mse)

3.234375


In [75]:
# RMSE of the baseline: square root of the baseline MSE from the previous cell.
rmse=np.sqrt(mse)

In [76]:
# Show the baseline RMSE.
print(rmse)

1.79843682124227


In [77]:
# R-squared of the constant-mean baseline, followed by a one-line summary of
# all three baseline metrics rounded to two decimals.
r2 = r2_score(y_true=y, y_pred=y_pred_baseline)
print(f"Baseline MAE={mae:.2f}, RMSE={rmse:.2f}, R-Squared={r2:.2f}")

Baseline MAE=1.53, RMSE=1.80, R-Squared=0.00


In [78]:
# Count the missing values remaining in each column.
print(df.isna().sum())

Area           0
Bedrooms       0
Age            0
PriceIn100K    0
dtype: int64


In [79]:
# Replace any missing value with the mean of its column. The null check in the
# previous cell reports zero missing values, so on this data nothing changes.
df=df.fillna(df.mean())

In [80]:
# Display the cleaned frame (the row with the missing price is no longer present).
df

Unnamed: 0,Area,Bedrooms,Age,PriceIn100K
0,850,1,1,5.0
1,900,2,1,6.0
2,1000,2,3,7.0
3,1100,2,2,7.0
4,1200,3,1,7.0
5,1500,3,2,9.0
6,1600,2,2,9.0
7,1800,4,1,11.0


In [81]:
# Data prep, find input, output
X = df[['Area', 'Bedrooms', 'Age']]
y = df['PriceIn100K']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [82]:
# Model creation: a LinearRegression estimator with its default settings
# (not yet fitted).
model=LinearRegression()

In [83]:
# Fit the model on the training split only; the test rows are kept for evaluation.
model.fit(X_train, y_train)

In [84]:
# Predict the price for the held-out test rows and print the raw predictions.
y_pred = model.predict(X_test)
print(y_pred)

[4.89140271 9.47963801 3.80090498]


In [85]:
# Mean absolute error of the fitted model on the held-out test rows.
# Note: this overwrites the baseline value stored in `mae` earlier.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print(mae)

0.9291101055806944


In [86]:
# Mean squared error of the fitted model on the held-out test rows.
# Note: this overwrites the baseline value stored in `mse` earlier.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(mse)

0.9656231444892631


In [87]:
# RMSE on the test split: square root of the test MSE from the previous cell.
rmse=np.sqrt(mse)
print(rmse)

0.9826612562268154


In [88]:
# R-squared of the fitted model on the held-out test rows.
r2 = r2_score(y_true=y_test, y_pred=y_pred)

In [89]:
# At this point mae / rmse / r2 hold the fitted LinearRegression's scores on the
# held-out test split (they were reassigned after the baseline section), so the
# summary is labelled as model metrics rather than "Baseline". Re-run the cell
# to refresh the stored output.
print(f"Linear Regression MAE={mae:.2f}, RMSE={rmse:.2f}, R-Squared={r2:.2f}")

Baseline MAE=0.93, RMSE=0.98, R-Squared=0.67
