In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline


1. Business Objective: To predict prices of a house

In [2]:
house_data = pd.read_csv('homeprices_mr.csv')

In [3]:
house_data.describe()

Unnamed: 0,area,bedrooms,age,price
count,6.0,5.0,6.0,6.0
mean,3416.666667,4.2,16.5,648333.333333
std,587.934237,1.30384,8.288546,109117.673484
min,2600.0,3.0,8.0,550000.0
25%,3050.0,3.0,9.75,572500.0
50%,3400.0,4.0,16.5,602500.0
75%,3900.0,5.0,19.5,722500.0
max,4100.0,6.0,30.0,810000.0


In [4]:
house_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   area      6 non-null      int64  
 1   bedrooms  5 non-null      float64
 2   age       6 non-null      int64  
 3   price     6 non-null      int64  
dtypes: float64(1), int64(3)
memory usage: 320.0 bytes


Independent variable: area,bedrooms,age
Dependent variable: price


In [5]:
house_data


Unnamed: 0,area,bedrooms,age,price
0,2600,3.0,20,550000
1,3000,4.0,15,565000
2,3200,,18,610000
3,3600,3.0,30,595000
4,4000,5.0,8,760000
5,4100,6.0,8,810000


In [6]:
house_data.bedrooms=house_data['bedrooms'].fillna(house_data['bedrooms'].median())

In [7]:
house_data

Unnamed: 0,area,bedrooms,age,price
0,2600,3.0,20,550000
1,3000,4.0,15,565000
2,3200,4.0,18,610000
3,3600,3.0,30,595000
4,4000,5.0,8,760000
5,4100,6.0,8,810000


## Correlation analysis


In [8]:
house_data.corr()

Unnamed: 0,area,bedrooms,age,price
area,1.0,0.75171,-0.4453,0.901476
bedrooms,0.75171,1.0,-0.877221,0.919803
age,-0.4453,-0.877221,1.0,-0.734167
price,0.901476,0.919803,-0.734167,1.0


#### From the above table we can say that area and bedrooms are positively corelated to price
#### and the age of the flat is negatively corelated to price

# Model Building 
## Model Equation

# Price= m1*Area + m2 * bedrooms + m3 * age + c

In [9]:
from sklearn.linear_model import LinearRegression

In [10]:
lm=LinearRegression()
mlr = lm.fit(house_data[['area','bedrooms','age']],house_data['price'])

In [11]:
mlr

## Model Summary

In [12]:
mlr.coef_

array([  112.06244194, 23388.88007794, -3231.71790863])

In [13]:
m1= mlr.coef_[0]
m2 = mlr.coef_[1]
m3= mlr.coef_[2]

In [14]:
print(m1)
print(m2)
print(m3)

112.06244194213465
23388.880077939182
-3231.717908632958


In [15]:
c= mlr.intercept_
c

221323.00186540384

# Model Equation

## Price=  112.06 * Area +  23388.88* bed rooms  + (-3231.71)  * Age + 221323.00

# For some  area, bed rooms, age predict price of a house 


In [16]:
Price= 112.06 * 2600 + 23388.88* 3 + (-3231.71) * 20 + 221323.00
Price

518211.44

In [17]:
# error= Actual-Predicted 
error = 550000-518211.44
error

31788.559999999998

In [18]:
R_square = mlr.score(house_data[['area','bedrooms','age']],house_data['price'])
R_square

0.9550196399325819

The r-sqaure value indicates goodness of fit and its a quite good fit 

In [19]:
Price_predict= mlr.predict(house_data[['area','bedrooms','age']])
Price_predict

array([518217.63297611, 602590.07937407, 615307.4140366 , 597962.89583192,
       760663.42675457, 795258.55102673])

In [20]:
house_data['Predict_price'] = Price_predict

In [21]:
house_data

Unnamed: 0,area,bedrooms,age,price,Predict_price
0,2600,3.0,20,550000,518217.632976
1,3000,4.0,15,565000,602590.079374
2,3200,4.0,18,610000,615307.414037
3,3600,3.0,30,595000,597962.895832
4,4000,5.0,8,760000,760663.426755
5,4100,6.0,8,810000,795258.551027


In [22]:
from sklearn.metrics import mean_squared_error

In [23]:
mse = mean_squared_error(house_data.price, house_data.Predict_price)
mse

446305128.22449297

In [24]:

import math
rmse =math.sqrt(mse)
rmse

21125.934966871715

## Exercise
### The data contains hiring statics for a firm such as experience of candidate, his written test score and personal interview score. Based on these 3 factors, HR will decide the salary. 
### Given this data,  need to build a machine learning model for HR department that can help them decide salaries for future candidates. 
### Using this predict salaries for following candidates,

In [25]:
salary_data = pd.read_csv('hiring.csv')

In [26]:
salary_data.head()

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,,8.0,9,50000
1,,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000


In [27]:
salary_data['experience']=salary_data['experience'].fillna('zero')
salary_data['test_score(out of 10)']=salary_data['test_score(out of 10)'].fillna(salary_data['test_score(out of 10)'].median())

salary_data

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,zero,8.0,9,50000
1,zero,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,8.0,7,72000
7,eleven,7.0,8,80000


In [28]:
from  word2number import w2n


In [29]:
salary_data['experience'] =  salary_data['experience'].apply(w2n.word_to_num)


In [30]:
salary_data

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,0,8.0,9,50000
1,0,8.0,6,45000
2,5,6.0,7,60000
3,2,10.0,10,65000
4,7,9.0,6,70000
5,3,7.0,10,62000
6,10,8.0,7,72000
7,11,7.0,8,80000


In [31]:
salary_data.corr()

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
experience,1.0,-0.221556,-0.29048,0.908017
test_score(out of 10),-0.221556,1.0,0.130871,0.009965
interview_score(out of 10),-0.29048,0.130871,1.0,0.03782
salary($),0.908017,0.009965,0.03782,1.0


## Model Building from statsmodel using OLS method

In [32]:
import statsmodels.api as smf

In [33]:
x_variable = smf.add_constant(salary_data[["experience","test_score(out of 10)","interview_score(out of 10)"]])
x_variable

Unnamed: 0,const,experience,test_score(out of 10),interview_score(out of 10)
0,1.0,0,8.0,9
1,1.0,0,8.0,6
2,1.0,5,6.0,7
3,1.0,2,10.0,10
4,1.0,7,9.0,6
5,1.0,3,7.0,10
6,1.0,10,8.0,7
7,1.0,11,7.0,8


In [34]:
lmr2 = smf.OLS(salary_data['salary($)'],x_variable)

In [35]:
lmr2=lmr2.fit()

In [36]:
lmr2.summary()

0,1,2,3
Dep. Variable:,salary($),R-squared:,0.962
Model:,OLS,Adj. R-squared:,0.933
Method:,Least Squares,F-statistic:,33.46
Date:,"Sun, 07 Jan 2024",Prob (F-statistic):,0.00272
Time:,21:16:33,Log-Likelihood:,-72.572
No. Observations:,8,AIC:,153.1
Df Residuals:,4,BIC:,153.5
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.774e+04,9503.927,1.866,0.135,-8649.867,4.41e+04
experience,2812.9549,280.977,10.011,0.001,2032.837,3593.073
test_score(out of 10),1845.7060,928.583,1.988,0.118,-732.454,4423.866
interview_score(out of 10),2205.2402,718.297,3.070,0.037,210.927,4199.554

0,1,2,3
Omnibus:,1.457,Durbin-Watson:,1.758
Prob(Omnibus):,0.483,Jarque-Bera (JB):,0.812
Skew:,-0.711,Prob(JB):,0.666
Kurtosis:,2.357,Cond. No.,111.0


In [37]:
lmr2.params

const                         17737.263464
experience                     2812.954876
test_score(out of 10)          1845.705968
interview_score(out of 10)     2205.240175
dtype: float64

In [38]:
predict_salary = lmr2.predict()
salary_data['predict_salary'] = predict_salary
salary_data

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($),predict_salary
0,0,8.0,9,50000,52350.07278
1,0,8.0,6,45000,45734.352256
2,5,6.0,7,60000,58312.954876
3,2,10.0,10,65000,63872.634643
4,7,9.0,6,70000,67270.742358
5,3,7.0,10,62000,61148.471616
6,10,8.0,7,72000,76069.141194
7,11,7.0,8,80000,79241.630277


In [39]:
from  statsmodels.tools.eval_measures import rmse 
rmse =rmse(salary_data['predict_salary'], salary_data['salary($)'], axis=0)

In [40]:
rmse

2106.1271258307565

Predict salary for 2 yr experience, 9 test score, 6 interview score

In [45]:
2812.95* 2+ 1845.70* 9+2205.24* 6+ 17737.26

53205.899999999994

In [46]:
2812.95* 12+ 1845.70* 10+2205.24* 10+ 17737.26

92002.05999999998