# `Linear Regression with Categorical Data`

----

In [183]:
import pandas as pd 
import plotly.express as px

In [184]:
# Toy dataset of 16 rows: the first 8 are male, the last 8 female, and each
# group covers the experience levels 1 through 8.
data = {
    "salary": [3, 4, 5, 6, 7, 8, 9, 10, 3, 3.4, 4, 5.2, 6.1, 6.4, 6.7, 7.1],
    "experience": [1, 2, 3, 4, 5, 6, 7, 8] * 2,
    "Gender": ["M"] * 8 + ["F"] * 8,
}

In [185]:
# Turn the dict of equal-length lists into a DataFrame (one column per key).
# Note: this rebinds `data` from a dict to a DataFrame.
data = pd.DataFrame(data)

In [186]:
# Show the resulting frame.
data

Unnamed: 0,salary,experience,Gender
0,3.0,1,M
1,4.0,2,M
2,5.0,3,M
3,6.0,4,M
4,7.0,5,M
5,8.0,6,M
6,9.0,7,M
7,10.0,8,M
8,3.0,1,F
9,3.4,2,F


In [187]:
# Salary against experience, one colour per gender, to see how the two groups
# differ before modelling. The title lets the figure stand on its own when the
# notebook is skimmed.
px.scatter(
    data,
    x="experience",
    y="salary",
    color="Gender",
    title="Salary vs. experience by gender",
)

In [188]:
# Count the rows per gender to confirm the two groups are the same size.
data["Gender"].value_counts()

F    8
M    8
Name: Gender, dtype: int64

### Fitting the categorical data to model

In [189]:
# Target first, then the feature matrix: experience plus the raw,
# still-categorical Gender column.
y = data["salary"]
X = data[["experience", "Gender"]]

In [190]:
# Deliberately left commented out: X still contains the string column "Gender"
# at this point, so this fit is expected to fail (presumably with a ValueError
# from scikit-learn when converting "M"/"F" to float — not run here).
# from sklearn.linear_model import LinearRegression
# mdl = LinearRegression()
# mdl.fit(X,y)

### ` It will not work`

## Why encoding is needed: ML algorithms do not work on categorical (string) data, so we have to convert it to numerical data
### `Label encoding`

In [191]:
# Integer code for each gender label: male -> 0, female -> 1.
label_encoder = dict(M=0, F=1)

In [192]:
# Translate each gender label into its integer code; a label that is missing
# from the mapping falls back to the code 2.
data["Gender_encoded"] = data["Gender"].apply(
    lambda label: label_encoder.get(label, 2)
)

In [193]:
# Inspect the frame again; it now carries the numeric Gender_encoded column.
data

Unnamed: 0,salary,experience,Gender,Gender_encoded
0,3.0,1,M,0
1,4.0,2,M,0
2,5.0,3,M,0
3,6.0,4,M,0
4,7.0,5,M,0
5,8.0,6,M,0
6,9.0,7,M,0
7,10.0,8,M,0
8,3.0,1,F,1
9,3.4,2,F,1


In [194]:
# Same target as before; the features now use the numeric gender code in
# place of the raw string column.
y = data["salary"]
X = data[["experience", "Gender_encoded"]]

In [195]:
from sklearn.linear_model import LinearRegression
# Ordinary least-squares fit of salary on experience and the encoded gender.
mdl = LinearRegression()
# fit() is the cell's last expression, so the fitted estimator is displayed.
mdl.fit(X,y)

LinearRegression()

In [196]:
# coef_ holds one coefficient per column of X, in column order:
# [experience, Gender_encoded].
print('Intercept:', mdl.intercept_)
print('Slope:', mdl.coef_)

Intercept: 2.8223214285714295
Slope: [ 0.8172619 -1.2625   ]


In [197]:
# Predict on the same rows the model was fitted on (in-sample predictions).
y_pred = mdl.predict(X)
y_pred

array([3.63958333, 4.45684524, 5.27410714, 6.09136905, 6.90863095,
       7.72589286, 8.54315476, 9.36041667, 2.37708333, 3.19434524,
       4.01160714, 4.82886905, 5.64613095, 6.46339286, 7.28065476,
       8.09791667])

In [198]:
# Actual salaries, for a side-by-side comparison with the predictions.
y

0      3.0
1      4.0
2      5.0
3      6.0
4      7.0
5      8.0
6      9.0
7     10.0
8      3.0
9      3.4
10     4.0
11     5.2
12     6.1
13     6.4
14     6.7
15     7.1
Name: salary, dtype: float64

In [199]:
from sklearn.metrics import r2_score
# R² of the in-sample predictions against the actual salaries.
r2_score(y,y_pred)

0.9467572768422272

---

## `Reversing the labels, to check that it gives the same result`

In [200]:
# Rebuild the same 16-row toy dataset from scratch so the reversed encoding
# starts from an untouched copy (8 male rows followed by 8 female rows).
data = {
    "salary": [3, 4, 5, 6, 7, 8, 9, 10, 3, 3.4, 4, 5.2, 6.1, 6.4, 6.7, 7.1],
    "experience": [1, 2, 3, 4, 5, 6, 7, 8] * 2,
    "Gender": ["M"] * 8 + ["F"] * 8,
}

### `Label encoding`

In [201]:
# Reversed integer codes: male -> 1, female -> 0.
label_encoder = dict(M=1, F=0)

In [202]:
# Convert the freshly rebuilt dict into a DataFrame (one column per key).
# Note: this rebinds `data` from a dict to a DataFrame.
data = pd.DataFrame(data)

In [203]:
# Apply the reversed mapping to every gender label; a label that is missing
# from the mapping falls back to the code 2.
data["Gender_encoded"] = data["Gender"].apply(
    lambda label: label_encoder.get(label, 2)
)

In [204]:
# Inspect the frame; Gender_encoded now uses the reversed codes.
data

Unnamed: 0,salary,experience,Gender,Gender_encoded
0,3.0,1,M,1
1,4.0,2,M,1
2,5.0,3,M,1
3,6.0,4,M,1
4,7.0,5,M,1
5,8.0,6,M,1
6,9.0,7,M,1
7,10.0,8,M,1
8,3.0,1,F,0
9,3.4,2,F,0


In [205]:
# Target and feature matrix, rebuilt from the frame that uses the reversed
# gender codes.
y = data["salary"]
X = data[["experience", "Gender_encoded"]]

In [206]:
from sklearn.linear_model import LinearRegression
# Rebinding mdl replaces the model that was fitted with the M=0 / F=1 encoding.
mdl = LinearRegression()
# fit() is the cell's last expression, so the fitted estimator is displayed.
mdl.fit(X,y)

LinearRegression()

In [207]:
# coef_ holds one coefficient per column of X, in column order:
# [experience, Gender_encoded].
print('Intercept:', mdl.intercept_)
print('Slope:', mdl.coef_)

Intercept: 1.5598214285714285
Slope: [0.8172619 1.2625   ]


In [208]:
# In-sample predictions from the model fitted with the reversed encoding.
y_pred = mdl.predict(X)
y_pred

array([3.63958333, 4.45684524, 5.27410714, 6.09136905, 6.90863095,
       7.72589286, 8.54315476, 9.36041667, 2.37708333, 3.19434524,
       4.01160714, 4.82886905, 5.64613095, 6.46339286, 7.28065476,
       8.09791667])

In [209]:
# Actual salaries, for a side-by-side comparison with the predictions.
y

0      3.0
1      4.0
2      5.0
3      6.0
4      7.0
5      8.0
6      9.0
7     10.0
8      3.0
9      3.4
10     4.0
11     5.2
12     6.1
13     6.4
14     6.7
15     7.1
Name: salary, dtype: float64

In [210]:
from sklearn.metrics import r2_score
# R² of the in-sample predictions for the reversed-encoding model.
r2_score(y,y_pred)

0.9467572768422272

---

### `Comparing both: do they predict the same value?`

### Here M is 0 and F is 1
- x1 is experience 
- x2 is Gender
> - Prediction for experience 1 and gender male 

In [211]:
# Fixed inputs instead of input(): the notebook must run unattended
# (Restart & Run All) and always reproduce the same comparison.
x1 = 1.0  # experience
x2 = 0.0  # encoded gender: male is 0 under the M=0 / F=1 encoding

# Intercept and coefficients are copied from the printed output of the model
# fitted with M=0 / F=1. The experience coefficient is the rounded printed
# value, so the result matches the model's prediction only to about 8 digits.
y_pred = 2.8223214285714295 + 0.8172619 * x1 + -1.2625 * x2
print(f'The predicted value y_pred={y_pred}')

Enter the value of x1 1
Enter the value of x2 0
The predicted value y_pred=3.6395833285714296


### Here M is 1 and F is 0
- x1 is experience
- x2 is Gender
> - Prediction for experience 1 and gender male

In [212]:
# Fixed inputs instead of input(): the notebook must run unattended
# (Restart & Run All) and always reproduce the same comparison.
x11 = 1.0  # experience
x22 = 1.0  # encoded gender: male is 1 under the M=1 / F=0 encoding

# Intercept and coefficients are copied from the printed output of the model
# fitted with M=1 / F=0. The experience coefficient is the rounded printed
# value, so the result matches the model's prediction only to about 8 digits.
y_pred = 1.5598214285714285 + 0.8172619 * x11 + 1.2625 * x22
print(f'The predicted value y_pred={y_pred}')

Enter the value of x11 1
Enter the value of x22 1
The predicted value y_pred=3.6395833285714287


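### `Hence the order of label encoding does not matter for a two-category feature: the predictions are the same (only the intercept and the sign of the Gender coefficient change). With three or more categories, label encoding imposes an ordering, so the order would matter`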

## `End ---------------------`