In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer #combine the column


In [2]:
# Toy salary dataset: ten employees with years of experience, highest degree
# and salary. Row 5 deliberately has both features missing so the imputers
# in the preprocessing pipeline have something to fill.
data = pd.DataFrame(
    dict(
        Experience=[1, 2, 3, 4, 5, np.nan, 7, 8, 9, 10],
        Education=[
            "Bachelor's", "Bachelor's", "Master's", "PhD", "Bachelor's",
            np.nan, "Bachelor's", "Master's", "PhD", "Bachelor's",
        ],
        Salary=[
            35000, 40000, 45000, 55000, 50000,
            50000, 62000, 70000, 75000, 80000,
        ],
    )
)
data

Unnamed: 0,Experience,Education,Salary
0,1.0,Bachelor's,35000
1,2.0,Bachelor's,40000
2,3.0,Master's,45000
3,4.0,PhD,55000
4,5.0,Bachelor's,50000
5,,,50000
6,7.0,Bachelor's,62000
7,8.0,Master's,70000
8,9.0,PhD,75000
9,10.0,Bachelor's,80000


In [3]:
# Feature matrix: the two predictor columns, in the order the preprocessing
# step refers to them by name.
x = data.loc[:, ["Experience", "Education"]]
# Regression target.
y = data.loc[:, "Salary"]

In [4]:
# SimpleImputer strategies to choose from: mean, median, most_frequent, constant

num_cols = ["Experience"]
cat_cols = ["Education"]

# Numeric branch: replace missing values with the column mean, then
# standardise the column.
numeric = Pipeline([
    ("i", SimpleImputer(strategy="mean")),
    ("s", StandardScaler()),
])

# Categorical branch: replace missing labels with the most frequent one,
# then turn the strings into one-hot indicator columns. Labels that were not
# present when the encoder was fitted are ignored instead of raising.
categorical = Pipeline([
    ("i", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore")),
])

In [5]:
# Route each group of columns through its own preprocessing branch and
# concatenate the results into one feature matrix.
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric, num_cols),
        ("cat", categorical, cat_cols),
    ]
)

# Full estimator: preprocessing followed by an ordinary least-squares fit, so
# the same transformations are applied at fit and predict time.
model = Pipeline([
    ("prep", preprocess),
    ("model", LinearRegression()),
])

In [6]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit imputers, scaler, encoder and regressor on the training rows only.
model.fit(x_train, y_train)

In [7]:
# Predict salaries for the held-out rows; the bare name on the last line
# displays the resulting array as the cell output.
y_pred = model.predict(x_test)
y_pred

array([80119.49987091, 36883.34009516])

In [8]:
#evaluate the model
# Compare the held-out salaries with the predictions. The hold-out set is
# tiny (test_size=0.2 of a ten-row frame), so treat these numbers as a rough
# sanity check rather than a reliable estimate of generalisation error.

mae = mean_absolute_error(y_test, y_pred)   # mean absolute error
mse = mean_squared_error(y_test, y_pred)    # mean squared error
rmse = np.sqrt(mse)                         # root of the MSE, same units as Salary
r2 = r2_score(y_test, y_pred)               # coefficient of determination

print("mae", round(mae, 2))
print("mse", round(mse, 2))
print("rmse", round(rmse, 2))  # label previously misspelled as "rme"
print("r2", round(r2, 2))

mae 4118.08
mse 17961423.95
rme 4238.09
r2 0.94


In [17]:
# predict for new employee
# Read the two features interactively. Experience is parsed as a float so
# fractional years (e.g. "6.5") are accepted, matching the float column the
# model was trained on; whole numbers such as "6" behave exactly as before.
exp = float(input("Enter the experience:"))
# Strip stray whitespace so " Bachelor's " still matches a trained label.
edu = input("Enter the education:").strip()

# The encoder was built with handle_unknown="ignore", so a label it never saw
# during fit is encoded as all zeros and the model still returns a number.
# Warn explicitly so a typo does not pass as a valid prediction.
known_levels = list(
    model.named_steps["prep"]
    .named_transformers_["cat"]
    .named_steps["ohe"]
    .categories_[0]
)
if edu not in known_levels:
    print("Warning: unknown education", repr(edu), "- expected one of", known_levels)

# Single-row frame with the same column names used for training.
new_employee = pd.DataFrame({"Experience": [exp], "Education": [edu]})
predicted_salary = model.predict(new_employee)
print("Predicted Salary:", round(predicted_salary[0], 2))

Enter the experience:6
Enter the education:Bachelor's
Predicted Salary: 56978.94
