In [13]:
# Import Pandas Library, used for data manipulation
# Import matplotlib, used to plot our data
# Import numpy for mathemtical operations
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [15]:
df=pd.read_csv("./dataset/SalaryDataComplete.csv")

In [17]:
df.shape

(375, 6)

In [19]:
df.head()

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0
1,28.0,Female,Master's,Data Analyst,3.0,65000.0
2,45.0,Male,PhD,Senior Manager,15.0,150000.0
3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0
4,52.0,Male,Master's,Director,20.0,200000.0


In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 375 entries, 0 to 374
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Age                  373 non-null    float64
 1   Gender               373 non-null    object 
 2   Education Level      373 non-null    object 
 3   Job Title            373 non-null    object 
 4   Years of Experience  373 non-null    float64
 5   Salary               373 non-null    float64
dtypes: float64(3), object(3)
memory usage: 17.7+ KB


In [23]:
df.describe()

Unnamed: 0,Age,Years of Experience,Salary
count,373.0,373.0,373.0
mean,37.431635,10.030831,100577.345845
std,7.069073,6.557007,48240.013482
min,23.0,0.0,350.0
25%,31.0,4.0,55000.0
50%,36.0,9.0,95000.0
75%,44.0,15.0,140000.0
max,53.0,25.0,250000.0


In [25]:
education=df['Education Level'].unique()
job=df['Job Title'].unique()
print(df['Education Level'].unique())  
print(df['Job Title'].unique())

["Bachelor's" "Master's" 'PhD' nan]
['Software Engineer' 'Data Analyst' 'Senior Manager' 'Sales Associate'
 'Director' 'Marketing Analyst' 'Product Manager' 'Sales Manager'
 'Marketing Coordinator' 'Senior Scientist' 'Software Developer'
 'HR Manager' 'Financial Analyst' 'Project Manager' 'Customer Service Rep'
 'Operations Manager' 'Marketing Manager' 'Senior Engineer'
 'Data Entry Clerk' 'Sales Director' 'Business Analyst' 'VP of Operations'
 'IT Support' 'Recruiter' 'Financial Manager' 'Social Media Specialist'
 'Software Manager' 'Junior Developer' 'Senior Consultant'
 'Product Designer' 'CEO' 'Accountant' 'Data Scientist'
 'Marketing Specialist' 'Technical Writer' 'HR Generalist'
 'Project Engineer' 'Customer Success Rep' 'Sales Executive' 'UX Designer'
 'Operations Director' 'Network Engineer' 'Administrative Assistant'
 'Strategy Consultant' 'Copywriter' 'Account Manager'
 'Director of Marketing' 'Help Desk Analyst' 'Customer Service Manager'
 'Business Intelligence Analyst' 'Ev

In [27]:
df.fillna({'Education Level': df['Education Level'].mode()[0]}, inplace=True)

In [29]:
df.fillna({'Years of Experience':df['Years of Experience'].mean()},inplace=True)

In [31]:
df.fillna({'Salary':df['Salary'].mean()},inplace=True)

In [33]:
# Initialize LabelEncoder
label_encoder_edu = LabelEncoder()
label_encoder_job = LabelEncoder()
# Encode Categorical Columns
df['Education Level'] = label_encoder_edu.fit_transform(df['Education Level'])
df['Job Title'] = label_encoder_job.fit_transform(df['Job Title'])

# Print Encoded DataFrame
print(df.head())

    Age  Gender  Education Level  Job Title  Years of Experience    Salary
0  32.0    Male                0        159                  5.0   90000.0
1  28.0  Female                1         17                  3.0   65000.0
2  45.0    Male                2        130                 15.0  150000.0
3  36.0  Female                0        101                  7.0   60000.0
4  52.0    Male                1         22                 20.0  200000.0


In [35]:
dict(zip(label_encoder_edu.classes_,label_encoder_edu.transform(label_encoder_edu.classes_)))

{"Bachelor's": 0, "Master's": 1, 'PhD': 2}

In [37]:
edu_mapping = dict(zip(education, label_encoder_edu.transform(label_encoder_edu.classes_)))
job_mapping = dict(zip(job, label_encoder_job.transform(label_encoder_job.classes_)))

print("\nEducation Level Mapping:", edu_mapping)
print("Job Title Mapping:", job_mapping)


Education Level Mapping: {"Bachelor's": 0, "Master's": 1, 'PhD': 2}
Job Title Mapping: {'Software Engineer': 0, 'Data Analyst': 1, 'Senior Manager': 2, 'Sales Associate': 3, 'Director': 4, 'Marketing Analyst': 5, 'Product Manager': 6, 'Sales Manager': 7, 'Marketing Coordinator': 8, 'Senior Scientist': 9, 'Software Developer': 10, 'HR Manager': 11, 'Financial Analyst': 12, 'Project Manager': 13, 'Customer Service Rep': 14, 'Operations Manager': 15, 'Marketing Manager': 16, 'Senior Engineer': 17, 'Data Entry Clerk': 18, 'Sales Director': 19, 'Business Analyst': 20, 'VP of Operations': 21, 'IT Support': 22, 'Recruiter': 23, 'Financial Manager': 24, 'Social Media Specialist': 25, 'Software Manager': 26, 'Junior Developer': 27, 'Senior Consultant': 28, 'Product Designer': 29, 'CEO': 30, 'Accountant': 31, 'Data Scientist': 32, 'Marketing Specialist': 33, 'Technical Writer': 34, 'HR Generalist': 35, 'Project Engineer': 36, 'Customer Success Rep': 37, 'Sales Executive': 38, 'UX Designer': 39,

In [39]:
df.columns


Index(['Age', 'Gender', 'Education Level', 'Job Title', 'Years of Experience',
       'Salary'],
      dtype='object')

In [41]:
X=df[["Education Level","Job Title","Years of Experience"]]
y=df["Salary"]

In [43]:
print(X.isnull().sum())  # Shows count of missing values per column

Education Level        0
Job Title              0
Years of Experience    0
dtype: int64


In [45]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Apply Feature Scaling using StandardScaler
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [47]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

In [49]:
# Prediction result
y_pred_test = regressor.predict(X_test)     # predicted value of y_test
y_pred_train = regressor.predict(X_train)   # predicted value of y_train

In [51]:
# Regressor coefficients and intercept
print(f'Coefficient: {regressor.coef_}')
print(f'Intercept: {regressor.intercept_}')

Coefficient: [12155.37093403    31.8152662   6040.66768154]
Intercept: 30560.88066558128


In [53]:
y_pred_test

array([ 62864.02664279, 107312.13851549, 149903.95975919, 101122.79954148,
        76313.41845268, 161370.40002583, 164484.06321652,  82385.90140042,
       166287.12835165, 104130.611895  , 164356.8021517 ,  44646.57779957,
       182606.06626113,  83181.28305555, 106803.09425622,  89476.47286672,
        49891.86382598,  56632.46736403,  44869.284663  ,  82682.64383481,
       155181.06105181, 130308.04943061, 125752.29422216, 101918.1811966 ,
        44201.1640727 , 164865.84641098, 107789.36750857,  44232.9793389 ,
       118163.08353512,  61379.11416971,  82672.23879627,  69413.73858361,
       103791.04900529, 125221.83973522, 162070.33588234, 158411.58026877,
        50337.27755285,  53845.24542524,  65865.0667046 , 114254.03868931,
       127635.5670694 , 170492.91563185,  41212.64549813, 148981.31703925,
        42960.36869071, 130043.12226242,  83181.28305555, 158443.39553498,
        45219.25259125,  91168.85411673,  50750.87601351,  58032.33907704,
       144202.85496736, 1

In [57]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# Evaluate Model Performance
mae = mean_absolute_error(y_test, y_pred_test)
mse = mean_squared_error(y_test, y_pred_test)
r2 = r2_score(y_test, y_pred_test)

# Print Model Evaluation Metrics
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R² Score: {r2:.2f}")
rmse = np.sqrt(mse)
print(f"RMSE: {rmse:.2f}")


Mean Absolute Error (MAE): 9548.21
Mean Squared Error (MSE): 151045220.90
R² Score: 0.92
RMSE: 12290.05
