In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
# Load the salary dataset from a CSV file; the path is relative to the
# notebook's working directory.
df = pd.read_csv('sample_data/salaryData.csv')

# Preview the first rows to get a feel for the columns.
df.head()

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0
1,28.0,Female,Master's,Data Analyst,3.0,65000.0
2,45.0,Male,PhD,Senior Manager,15.0,150000.0
3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0
4,52.0,Male,Master's,Director,20.0,200000.0


In [3]:
# Show each column's dtype to see which columns are numeric and which are text.
df.dtypes

Age                    float64
Gender                  object
Education Level         object
Job Title               object
Years of Experience    float64
Salary                 float64
dtype: object

In [4]:
# (rows, columns) of the raw dataset.
df.shape

(375, 6)

In [5]:
# Summary statistics for the numeric columns.
# NOTE(review): the recorded output shows a minimum Salary of 350.0 against a
# 25th percentile of 55000.0 -- possibly a data-entry error; confirm before
# training on it.
df.describe()

Unnamed: 0,Age,Years of Experience,Salary
count,373.0,373.0,373.0
mean,37.431635,10.030831,100577.345845
std,7.069073,6.557007,48240.013482
min,23.0,0.0,350.0
25%,31.0,4.0,55000.0
50%,36.0,9.0,95000.0
75%,44.0,15.0,140000.0
max,53.0,25.0,250000.0


In [6]:
# Count the missing values in every column.
df.isna().sum()

Age                    2
Gender                 2
Education Level        2
Job Title              2
Years of Experience    2
Salary                 2
dtype: int64

In [7]:
# Discard every row that contains at least one missing value.
df = df.dropna()

In [8]:
# Re-check the per-column missing-value counts after dropping incomplete rows.
df.isna().sum()

Age                    0
Gender                 0
Education Level        0
Job Title              0
Years of Experience    0
Salary                 0
dtype: int64

In [9]:
# Count how many rows each job title has, most frequent first.
df['Job Title'].value_counts()

Director of Marketing              12
Director of Operations             11
Senior Business Analyst            10
Senior Marketing Analyst            9
Senior Marketing Manager            9
                                   ..
Business Development Manager        1
Customer Service Representative     1
IT Manager                          1
Digital Marketing Manager           1
Junior Web Developer                1
Name: Job Title, Length: 174, dtype: int64

In [10]:
# Count how many rows fall into each education level.
df['Education Level'].value_counts()

Bachelor's    224
Master's       98
PhD            51
Name: Education Level, dtype: int64

In [11]:
# Keep only the rows whose job title appears at least five times in the data;
# titles with fewer rows are dropped entirely.
df_new = (
    df.groupby('Job Title')
      .filter(lambda title_rows: len(title_rows) >= 5)
)
# Summarise the remaining rows, columns and dtypes.
df_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 147 entries, 47 to 374
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Age                  147 non-null    float64
 1   Gender               147 non-null    object 
 2   Education Level      147 non-null    object 
 3   Job Title            147 non-null    object 
 4   Years of Experience  147 non-null    float64
 5   Salary               147 non-null    float64
dtypes: float64(3), object(3)
memory usage: 8.0+ KB


In [12]:
# Row counts per job title after the rare titles have been filtered out.
df_new['Job Title'].value_counts()

Director of Marketing                    12
Director of Operations                   11
Senior Business Analyst                  10
Senior Marketing Manager                  9
Senior Marketing Analyst                  9
Junior Business Analyst                   8
Senior Data Scientist                     7
Senior Financial Analyst                  7
Senior Project Manager                    7
Junior Business Development Associate     7
Junior Financial Analyst                  7
Junior Marketing Coordinator              6
Senior Product Manager                    6
Senior Software Engineer                  6
Junior Marketing Specialist               5
Junior Project Manager                    5
Senior Product Designer                   5
Junior Operations Analyst                 5
Senior Operations Manager                 5
Senior Project Coordinator                5
Senior Financial Manager                  5
Name: Job Title, dtype: int64

In [13]:
# Convert 'Job Title' to a categorical dtype and store the integer category
# code of each row in a new 'Job Title Encoded' column.
# NOTE(review): the codes are arbitrary integer labels for the titles, yet a
# linear model will treat them as an ordered numeric scale -- consider one-hot
# encoding instead; confirm with the modelling intent.
df_new['Job Title'] = df_new['Job Title'].astype('category')
df_new['Job Title Encoded'] = df_new['Job Title'].cat.codes
df_new.head()

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary,Job Title Encoded
47,45.0,Female,Master's,Director of Marketing,16.0,180000.0,0
60,51.0,Female,Master's,Director of Operations,23.0,170000.0,1
63,47.0,Male,PhD,Senior Data Scientist,21.0,180000.0,10
69,49.0,Female,Master's,Senior Financial Analyst,18.0,150000.0,11
77,32.0,Male,Master's,Senior Software Engineer,6.0,100000.0,20


In [14]:
# Encode the remaining text columns the same way as the job title: cast each
# one to a categorical dtype and keep its integer codes in '<name> Encoded'.
for col_name in ('Education Level', 'Gender'):
    df_new[col_name] = df_new[col_name].astype('category')
    df_new[col_name + ' Encoded'] = df_new[col_name].cat.codes

# The model only consumes numeric columns, so drop the original text columns.
df_final = df_new.drop(columns=['Gender', 'Education Level', 'Job Title'])
df_final.head()

Unnamed: 0,Age,Years of Experience,Salary,Job Title Encoded,Education Level Encoded,Gender Encoded
47,45.0,16.0,180000.0,0,1,0
60,51.0,23.0,170000.0,1,1,0
63,47.0,21.0,180000.0,10,2,1
69,49.0,18.0,150000.0,11,1,0
77,32.0,6.0,100000.0,20,1,1


In [15]:
# Select the features and the target by column name rather than by position,
# so the selection stays correct (and readable) if the column order of
# df_final ever changes.
X = df_final[[
    'Age',
    'Years of Experience',
    'Job Title Encoded',
    'Education Level Encoded',
    'Gender Encoded',
]]
# Target: the salary to be predicted.
y = df_final['Salary']

In [16]:
# Hold out 20% of the rows for evaluation. Fixing random_state makes the split,
# and therefore the fitted model and the reported error, reproducible across
# kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [17]:
# Fit an ordinary least-squares linear regression on the training split.
reg = LinearRegression()
reg.fit(X_train, y_train)

In [18]:
# Predict the target for the held-out test features.
y_pred = reg.predict(X_test)

In [19]:
# Root-mean-squared error between the true and predicted test targets.
err = np.sqrt(mean_squared_error(y_test, y_pred))

In [21]:
# Display the test-set RMSE computed above.
err

12728.77499628054

In [20]:
# Persist the fitted model to disk. The context manager guarantees the file
# handle is flushed and closed, even if pickling raises.
# NOTE(review): only the estimator is saved; the category-to-code mappings
# used to build the encoded features are not persisted by this cell.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(reg, model_file)