# Linear Regression: Salary prediction for data science jobs

## Loading and exploring dataset

Data Source: https://www.kaggle.com/datasets/hummaamqaasim/jobs-in-data

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import linear_model

In [2]:
# Load the "jobs in data" CSV (path is relative to the notebook's working
# directory) and preview the first rows.
df_all = pd.read_csv('jobs_in_data.csv')
df_all.head()

Unnamed: 0,work_year,job_title,job_category,salary_currency,salary,salary_in_usd,employee_residence,experience_level,employment_type,work_setting,company_location,company_size
0,2023,Data DevOps Engineer,Data Engineering,EUR,88000,95012,Germany,Mid-level,Full-time,Hybrid,Germany,L
1,2023,Data Architect,Data Architecture and Modeling,USD,186000,186000,United States,Senior,Full-time,In-person,United States,M
2,2023,Data Architect,Data Architecture and Modeling,USD,81800,81800,United States,Senior,Full-time,In-person,United States,M
3,2023,Data Scientist,Data Science and Research,USD,212000,212000,United States,Senior,Full-time,In-person,United States,M
4,2023,Data Scientist,Data Science and Research,USD,93300,93300,United States,Senior,Full-time,In-person,United States,M


In [3]:
# Size of the raw dataset as (rows, columns).
df_all.shape

(9355, 12)

In [4]:
# Column names available in the raw dataset.
df_all.columns

Index(['work_year', 'job_title', 'job_category', 'salary_currency', 'salary',
       'salary_in_usd', 'employee_residence', 'experience_level',
       'employment_type', 'work_setting', 'company_location', 'company_size'],
      dtype='object')

In [5]:
# Number of missing values in each column.
df_all.isna().sum()

work_year             0
job_title             0
job_category          0
salary_currency       0
salary                0
salary_in_usd         0
employee_residence    0
experience_level      0
employment_type       0
work_setting          0
company_location      0
company_size          0
dtype: int64

In [6]:
# Per-column dtype and non-null count summary of the raw dataset.
df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9355 entries, 0 to 9354
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   work_year           9355 non-null   int64 
 1   job_title           9355 non-null   object
 2   job_category        9355 non-null   object
 3   salary_currency     9355 non-null   object
 4   salary              9355 non-null   int64 
 5   salary_in_usd       9355 non-null   int64 
 6   employee_residence  9355 non-null   object
 7   experience_level    9355 non-null   object
 8   employment_type     9355 non-null   object
 9   work_setting        9355 non-null   object
 10  company_location    9355 non-null   object
 11  company_size        9355 non-null   object
dtypes: int64(3), object(9)
memory usage: 877.2+ KB


## Linear Regression Analysis

In [7]:
# Preview the raw dataset again before filtering it down.
df_all.head()

Unnamed: 0,work_year,job_title,job_category,salary_currency,salary,salary_in_usd,employee_residence,experience_level,employment_type,work_setting,company_location,company_size
0,2023,Data DevOps Engineer,Data Engineering,EUR,88000,95012,Germany,Mid-level,Full-time,Hybrid,Germany,L
1,2023,Data Architect,Data Architecture and Modeling,USD,186000,186000,United States,Senior,Full-time,In-person,United States,M
2,2023,Data Architect,Data Architecture and Modeling,USD,81800,81800,United States,Senior,Full-time,In-person,United States,M
3,2023,Data Scientist,Data Science and Research,USD,212000,212000,United States,Senior,Full-time,In-person,United States,M
4,2023,Data Scientist,Data Science and Research,USD,93300,93300,United States,Senior,Full-time,In-person,United States,M


In [8]:
# Keep only the rows whose job title is exactly 'Data Scientist'.
is_data_scientist = df_all['job_title'].eq('Data Scientist')
df = df_all[is_data_scientist]
df.head()

Unnamed: 0,work_year,job_title,job_category,salary_currency,salary,salary_in_usd,employee_residence,experience_level,employment_type,work_setting,company_location,company_size
3,2023,Data Scientist,Data Science and Research,USD,212000,212000,United States,Senior,Full-time,In-person,United States,M
4,2023,Data Scientist,Data Science and Research,USD,93300,93300,United States,Senior,Full-time,In-person,United States,M
5,2023,Data Scientist,Data Science and Research,USD,130000,130000,United States,Senior,Full-time,Remote,United States,M
6,2023,Data Scientist,Data Science and Research,USD,100000,100000,United States,Senior,Full-time,Remote,United States,M
13,2023,Data Scientist,Data Science and Research,GBP,35000,43064,United Kingdom,Mid-level,Full-time,In-person,United Kingdom,M


In [9]:
# Keep only full-time positions.
# The explicit .copy() makes `df` an independent frame instead of a filtered
# slice, so the column assignments in the mapping cells below no longer raise
# SettingWithCopyWarning.
df = df[df['employment_type'] == 'Full-time'].copy()
df.head()

Unnamed: 0,work_year,job_title,job_category,salary_currency,salary,salary_in_usd,employee_residence,experience_level,employment_type,work_setting,company_location,company_size
3,2023,Data Scientist,Data Science and Research,USD,212000,212000,United States,Senior,Full-time,In-person,United States,M
4,2023,Data Scientist,Data Science and Research,USD,93300,93300,United States,Senior,Full-time,In-person,United States,M
5,2023,Data Scientist,Data Science and Research,USD,130000,130000,United States,Senior,Full-time,Remote,United States,M
6,2023,Data Scientist,Data Science and Research,USD,100000,100000,United States,Senior,Full-time,Remote,United States,M
13,2023,Data Scientist,Data Science and Research,GBP,35000,43064,United Kingdom,Mid-level,Full-time,In-person,United Kingdom,M


In [10]:
# Preview the filtered frame (full-time Data Scientist rows) before encoding.
df.head()

Unnamed: 0,work_year,job_title,job_category,salary_currency,salary,salary_in_usd,employee_residence,experience_level,employment_type,work_setting,company_location,company_size
3,2023,Data Scientist,Data Science and Research,USD,212000,212000,United States,Senior,Full-time,In-person,United States,M
4,2023,Data Scientist,Data Science and Research,USD,93300,93300,United States,Senior,Full-time,In-person,United States,M
5,2023,Data Scientist,Data Science and Research,USD,130000,130000,United States,Senior,Full-time,Remote,United States,M
6,2023,Data Scientist,Data Science and Research,USD,100000,100000,United States,Senior,Full-time,Remote,United States,M
13,2023,Data Scientist,Data Science and Research,GBP,35000,43064,United Kingdom,Mid-level,Full-time,In-person,United Kingdom,M


### Mapping the values of 5 categorical columns ('employee_residence', 'experience_level', 'work_setting', 'company_location', 'company_size') to integer codes ('work_year' is already numeric and is used as-is):

In [11]:
# Column names of the filtered frame.
df.columns

Index(['work_year', 'job_title', 'job_category', 'salary_currency', 'salary',
       'salary_in_usd', 'employee_residence', 'experience_level',
       'employment_type', 'work_setting', 'company_location', 'company_size'],
      dtype='object')

In [12]:
# Distinct countries of residence; each one is given an integer code in the next cell.
df['employee_residence'].unique()

array(['United States', 'United Kingdom', 'Spain', 'Ireland', 'Canada',
       'Netherlands', 'Greece', 'Latvia', 'Turkey', 'Italy', 'Andorra',
       'Ecuador', 'India', 'Uzbekistan', 'France', 'Switzerland',
       'Germany', 'Austria', 'Brazil', 'Cyprus', 'Romania', 'Mexico',
       'Australia', 'Belgium', 'Japan', 'Poland', 'Russia', 'Nigeria',
       'Philippines', 'Serbia', 'Singapore'], dtype=object)

In [13]:
# Integer code for each country of residence, numbered in the order returned
# by df['employee_residence'].unique().
mymap_1 = {'United States':1, 'United Kingdom':2, 'Spain':3, 'Ireland':4, 'Canada':5,
       'Netherlands':6, 'Greece':7, 'Latvia':8, 'Turkey':9, 'Italy':10, 'Andorra':11,
       'Ecuador':12, 'India':13, 'Uzbekistan':14, 'France':15, 'Switzerland':16,
       'Germany':17, 'Austria':18, 'Brazil':19, 'Cyprus':20, 'Romania':21, 'Mexico':22,
       'Australia':23, 'Belgium':24, 'Japan':25, 'Poland':26, 'Russia':27, 'Nigeria':28,
       'Philippines':29, 'Serbia':30, 'Singapore':31}

# Series.map replaces the deprecated DataFrame.applymap; values missing from
# the mapping pass through unchanged. assign() returns a new frame, so nothing
# is written onto a filtered slice (no SettingWithCopyWarning).
df = df.assign(
    employee_residence=df['employee_residence'].map(lambda s: mymap_1.get(s, s))
)

df.head()

  df['employee_residence'] = df[['employee_residence']].applymap(lambda s: mymap_1.get(s) if s in mymap_1 else s)


Unnamed: 0,work_year,job_title,job_category,salary_currency,salary,salary_in_usd,employee_residence,experience_level,employment_type,work_setting,company_location,company_size
3,2023,Data Scientist,Data Science and Research,USD,212000,212000,1,Senior,Full-time,In-person,United States,M
4,2023,Data Scientist,Data Science and Research,USD,93300,93300,1,Senior,Full-time,In-person,United States,M
5,2023,Data Scientist,Data Science and Research,USD,130000,130000,1,Senior,Full-time,Remote,United States,M
6,2023,Data Scientist,Data Science and Research,USD,100000,100000,1,Senior,Full-time,Remote,United States,M
13,2023,Data Scientist,Data Science and Research,GBP,35000,43064,2,Mid-level,Full-time,In-person,United Kingdom,M


In [14]:
# Distinct experience levels; each one is given an integer code in the next cell.
df['experience_level'].unique()

array(['Senior', 'Mid-level', 'Entry-level', 'Executive'], dtype=object)

In [15]:
# Integer code for each experience level, numbered in first-seen order.
# NOTE(review): these codes are not in seniority order (Senior=1 ... Executive=4),
# so the regression cannot read the column as an ordinal scale; consider
# re-ordering or one-hot encoding in a follow-up.
mymap_2 = {'Senior': 1, 'Mid-level': 2, 'Entry-level': 3, 'Executive': 4}

# Series.map replaces the deprecated DataFrame.applymap; values missing from
# the mapping pass through unchanged. assign() returns a new frame, so nothing
# is written onto a filtered slice (no SettingWithCopyWarning).
df = df.assign(
    experience_level=df['experience_level'].map(lambda s: mymap_2.get(s, s))
)

df.head()

  df['experience_level'] = df[['experience_level']].applymap(lambda s: mymap_2.get(s) if s in mymap_2 else s)


Unnamed: 0,work_year,job_title,job_category,salary_currency,salary,salary_in_usd,employee_residence,experience_level,employment_type,work_setting,company_location,company_size
3,2023,Data Scientist,Data Science and Research,USD,212000,212000,1,1,Full-time,In-person,United States,M
4,2023,Data Scientist,Data Science and Research,USD,93300,93300,1,1,Full-time,In-person,United States,M
5,2023,Data Scientist,Data Science and Research,USD,130000,130000,1,1,Full-time,Remote,United States,M
6,2023,Data Scientist,Data Science and Research,USD,100000,100000,1,1,Full-time,Remote,United States,M
13,2023,Data Scientist,Data Science and Research,GBP,35000,43064,2,2,Full-time,In-person,United Kingdom,M


In [16]:
# Distinct work settings; each one is given an integer code in the next cell.
df['work_setting'].unique()

array(['In-person', 'Remote', 'Hybrid'], dtype=object)

In [17]:
# Integer code for each work setting, numbered in first-seen order.
mymap_3 = {'In-person': 1, 'Remote': 2, 'Hybrid': 3}

# Series.map replaces the deprecated DataFrame.applymap; values missing from
# the mapping pass through unchanged. assign() returns a new frame, so nothing
# is written onto a filtered slice (no SettingWithCopyWarning).
df = df.assign(
    work_setting=df['work_setting'].map(lambda s: mymap_3.get(s, s))
)

df.head()

  df['work_setting'] = df[['work_setting']].applymap(lambda s: mymap_3.get(s) if s in mymap_3 else s)


Unnamed: 0,work_year,job_title,job_category,salary_currency,salary,salary_in_usd,employee_residence,experience_level,employment_type,work_setting,company_location,company_size
3,2023,Data Scientist,Data Science and Research,USD,212000,212000,1,1,Full-time,1,United States,M
4,2023,Data Scientist,Data Science and Research,USD,93300,93300,1,1,Full-time,1,United States,M
5,2023,Data Scientist,Data Science and Research,USD,130000,130000,1,1,Full-time,2,United States,M
6,2023,Data Scientist,Data Science and Research,USD,100000,100000,1,1,Full-time,2,United States,M
13,2023,Data Scientist,Data Science and Research,GBP,35000,43064,2,2,Full-time,1,United Kingdom,M


In [18]:
# Distinct company locations; each one is given an integer code in the next cell.
df['company_location'].unique()

array(['United States', 'United Kingdom', 'Spain', 'Ireland', 'Canada',
       'Netherlands', 'Greece', 'Latvia', 'Poland', 'Italy', 'Andorra',
       'Ecuador', 'India', 'Switzerland', 'Germany', 'France', 'Brazil',
       'Estonia', 'Romania', 'Mexico', 'Australia', 'Belgium', 'Turkey',
       'Malaysia', 'Nigeria', 'Austria', 'Luxembourg', 'Israel'],
      dtype=object)

In [19]:
# Integer code for each company location, numbered in the order returned by
# df['company_location'].unique(). These codes differ from the residence codes
# in mymap_1 (for example Poland is 9 here but 26 there).
mymap_4 = {'United States':1, 'United Kingdom':2, 'Spain':3, 'Ireland':4, 'Canada':5,
       'Netherlands':6, 'Greece':7, 'Latvia':8, 'Poland':9, 'Italy':10, 'Andorra':11,
       'Ecuador':12, 'India':13, 'Switzerland':14, 'Germany':15, 'France':16, 'Brazil':17,
       'Estonia':18, 'Romania':19, 'Mexico':20, 'Australia':21, 'Belgium':22, 'Turkey':23,
       'Malaysia':24, 'Nigeria':25, 'Austria':26, 'Luxembourg':27, 'Israel':28}

# Series.map replaces the deprecated DataFrame.applymap; values missing from
# the mapping pass through unchanged. assign() returns a new frame, so nothing
# is written onto a filtered slice (no SettingWithCopyWarning).
df = df.assign(
    company_location=df['company_location'].map(lambda s: mymap_4.get(s, s))
)

df.head()

  df['company_location'] = df[['company_location']].applymap(lambda s: mymap_4.get(s) if s in mymap_4 else s)


Unnamed: 0,work_year,job_title,job_category,salary_currency,salary,salary_in_usd,employee_residence,experience_level,employment_type,work_setting,company_location,company_size
3,2023,Data Scientist,Data Science and Research,USD,212000,212000,1,1,Full-time,1,1,M
4,2023,Data Scientist,Data Science and Research,USD,93300,93300,1,1,Full-time,1,1,M
5,2023,Data Scientist,Data Science and Research,USD,130000,130000,1,1,Full-time,2,1,M
6,2023,Data Scientist,Data Science and Research,USD,100000,100000,1,1,Full-time,2,1,M
13,2023,Data Scientist,Data Science and Research,GBP,35000,43064,2,2,Full-time,1,2,M


In [20]:
# Distinct company sizes; each one is given an integer code in the next cell.
df['company_size'].unique()

array(['M', 'L', 'S'], dtype=object)

In [21]:
# Integer code for each company size, numbered in first-seen order
# (M=1, L=2, S=3 — not ordered by size).
mymap_5 = {'M': 1, 'L': 2, 'S': 3}

# Series.map replaces the deprecated DataFrame.applymap; values missing from
# the mapping pass through unchanged. assign() returns a new frame, so nothing
# is written onto a filtered slice (no SettingWithCopyWarning).
df = df.assign(
    company_size=df['company_size'].map(lambda s: mymap_5.get(s, s))
)

df.head()

  df['company_size'] = df[['company_size']].applymap(lambda s: mymap_5.get(s) if s in mymap_5 else s)


Unnamed: 0,work_year,job_title,job_category,salary_currency,salary,salary_in_usd,employee_residence,experience_level,employment_type,work_setting,company_location,company_size
3,2023,Data Scientist,Data Science and Research,USD,212000,212000,1,1,Full-time,1,1,1
4,2023,Data Scientist,Data Science and Research,USD,93300,93300,1,1,Full-time,1,1,1
5,2023,Data Scientist,Data Science and Research,USD,130000,130000,1,1,Full-time,2,1,1
6,2023,Data Scientist,Data Science and Research,USD,100000,100000,1,1,Full-time,2,1,1
13,2023,Data Scientist,Data Science and Research,GBP,35000,43064,2,2,Full-time,1,2,1


### Linear Regression

In [22]:
# Column names of the encoded frame, for choosing the model inputs.
df.columns

Index(['work_year', 'job_title', 'job_category', 'salary_currency', 'salary',
       'salary_in_usd', 'employee_residence', 'experience_level',
       'employment_type', 'work_setting', 'company_location', 'company_size'],
      dtype='object')

In [23]:
# Model inputs (work year plus the five integer-coded columns) and the target.
# NOTE(review): the coded columns are nominal labels numbered in first-seen
# order, yet the fit treats them as numeric magnitudes — consider one-hot
# encoding in a follow-up.
feature_cols = ['work_year', 'employee_residence', 'experience_level',
                'work_setting', 'company_location', 'company_size']
X_train = df[feature_cols]
y_train = df['salary_in_usd']

# Fit a linear regression of USD salary on the selected features.
linear_regression = linear_model.LinearRegression()
linear_regression.fit(X_train, y_train)

In [24]:
# Fitted parameters: one weight per feature (in the column order passed to
# fit) and the intercept.
w, b = linear_regression.coef_, linear_regression.intercept_

w, b

(array([ 13981.54663119,  -2496.3171379 , -21463.23550619,    567.03343337,
         -2004.73382929,  -5483.46965997]),
 -28083348.259296775)

### Predict 

In [27]:
# Build the query as a one-row DataFrame with the same column names and order
# used in fit(); a bare nested list makes scikit-learn warn that X has no
# valid feature names and hides what each number means.
# Codes: residence 2 = United Kingdom, experience 3 = Entry-level,
# work setting 3 = Hybrid, company location 3 = Spain, company size 2 = L.
query = pd.DataFrame(
    [[2023, 2, 3, 3, 3, 2]],
    columns=['work_year', 'employee_residence', 'experience_level',
             'work_setting', 'company_location', 'company_size'],
)
linear_regression.predict(query)



array([116658.19429658])

The model predicts that a full-time Data Scientist in 2023 who lives in the United Kingdom, has Entry-level experience, works in a Hybrid setting, and is employed by a large company located in Spain would earn an annual salary of about 116,658 USD.

 -- End --