# Problem Statement

To predict the salary of the Employees based on the given features.

# Features / Independent Variables

1. rank
2. discipline
3. yrs.since.phd
4. yrs.service
5. sex

# Label / Dependent Variable

 salary

# Importing Necessary Libraries

In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn import metrics

import warnings
warnings.filterwarnings('ignore')

#  Read csv file and convert into DataFrame

In [5]:
df = pd.read_csv("https://raw.githubusercontent.com/dsrscientist/dataset3/main/Salaries.csv")
df

Unnamed: 0,rank,discipline,yrs.since.phd,yrs.service,sex,salary
0,Prof,B,19,18,Male,139750
1,Prof,B,20,16,Male,173200
2,AsstProf,B,4,3,Male,79750
3,Prof,B,45,39,Male,115000
4,Prof,B,40,41,Male,141500
...,...,...,...,...,...,...
392,Prof,A,33,30,Male,103106
393,Prof,A,31,19,Male,150564
394,Prof,A,42,25,Male,101738
395,Prof,A,25,15,Male,95329


# Checking for shape of dataset

In [6]:
df.shape

(397, 6)

# Top 5 Rows of Dataset

In [7]:
df.head()

Unnamed: 0,rank,discipline,yrs.since.phd,yrs.service,sex,salary
0,Prof,B,19,18,Male,139750
1,Prof,B,20,16,Male,173200
2,AsstProf,B,4,3,Male,79750
3,Prof,B,45,39,Male,115000
4,Prof,B,40,41,Male,141500


# Bottom 5 rows of Dataset

In [8]:
df.tail()

Unnamed: 0,rank,discipline,yrs.since.phd,yrs.service,sex,salary
392,Prof,A,33,30,Male,103106
393,Prof,A,31,19,Male,150564
394,Prof,A,42,25,Male,101738
395,Prof,A,25,15,Male,95329
396,AsstProf,A,8,4,Male,81035


# Understanding the data at high level

In [9]:
df.describe()

Unnamed: 0,yrs.since.phd,yrs.service,salary
count,397.0,397.0,397.0
mean,22.314861,17.61461,113706.458438
std,12.887003,13.006024,30289.038695
min,1.0,0.0,57800.0
25%,12.0,7.0,91000.0
50%,21.0,16.0,107300.0
75%,32.0,27.0,134185.0
max,56.0,60.0,231545.0


# Getting More Info about Dataset

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397 entries, 0 to 396
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   rank           397 non-null    object
 1   discipline     397 non-null    object
 2   yrs.since.phd  397 non-null    int64 
 3   yrs.service    397 non-null    int64 
 4   sex            397 non-null    object
 5   salary         397 non-null    int64 
dtypes: int64(3), object(3)
memory usage: 18.7+ KB


# Checking for null values

In [11]:
df.isnull().sum()

rank             0
discipline       0
yrs.since.phd    0
yrs.service      0
sex              0
salary           0
dtype: int64

The above code shows that there are no null values present in the dataset

# Distribution of Categorical data

In [12]:
print(df['rank'].value_counts())
print(df['discipline'].value_counts())
print(df['sex'].value_counts())

Prof         266
AsstProf      67
AssocProf     64
Name: rank, dtype: int64
B    216
A    181
Name: discipline, dtype: int64
Male      358
Female     39
Name: sex, dtype: int64


# Encoding the Categorical data

In [13]:
# Encoding rank
df.replace({'rank':{'AsstProf':0,'AssoProf':1,'Prof':2}},inplace = True)

# Encoding discipline
df.replace({'discipline':{'A':0,'B':1}},inplace = True)

# Encoding sex
df.replace({'sex':{'Female':0,'Male':1}},inplace = True)


# Checking whether encoding is successfull or not

In [14]:
df.head()

Unnamed: 0,rank,discipline,yrs.since.phd,yrs.service,sex,salary
0,2,1,19,18,1,139750
1,2,1,20,16,1,173200
2,0,1,4,3,1,79750
3,2,1,45,39,1,115000
4,2,1,40,41,1,141500


# Dividing dataset into Features and Label

In [15]:
x = df.drop(columns = ['rank','salary'])
y = df['salary']

In [16]:
x

Unnamed: 0,discipline,yrs.since.phd,yrs.service,sex
0,1,19,18,1
1,1,20,16,1
2,1,4,3,1
3,1,45,39,1
4,1,40,41,1
...,...,...,...,...
392,0,33,30,1
393,0,31,19,1
394,0,42,25,1
395,0,25,15,1


In [17]:
y

0      139750
1      173200
2       79750
3      115000
4      141500
        ...  
392    103106
393    150564
394    101738
395     95329
396     81035
Name: salary, Length: 397, dtype: int64

In [18]:
from sklearn.preprocessing import StandardScaler

In [20]:
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(x),columns = x.columns)

# Splitting data into training and test data

In [24]:
# splitting training and test data
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.30, random_state = 434)

# Training the Model

In [25]:
# Loading Linear Regression model
lr = LinearRegression()

In [26]:
lr.fit(x_train, y_train)

LinearRegression()

# Model Evaluation

In [28]:
y_predict = lr.predict(X_test)
y_predict

array([84317.78416586, 58026.75025394, 85851.24291628, 57818.50241512,
       83883.05002753, 84379.65786488, 57483.73714231, 85444.67559154,
       55222.79597397, 87839.29251084, 59259.150617  , 57818.50241512,
       28830.08703228, 86581.49537012, 84665.24782749, 59302.78585534,
       59004.4975039 , 56436.95787634, 85814.76599491, 84935.36936535,
       57998.58344035, 87661.98152153, 58389.68234034, 85009.94145321,
       84034.96423917, 86751.64804245, 59771.22687911, 55387.40857445,
       59395.59640388, 59783.92526794, 85038.1082668 , 59852.95728395,
       55657.5301123 , 84770.75676487, 59392.82636796, 87960.26987298,
       86360.54914247, 84544.27046536, 58734.37596605, 84466.92834158,
       58675.27230295, 83674.8021887 , 55747.57062492, 85892.1081187 ,
       57399.23670154, 28408.05128279, 58443.24593159, 86627.9006444 ,
       60038.57838104, 54015.23801512, 56017.69216277, 84665.24782749,
       55402.8769992 , 85149.15727602, 56393.322638  , 56539.69677779,
      

In [30]:
print(lr.coef_)

[14155.20198468  2118.68970083  -968.66405893  9072.68232292]


In [31]:
print(lr.intercept_)

69632.11701165518


In [32]:
from sklearn.metrics import r2_score

In [33]:
r2_score(y_test,y_predict)

-2.15196456807175

# Interpretation

Our model does not fit the data good