# Importing Libraries

In [72]:
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

# Visualizing Data

In [74]:
# reading in data
data = pd.read_csv('Salary_Data.csv')

#visualizing
data.head()
data.info()
data.describe()
data.columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 375 entries, 0 to 374
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Age                  373 non-null    float64
 1   Gender               373 non-null    object 
 2   Education Level      373 non-null    object 
 3   Job Title            373 non-null    object 
 4   Years of Experience  373 non-null    float64
 5   Salary               373 non-null    float64
dtypes: float64(3), object(3)
memory usage: 17.7+ KB


Index(['Age', 'Gender', 'Education Level', 'Job Title', 'Years of Experience',
       'Salary'],
      dtype='object')

# Data Cleanup

In [53]:
# removing duplicates - inplace means it will automatically overwrite the dataset
data.drop_duplicates(inplace=True)

# removing rows with NaN/Empty
data.dropna()

# removing salaries that are less than 0
data = data[data['Salary'] >= 0]

# One Hot Encoding

In [None]:
# one hot encoded data
one_hot_encoded_data = pd.get_dummies(data)
one_hot_encoded_data.head()

# Graphs


In [55]:
#plotting salary vs. count
plot = px.histogram(data, x = "Salary", color_discrete_sequence=['mediumorchid'])
plot.show()

In [56]:
#plotting salary vs. gender
plot = px.box(data, x = "Salary", y = "Gender", color_discrete_sequence=['green'])
plot.show()

In [57]:
# plotting salary vs. age
plot = px.scatter(data, x = "Salary", y = "Age", color_discrete_sequence=['maroon'])
plot.show()

# Training

In [None]:
x = one_hot_encoded_data.drop("Salary", axis = 1)
y = one_hot_encoded_data["Salary"]

#split 80 20
X_train, X_test, Y_train, Y_test = train_test_split(x,y, test_size=0.2,random_state=21)

#split 20 to 10 10
X_train, X_val, Y_train, Y_val = train_test_split(X_test,Y_test, test_size=0.5,random_state=21)

lm = LinearRegression()
lm.fit(X_train,Y_train) # Train/fit on the training data

# Predicting

In [71]:
prediction = lm.predict(X_test)
px.scatter(x = Y_test, y = prediction)

# Evaluation

In [73]:
print('MAE= ', metrics.mean_absolute_error(Y_test,prediction) )
print('MSE= ', metrics.mean_squared_error(Y_test,prediction))
print('RMSE:', np.sqrt(metrics.mean_squared_error(Y_test, prediction)))

MAE=  5973.501787254445
MSE=  120252487.64533433
RMSE: 10965.969526007919
