# Import Libraries

In [44]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Importing Datasets

In [45]:
dataset=pd.read_csv(r"./Data.csv")
dataset.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [46]:
dataset.iloc[:-1]


Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No


# Missing Dataset

In [47]:
dataset.isna().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

In [48]:
dataset["Salary"].fillna(dataset["Salary"].mean(),inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset["Salary"].fillna(dataset["Salary"].mean(),inplace=True)


In [49]:
dataset['Age'].fillna(dataset['Age'].mean(),inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset['Age'].fillna(dataset['Age'].mean(),inplace=True)


# Encoding Categorical Data

In [50]:
dummies=pd.get_dummies(dataset['Country'])
dummies

Unnamed: 0,France,Germany,Spain
0,True,False,False
1,False,False,True
2,False,True,False
3,False,False,True
4,False,True,False
5,True,False,False
6,False,False,True
7,True,False,False
8,False,True,False
9,True,False,False


In [51]:
df=pd.concat([dataset,dummies],axis=1)
df


Unnamed: 0,Country,Age,Salary,Purchased,France,Germany,Spain
0,France,44.0,72000.0,No,True,False,False
1,Spain,27.0,48000.0,Yes,False,False,True
2,Germany,30.0,54000.0,No,False,True,False
3,Spain,38.0,61000.0,No,False,False,True
4,Germany,40.0,63777.777778,Yes,False,True,False
5,France,35.0,58000.0,Yes,True,False,False
6,Spain,38.777778,52000.0,No,False,False,True
7,France,48.0,79000.0,Yes,True,False,False
8,Germany,50.0,83000.0,No,False,True,False
9,France,37.0,67000.0,Yes,True,False,False


In [52]:
final=df.drop(['Country'],axis=1)

In [53]:
final

Unnamed: 0,Age,Salary,Purchased,France,Germany,Spain
0,44.0,72000.0,No,True,False,False
1,27.0,48000.0,Yes,False,False,True
2,30.0,54000.0,No,False,True,False
3,38.0,61000.0,No,False,False,True
4,40.0,63777.777778,Yes,False,True,False
5,35.0,58000.0,Yes,True,False,False
6,38.777778,52000.0,No,False,False,True
7,48.0,79000.0,Yes,True,False,False
8,50.0,83000.0,No,False,True,False
9,37.0,67000.0,Yes,True,False,False


In [54]:
df1=final

In [55]:
final.dtypes

Age          float64
Salary       float64
Purchased     object
France          bool
Germany         bool
Spain           bool
dtype: object

In [56]:
final['Purchased']=final['Purchased'].map(lambda x: 1 if x=='Yes' else 0)
final

Unnamed: 0,Age,Salary,Purchased,France,Germany,Spain
0,44.0,72000.0,0,True,False,False
1,27.0,48000.0,1,False,False,True
2,30.0,54000.0,0,False,True,False
3,38.0,61000.0,0,False,False,True
4,40.0,63777.777778,1,False,True,False
5,35.0,58000.0,1,True,False,False
6,38.777778,52000.0,0,False,False,True
7,48.0,79000.0,1,True,False,False
8,50.0,83000.0,0,False,True,False
9,37.0,67000.0,1,True,False,False


In [57]:
mapping = {True: 1, False: 0}
final['France']=final['France'].map(mapping)
final['Germany']=final['Germany'].map(mapping)
final['Spain']=final['Spain'].map(mapping)
final

Unnamed: 0,Age,Salary,Purchased,France,Germany,Spain
0,44.0,72000.0,0,1,0,0
1,27.0,48000.0,1,0,0,1
2,30.0,54000.0,0,0,1,0
3,38.0,61000.0,0,0,0,1
4,40.0,63777.777778,1,0,1,0
5,35.0,58000.0,1,1,0,0
6,38.777778,52000.0,0,0,0,1
7,48.0,79000.0,1,1,0,0
8,50.0,83000.0,0,0,1,0
9,37.0,67000.0,1,1,0,0


In [58]:
final.dtypes

Age          float64
Salary       float64
Purchased      int64
France         int64
Germany        int64
Spain          int64
dtype: object

# Train Test Splitting

In [59]:
from sklearn.model_selection import train_test_split
X=final.drop('Purchased',axis=1)
y=final['Purchased']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [60]:
print(X_train)

         Age        Salary  France  Germany  Spain
6  38.777778  52000.000000       0        0      1
4  40.000000  63777.777778       0        1      0
0  44.000000  72000.000000       1        0      0
3  38.000000  61000.000000       0        0      1
1  27.000000  48000.000000       0        0      1
7  48.000000  79000.000000       1        0      0
8  50.000000  83000.000000       0        1      0
5  35.000000  58000.000000       1        0      0


# Fearture scalling

In [61]:
from sklearn.preprocessing import StandardScaler
scalar=StandardScaler()
scalar.fit_transform(X_train)

array([[-0.19159184, -1.07812594, -0.77459667, -0.57735027,  1.29099445],
       [-0.01411729, -0.07013168, -0.77459667,  1.73205081, -0.77459667],
       [ 0.56670851,  0.63356243,  1.29099445, -0.57735027, -0.77459667],
       [-0.30453019, -0.30786617, -0.77459667, -0.57735027,  1.29099445],
       [-1.90180114, -1.42046362, -0.77459667, -0.57735027,  1.29099445],
       [ 1.14753431,  1.23265336,  1.29099445, -0.57735027, -0.77459667],
       [ 1.43794721,  1.57499104, -0.77459667,  1.73205081, -0.77459667],
       [-0.74014954, -0.56461943,  1.29099445, -0.57735027, -0.77459667]])

In [62]:
scalar.transform(X_test)

array([[-1.46618179, -0.9069571 , -0.77459667,  1.73205081, -0.77459667],
       [-0.44973664,  0.20564034,  1.29099445, -0.57735027, -0.77459667]])

In [64]:
from sklearn.linear_model import LinearRegression
model=LinearRegression()
model.fit(X_train,y_train)


In [73]:
y_predtest=model.predict(X_test)
y_pred_train=model.predict(X_train)

In [69]:
from sklearn.metrics import mean_absolute_error
score=mean_absolute_error(y_test,y_predtest)
score


0.7580288081962444

In [75]:
score=mean_absolute_error(y_train,y_pred_train)
score

0.26100465086581914

In [None]:
# Prediction on training set
plt.scatter(X_train, y_train, color = 'lightcoral')
plt.plot(X_train, y_pred_train, color = 'firebrick')
plt.title('Salary vs Experience (Training Set)')
plt.xlabel('Years of Experience')
plt.ylabel('Salary')
plt.legend(['X_train/Pred(y_test)', 'X_train/y_train'], title = 'Sal/Exp', loc='best', facecolor='white')
plt.box(False)
plt.show()