# Import Libraries

In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Importing Datasets

In [18]:
dataset=pd.read_csv(r"./Data.csv")
dataset.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [19]:
dataset.iloc[:-1]


Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No


# Missing Dataset

In [20]:
dataset.isna().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

In [21]:
dataset["Salary"].fillna(dataset["Salary"].mean(),inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset["Salary"].fillna(dataset["Salary"].mean(),inplace=True)


# Encoding Categorical Data

In [22]:
dummies=pd.get_dummies(dataset['Country'])
dummies

Unnamed: 0,France,Germany,Spain
0,True,False,False
1,False,False,True
2,False,True,False
3,False,False,True
4,False,True,False
5,True,False,False
6,False,False,True
7,True,False,False
8,False,True,False
9,True,False,False


In [23]:
df=pd.concat([dataset,dummies],axis=1)
df


Unnamed: 0,Country,Age,Salary,Purchased,France,Germany,Spain
0,France,44.0,72000.0,No,True,False,False
1,Spain,27.0,48000.0,Yes,False,False,True
2,Germany,30.0,54000.0,No,False,True,False
3,Spain,38.0,61000.0,No,False,False,True
4,Germany,40.0,63777.777778,Yes,False,True,False
5,France,35.0,58000.0,Yes,True,False,False
6,Spain,,52000.0,No,False,False,True
7,France,48.0,79000.0,Yes,True,False,False
8,Germany,50.0,83000.0,No,False,True,False
9,France,37.0,67000.0,Yes,True,False,False


In [24]:
final=df.drop(['Country'],axis=1)

In [25]:
final

Unnamed: 0,Age,Salary,Purchased,France,Germany,Spain
0,44.0,72000.0,No,True,False,False
1,27.0,48000.0,Yes,False,False,True
2,30.0,54000.0,No,False,True,False
3,38.0,61000.0,No,False,False,True
4,40.0,63777.777778,Yes,False,True,False
5,35.0,58000.0,Yes,True,False,False
6,,52000.0,No,False,False,True
7,48.0,79000.0,Yes,True,False,False
8,50.0,83000.0,No,False,True,False
9,37.0,67000.0,Yes,True,False,False


In [26]:
final.dtypes

Age          float64
Salary       float64
Purchased     object
France          bool
Germany         bool
Spain           bool
dtype: object

In [27]:
final['Purchased']=final['Purchased'].map(lambda x: 1 if x=='Yes' else 0)
final

Unnamed: 0,Age,Salary,Purchased,France,Germany,Spain
0,44.0,72000.0,0,True,False,False
1,27.0,48000.0,1,False,False,True
2,30.0,54000.0,0,False,True,False
3,38.0,61000.0,0,False,False,True
4,40.0,63777.777778,1,False,True,False
5,35.0,58000.0,1,True,False,False
6,,52000.0,0,False,False,True
7,48.0,79000.0,1,True,False,False
8,50.0,83000.0,0,False,True,False
9,37.0,67000.0,1,True,False,False


# Feature Scaling

In [28]:
from sklearn.preprocessing import MinMaxScaler
mapping = {True: 1, False: 0}
final['France']=final['France'].map(mapping)
final['Germany']=final['Germany'].map(mapping)
final['Spain']=final['Spain'].map(mapping)
final

Unnamed: 0,Age,Salary,Purchased,France,Germany,Spain
0,44.0,72000.0,0,1,0,0
1,27.0,48000.0,1,0,0,1
2,30.0,54000.0,0,0,1,0
3,38.0,61000.0,0,0,0,1
4,40.0,63777.777778,1,0,1,0
5,35.0,58000.0,1,1,0,0
6,,52000.0,0,0,0,1
7,48.0,79000.0,1,1,0,0
8,50.0,83000.0,0,0,1,0
9,37.0,67000.0,1,1,0,0


# Train Test Splitting

In [31]:
from sklearn.model_selection import train_test_split
X=final.drop('Purchased',axis=1)
y=final['Purchased']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)