# Importing the Dataset

In [2]:
#Import library for reading the dataset  
import pandas as pd
import numpy as np

In [3]:
#Import the dataset and store it in a variable named dataset
dataset = pd.read_csv("Data.csv")

In [4]:
#Now we peek at the top of the datset. Notice that missing values are stored as NaN.
dataset.head

<bound method NDFrame.head of    Country   Age   Salary Purchased
0   France  44.0  72000.0        No
1    Spain  27.0  48000.0       Yes
2  Germany  30.0  54000.0        No
3    Spain  38.0  61000.0        No
4  Germany  40.0      NaN       Yes
5   France  35.0  58000.0       Yes
6    Spain   NaN  52000.0        No
7   France  48.0  79000.0       Yes
8  Germany  50.0  83000.0        No
9   France  37.0  67000.0       Yes>

In [5]:
# or else we can print the whole database
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


Now we do a general check to see if there are any missing values and then 
count their number

In [14]:
dataset.isnull().any()

Country      False
Age           True
Salary        True
Purchased    False
dtype: bool

In [15]:
#Now this will count the number of missing values in each column
dataset.isnull().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

# Handling Missing Data

In [16]:
from sklearn.impute import SimpleImputer

In [17]:
imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')

In [18]:
# fit the values of nan for age and salary column
imputer = imputer.fit(dataset[['Age','Salary']])
imputer


SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='mean', verbose=0)

In [19]:
# transform the data, so that nan value can be replaced
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [20]:
#Now we check the dataset for the missing values.
dataset.isnull().any()

Country      False
Age           True
Salary        True
Purchased    False
dtype: bool

In [21]:
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


# Separating the independent and dependent (i.e. target) features

In [22]:
#Independent Features
X = dataset.iloc[:,0:3].values
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [23]:
# target
Y = dataset.iloc[0:-1].values
Y

array([['France', 44.0, 72000.0, 'No'],
       ['Spain', 27.0, 48000.0, 'Yes'],
       ['Germany', 30.0, 54000.0, 'No'],
       ['Spain', 38.0, 61000.0, 'No'],
       ['Germany', 40.0, nan, 'Yes'],
       ['France', 35.0, 58000.0, 'Yes'],
       ['Spain', nan, 52000.0, 'No'],
       ['France', 48.0, 79000.0, 'Yes'],
       ['Germany', 50.0, 83000.0, 'No']], dtype=object)

# Categorical Encoding

In [24]:
from sklearn.preprocessing import LabelEncoder

In [25]:
#The 0th column of X has non-ordinal or nominal data
X[:,0]

array(['France', 'Spain', 'Germany', 'Spain', 'Germany', 'France',
       'Spain', 'France', 'Germany', 'France'], dtype=object)

In [22]:
#first label encode 0th column in X
labelencoder_X = LabelEncoder()

# fit and transform
X[:,0] = labelencoder_X.fit_transform(X[:,0])
X[:,0]

array([0, 2, 1, 2, 1, 0, 2, 0, 1, 0], dtype=object)

In [24]:
X

array([[0, 44.0, 72000.0],
       [2, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, nan],
       [0, 35.0, 58000.0],
       [2, nan, 52000.0],
       [0, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0]], dtype=object)

In [23]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [47]:
# again read the data as per the input
X = dataset.iloc[:,0:3].values
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [28]:
#Now we one-hot encode 0th column of X
onehotencoder = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(), [0])],    # The column numbers to be transformed (here is [0] but can be [0, 1, 3])
    remainder='passthrough'                         # Leave the rest of the columns untouched
)

In [36]:
# fit and transform the data to get the encoding done
X[:,0] = onehotencoder.fit_transform(X.tolist())

ValueError: could not broadcast input array from shape (10,5) into shape (10)

In [30]:
# before doing cateforical encoding just analyse the data Y
Y.shape

(9, 4)

In [31]:
# flatten the whole two dimentional list into an single dimentional list
Y.ravel().shape

(36,)

In [34]:
# Now categorical encoding the target variable
labelencoder_Y = LabelEncoder()

# fit and transform the target values
Y = labelencoder_Y.fit_transform(Y.ravel().shape)
Y

array([0])

# Splitting the data into test set and train set

In [26]:
from sklearn.model_selection import train_test_split

In [27]:
# split the dataset into testing and training data
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size = 4,random_state = 0)

ValueError: Found input variables with inconsistent numbers of samples: [10, 9]

In [None]:
X_train

In [None]:
X_test

In [None]:
Y_train

In [None]:
Y_test

In [None]:
len(X_train), len(X_test), len(Y_train), len(Y_test)

# Feature Scaling

Standardize features by removing the mean and scaling to unit variance

The standard score of a sample x is calculated as:

z = (x - u) / s

where u is the mean of the training samples and

s is the standard deviation of the training samples

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# initializing the standard scaler
sc_X = StandardScaler()

In [None]:
# fit and transform the x_train data


In [None]:
# transform the x_test data


In [None]:
#Inverse of scaling to get the original data for x_train


# Thank You