'''<br>
Author: Nikhil Patil<br>
Date: 08-11-2024<br>
Last Modified by: Nikhil Patil<br>
Last Modified time: 09-11-2024 <br>
Title : Python program to solved problem using Linear Regression<br>

'''

## **Import Libraries**

In [28]:
import numpy as np
import pandas as pd

In [29]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## **Read CSV Dataset**

In [30]:
df = pd.read_csv('data_preprocessing.csv')

In [31]:
display(df)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


## **Handling missing data**

In [32]:
df['Age'].fillna(df['Age'].mean(), inplace=True)
df['Salary'].fillna(df['Salary'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Salary'].fillna(df['Salary'].mean(), inplace=True)


#### **Using Imputing**

In [33]:
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
df[['Age', 'Salary']] = imputer.fit_transform(df[['Age', 'Salary']])

In [34]:
display(df)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.777778,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


## **Handling Categorical Data**

#### **Encoding the 'Country' column**

In [35]:
le = LabelEncoder()
df['Country'] = le.fit_transform(df['Country'])

In [36]:
display(df)

Unnamed: 0,Country,Age,Salary,Purchased
0,0,44.0,72000.0,No
1,2,27.0,48000.0,Yes
2,1,30.0,54000.0,No
3,2,38.0,61000.0,No
4,1,40.0,63777.777778,Yes
5,0,35.0,58000.0,Yes
6,2,38.777778,52000.0,No
7,0,48.0,79000.0,Yes
8,1,50.0,83000.0,No
9,0,37.0,67000.0,Yes


#### **Encoding the 'Purchased' column**

In [37]:
le = LabelEncoder()
df['Purchased'] = le.fit_transform(df['Purchased'])

In [38]:
display(df)

Unnamed: 0,Country,Age,Salary,Purchased
0,0,44.0,72000.0,0
1,2,27.0,48000.0,1
2,1,30.0,54000.0,0
3,2,38.0,61000.0,0
4,1,40.0,63777.777778,1
5,0,35.0,58000.0,1
6,2,38.777778,52000.0,0
7,0,48.0,79000.0,1
8,1,50.0,83000.0,0
9,0,37.0,67000.0,1


In [39]:
X = pd.concat([df, df[['Age', 'Salary']]], axis=1)
y = df['Purchased']

In [40]:
display(X)

Unnamed: 0,Country,Age,Salary,Purchased,Age.1,Salary.1
0,0,44.0,72000.0,0,44.0,72000.0
1,2,27.0,48000.0,1,27.0,48000.0
2,1,30.0,54000.0,0,30.0,54000.0
3,2,38.0,61000.0,0,38.0,61000.0
4,1,40.0,63777.777778,1,40.0,63777.777778
5,0,35.0,58000.0,1,35.0,58000.0
6,2,38.777778,52000.0,0,38.777778,52000.0
7,0,48.0,79000.0,1,48.0,79000.0
8,1,50.0,83000.0,0,50.0,83000.0
9,0,37.0,67000.0,1,37.0,67000.0


In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## **Feature Scaling**

#### **Data Before Scaling**

In [42]:
display(X_train)
display(X_test)

Unnamed: 0,Country,Age,Salary,Purchased,Age.1,Salary.1
5,0,35.0,58000.0,1,35.0,58000.0
0,0,44.0,72000.0,0,44.0,72000.0
7,0,48.0,79000.0,1,48.0,79000.0
2,1,30.0,54000.0,0,30.0,54000.0
9,0,37.0,67000.0,1,37.0,67000.0
4,1,40.0,63777.777778,1,40.0,63777.777778
3,2,38.0,61000.0,0,38.0,61000.0
6,2,38.777778,52000.0,0,38.777778,52000.0


Unnamed: 0,Country,Age,Salary,Purchased,Age.1,Salary.1
8,1,50.0,83000.0,0,50.0,83000.0
1,2,27.0,48000.0,1,27.0,48000.0


#### **Applying Standard Scalar**

In [43]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print("Preprocessed training set shape:", X_train.shape)
print("Preprocessed test set shape:", X_test.shape)

Preprocessed training set shape: (8, 6)
Preprocessed test set shape: (2, 6)


#### **Data After Scaling**

In [44]:
display(X_train)
display(X_test)

array([[-0.90453403, -0.7529426 , -0.62603778,  1.        , -0.7529426 ,
        -0.62603778],
       [-0.90453403,  1.00845381,  1.01304295, -1.        ,  1.00845381,
         1.01304295],
       [-0.90453403,  1.79129666,  1.83258331,  1.        ,  1.79129666,
         1.83258331],
       [ 0.30151134, -1.73149616, -1.09434656, -1.        , -1.73149616,
        -1.09434656],
       [-0.90453403, -0.36152118,  0.42765698,  1.        , -0.36152118,
         0.42765698],
       [ 0.30151134,  0.22561096,  0.05040824,  1.        ,  0.22561096,
         0.05040824],
       [ 1.50755672, -0.16581046, -0.27480619, -1.        , -0.16581046,
        -0.27480619],
       [ 1.50755672, -0.01359102, -1.32850095, -1.        , -0.01359102,
        -1.32850095]])

array([[ 0.30151134,  2.18271808,  2.30089209, -1.        ,  2.18271808,
         2.30089209],
       [ 1.50755672, -2.3186283 , -1.79680973,  1.        , -2.3186283 ,
        -1.79680973]])