In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.compose import ColumnTransformer

In [3]:
df = pd.read_csv(r"Data.csv") 
print("Dataset Head:\n", df.head())

Dataset Head:
    Country   Age   Salary Purchased
0   France  44.0  72000.0        No
1    Spain  27.0  48000.0       Yes
2  Germany  30.0  54000.0        No
3    Spain  38.0  61000.0        No
4  Germany  40.0      NaN       Yes


In [4]:
##step 3

In [5]:
df_dropna = df.dropna()
print("After dropping missing values:\n", df_dropna)

After dropping missing values:
    Country   Age   Salary Purchased
0   France  44.0  72000.0        No
1    Spain  27.0  48000.0       Yes
2  Germany  30.0  54000.0        No
3    Spain  38.0  61000.0        No
5   France  35.0  58000.0       Yes
7   France  48.0  79000.0       Yes
8  Germany  50.0  83000.0        No
9   France  37.0  67000.0       Yes


In [6]:
X = df.iloc[:, :-1]
y = df.iloc[:, -1]   

In [7]:
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
X = np.array(ct.fit_transform(X))

In [8]:
le = LabelEncoder()
y = le.fit_transform(y)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [10]:
print("X_train:\n", X_train)

X_train:
 [[0.0e+00 0.0e+00 1.0e+00     nan 5.2e+04]
 [0.0e+00 1.0e+00 0.0e+00 4.0e+01     nan]
 [1.0e+00 0.0e+00 0.0e+00 4.4e+01 7.2e+04]
 [0.0e+00 0.0e+00 1.0e+00 3.8e+01 6.1e+04]
 [0.0e+00 0.0e+00 1.0e+00 2.7e+01 4.8e+04]
 [1.0e+00 0.0e+00 0.0e+00 4.8e+01 7.9e+04]
 [0.0e+00 1.0e+00 0.0e+00 5.0e+01 8.3e+04]
 [1.0e+00 0.0e+00 0.0e+00 3.5e+01 5.8e+04]]


In [11]:
print("X_test:\n", X_test)

X_test:
 [[0.0e+00 1.0e+00 0.0e+00 3.0e+01 5.4e+04]
 [1.0e+00 0.0e+00 0.0e+00 3.7e+01 6.7e+04]]


In [12]:
print("y_train:\n", y_train)

y_train:
 [0 1 0 0 1 1 0 1]


In [13]:
print("y_test:\n", y_test)

y_test:
 [0 1]


In [15]:
mm_scaler = MinMaxScaler()
X_train_mm = X_train.copy()
X_test_mm = X_test.copy()
X_train_mm[:, -2:] = mm_scaler.fit_transform(X_train_mm[:, -2:])
X_test_mm[:, -2:] = mm_scaler.transform(X_test_mm[:, -2:])
print("X_train after MinMax Scaling:\n", X_train_mm)

X_train after MinMax Scaling:
 [[0.         0.         1.                nan 0.11428571]
 [0.         1.         0.         0.56521739        nan]
 [1.         0.         0.         0.73913043 0.68571429]
 [0.         0.         1.         0.47826087 0.37142857]
 [0.         0.         1.         0.         0.        ]
 [1.         0.         0.         0.91304348 0.88571429]
 [0.         1.         0.         1.         1.        ]
 [1.         0.         0.         0.34782609 0.28571429]]


In [16]:
std_scaler = StandardScaler()
X_train_std = X_train.copy()
X_test_std = X_test.copy()
X_train_std[:, -2:] = std_scaler.fit_transform(X_train_std[:, -2:])
X_test_std[:, -2:] = std_scaler.transform(X_test_std[:, -2:])
print("X_train after Standard Scaling:\n", X_train_std)

X_train after Standard Scaling:
 [[ 0.          0.          1.                 nan -1.018224  ]
 [ 0.          1.          0.         -0.03891021         nan]
 [ 1.          0.          0.          0.50583275  0.58347667]
 [ 0.          0.          1.         -0.31128169 -0.2974587 ]
 [ 0.          0.          1.         -1.80932482 -1.33856413]
 [ 1.          0.          0.          1.0505757   1.1440719 ]
 [ 0.          1.          0.          1.32294718  1.46441204]
 [ 1.          0.          0.         -0.71983891 -0.5377138 ]]


In [17]:
print("Final Preprocessed Data:")
print("Training Data (MinMax Scaled):\n", X_train_mm)
print("Training Data (Standard Scaled):\n", X_train_std)

Final Preprocessed Data:
Training Data (MinMax Scaled):
 [[0.         0.         1.                nan 0.11428571]
 [0.         1.         0.         0.56521739        nan]
 [1.         0.         0.         0.73913043 0.68571429]
 [0.         0.         1.         0.47826087 0.37142857]
 [0.         0.         1.         0.         0.        ]
 [1.         0.         0.         0.91304348 0.88571429]
 [0.         1.         0.         1.         1.        ]
 [1.         0.         0.         0.34782609 0.28571429]]
Training Data (Standard Scaled):
 [[ 0.          0.          1.                 nan -1.018224  ]
 [ 0.          1.          0.         -0.03891021         nan]
 [ 1.          0.          0.          0.50583275  0.58347667]
 [ 0.          0.          1.         -0.31128169 -0.2974587 ]
 [ 0.          0.          1.         -1.80932482 -1.33856413]
 [ 1.          0.          0.          1.0505757   1.1440719 ]
 [ 0.          1.          0.          1.32294718  1.46441204]
 [ 

In [20]:
pd.DataFrame(X_train_mm).to_csv('X_train_minmax.csv', index=False)


In [21]:
pd.DataFrame(X_test_mm).to_csv('X_test_minmax.csv', index=False)

In [23]:
pd.DataFrame(X_train_std).to_csv('X_train_standard.csv', index=False)
pd.DataFrame(X_test_std).to_csv('X_test_standard.csv', index=False)

In [1]:
print("Preprocessed data saved to CSV files named final.")

Preprocessed data saved to CSV files named final.
