# Data Preprocessing Template

## Importing the libraries

In [31]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
#from sklearn.compose import ColumnTransformer
#from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

## Importing the dataset

In [22]:
# Read the raw CSV, then split it positionally: every column except the
# last one is a feature, and the last column is the target.
dataframe = pd.read_csv('data.csv')
print(dataframe)

feature_part, target_part = dataframe.iloc[:, :-1], dataframe.iloc[:, -1]

X = feature_part.values  # feature matrix (one row per sample)
print(X)

y = target_part.values  # target vector (one entry per sample)
print(y)

   Country   Age   Salary Purchased
0   France  44.0  72000.0        No
1    Spain  27.0  48000.0       Yes
2  Germany  30.0  54000.0        No
3    Spain  38.0  61000.0        No
4  Germany  40.0      NaN       Yes
5   France  35.0  58000.0       Yes
6    Spain   NaN  52000.0        No
7   France  48.0  79000.0       Yes
8  Germany  50.0  83000.0        No
9   France  37.0  67000.0       Yes
[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]
['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


## Taking care of missing data

In [8]:
from sklearn.impute import SimpleImputer

In [11]:
# Mean imputation: entries equal to NaN are treated as missing and are
# replaced by the mean of the column they belong to.
imputer = SimpleImputer(
    strategy='mean',
    missing_values=np.nan,
)

In [20]:
# Columns 1 and 2 of X hold the numeric features. Learn their means and fill
# the missing entries in one call, writing the result back into X in place.
numeric_cols = slice(1, 3)
X[:, numeric_cols] = imputer.fit_transform(X[:, numeric_cols])
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [27]:
## Data Transformation 

In [41]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
# from sklearn.preprocessing import FunctionTransformer, LabelEncoder  # Uncomment if using LabelEncoder workaround

# ------------------------------------------
# 1️⃣ Create a Sample Dataset
# ------------------------------------------
# The sample is written row by row (one tuple per record) so it reads like
# the table it represents; np.nan marks the two missing numeric entries.
column_names = ['Country', 'Age', 'Salary', 'Purchased']
sample_rows = [
    ('France', 44.0, 72000.0, 'No'),
    ('Spain', 27.0, 48000.0, 'Yes'),
    ('Germany', 30.0, 54000.0, 'No'),
    ('Spain', 38.0, 61000.0, 'No'),
    ('Germany', 40.0, np.nan, 'Yes'),
    ('France', 35.0, 58000.0, 'Yes'),
    ('Spain', np.nan, 52000.0, 'No'),
    ('France', 48.0, 79000.0, 'Yes'),
    ('Germany', 50.0, 83000.0, 'No'),
    ('France', 37.0, 67000.0, 'Yes'),
]

# Transpose the records into a column-name -> list-of-values mapping.
data = {
    name: list(values)
    for name, values in zip(column_names, zip(*sample_rows))
}
df = pd.DataFrame(data)

# ------------------------------------------
# 2️⃣ Handle Missing Numerical Data
# ------------------------------------------
# Learn the per-column means of the numeric columns first, then use them to
# fill the NaN entries and write the completed columns back into df.
numeric_columns = ['Age', 'Salary']
imputer = SimpleImputer(strategy='mean')
imputer.fit(df[numeric_columns])
df[numeric_columns] = imputer.transform(df[numeric_columns])

# ------------------------------------------
# 3️⃣ Encode Categorical Data
# ------------------------------------------
# Output column order: the one-hot Country columns first, then the encoded
# Purchased column, then the passthrough columns (Age, Salary).
# NOTE(review): 'Purchased' is the column used as the target `y` when the CSV
# is loaded earlier in this notebook; here it ends up inside the same array
# as the features — confirm that is intended before feeding this to a model.
ct = ColumnTransformer(
    transformers=[
        ('country_encoder', OneHotEncoder(), ['Country']),       # One-hot encoding for Country
        ('purchase_encoder', OrdinalEncoder(), ['Purchased'])    # Ordinal encoding for Purchased
    ],
    remainder='passthrough',  # Keep Age and Salary as is
    # OneHotEncoder emits a sparse matrix by default, and ColumnTransformer
    # would return a sparse result whenever the stacked density drops below
    # its default threshold (0.3). A threshold of 0 always yields a dense
    # ndarray, so the type of `encoded_array` does not depend on the data.
    sparse_threshold=0
)

encoded_array = ct.fit_transform(df)

# Apply the compact float formatting to this print only. The global
# np.set_printoptions would silently change how every other cell prints
# arrays for the rest of the kernel session.
with np.printoptions(suppress=True, precision=1):
    print(encoded_array)

[[    1.      0.      0.      0.     44.  72000. ]
 [    0.      0.      1.      1.     27.  48000. ]
 [    0.      1.      0.      0.     30.  54000. ]
 [    0.      0.      1.      0.     38.  61000. ]
 [    0.      1.      0.      1.     40.  63777.8]
 [    1.      0.      0.      1.     35.  58000. ]
 [    0.      0.      1.      0.     38.8 52000. ]
 [    1.      0.      0.      1.     48.  79000. ]
 [    0.      1.      0.      0.     50.  83000. ]
 [    1.      0.      0.      1.     37.  67000. ]]
