# Data preparation using Sklearn

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Column names, taken from the data description (passed to read_csv via `names`)
cols = [
    'MPG',
    'Cylinders',
    'Displacements',
    'Horsepower',
    'Weight',
    'Acceleration',
    'Model Year',
    'Origin',
]

# Load the .data file with pandas:
#   - fields are separated by spaces (runs of leading spaces are skipped)
#   - "?" is read as a missing value
#   - everything after a tab character is treated as a comment and ignored
df = pd.read_csv(
    './auto-mpg.data',
    names=cols,
    sep=" ",
    skipinitialspace=True,
    na_values="?",
    comment='\t',
)

# Work on a copy so the freshly loaded frame stays untouched
data = df.copy()
data.head()

Unnamed: 0,MPG,Cylinders,Displacements,Horsepower,Weight,Acceleration,Model Year,Origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,1
2,18.0,8,318.0,150.0,3436.0,11.0,70,1
3,16.0,8,304.0,150.0,3433.0,12.0,70,1
4,17.0,8,302.0,140.0,3449.0,10.5,70,1


In [3]:
# Stratified split of the data
from sklearn.model_selection import StratifiedShuffleSplit

# Stratify on the cylinder count; random_state is fixed so the split is reproducible
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) pair
train_index, test_index = next(split.split(data, data["Cylinders"]))

# The splitter returns *positional* row indices, so rows must be selected with
# .iloc; label-based .loc only gives the same rows while the index is 0..n-1.
train_set = data.iloc[train_index]
test_set = data.iloc[test_index]

In [4]:
# Split the training set into the target (MPG) and the remaining feature columns
data_labels = train_set["MPG"].copy()
data = train_set.drop(columns="MPG")



### Preprocessing the Origin Column



In [5]:
# function to map the numbers in the origin column to the country
def preprocess_origin_col(df):
    """Replace the numeric codes in the ``Origin`` column with country names.

    The codes 1, 2 and 3 become "India", "USA" and "Germany". Values that are
    not one of these codes (for example names written by an earlier call) are
    left unchanged, so running the function twice on the same frame is safe.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``Origin`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    origin_names = {1: "India", 2: "USA", 3: "Germany"}
    # .replace keeps values that are not keys of the mapping, whereas .map
    # would turn them into NaN (which wiped the column when the cell was re-run).
    df["Origin"] = df["Origin"].replace(origin_names)
    return df
# preprocess_origin_col assigns into the frame it receives and returns that same
# frame, so `data_tr` and `data` refer to the same object after this call.
data_tr = preprocess_origin_col(data)
data_tr.head()



Unnamed: 0,Cylinders,Displacements,Horsepower,Weight,Acceleration,Model Year,Origin
145,4,83.0,61.0,2003.0,19.0,74,Germany
151,4,79.0,67.0,2000.0,16.0,74,USA
388,4,156.0,92.0,2585.0,14.5,82,India
48,6,250.0,88.0,3139.0,14.5,71,India
114,4,98.0,90.0,2265.0,15.5,73,USA


In [6]:
# Summarise the column dtypes and non-null counts of the transformed frame
data_tr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 318 entries, 145 to 362
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Cylinders      318 non-null    int64  
 1   Displacements  318 non-null    float64
 2   Horsepower     314 non-null    float64
 3   Weight         318 non-null    float64
 4   Acceleration   318 non-null    float64
 5   Model Year     318 non-null    int64  
 6   Origin         318 non-null    object 
dtypes: float64(4), int64(2), object(1)
memory usage: 19.9+ KB


In [7]:
# Keep only the two columns that are going to be one-hot encoded
data_cat = data_tr.loc[:, ['Origin', 'Cylinders']]
data_cat.head()

Unnamed: 0,Origin,Cylinders
145,Germany,4
151,USA,4
388,India,4
48,India,6
114,USA,4


In [10]:
# One-hot encoding of the categorical values
from sklearn.preprocessing import OneHotEncoder

cat_encoder = OneHotEncoder()

# Learn the categories of each column, then encode the same frame
cat_encoder.fit(data_cat)
data_cat_1hot = cat_encoder.transform(data_cat)
data_cat_1hot

<318x8 sparse matrix of type '<class 'numpy.float64'>'
	with 636 stored elements in Compressed Sparse Row format>

### Missing values

In [11]:
# Segregating numerical data: every column except the categorical Origin.
# Dropping by name (instead of slicing off the last position) keeps this
# correct even if the column order of `data` changes.
num_data = data.drop(columns="Origin")
num_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 318 entries, 145 to 362
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Cylinders      318 non-null    int64  
 1   Displacements  318 non-null    float64
 2   Horsepower     314 non-null    float64
 3   Weight         318 non-null    float64
 4   Acceleration   318 non-null    float64
 5   Model Year     318 non-null    int64  
dtypes: float64(4), int64(2)
memory usage: 17.4 KB


In [16]:
# Filling the missing values in Horsepower using the median
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="median")

# Fit the imputer on the numeric columns and fill the missing values in one step
X = imputer.fit_transform(num_data)

# The imputer hands back an array; rebuild a dataframe with the original labels
data_tr = pd.DataFrame(X, index=num_data.index, columns=num_data.columns)

data_tr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 318 entries, 145 to 362
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Cylinders      318 non-null    float64
 1   Displacements  318 non-null    float64
 2   Horsepower     318 non-null    float64
 3   Weight         318 non-null    float64
 4   Acceleration   318 non-null    float64
 5   Model Year     318 non-null    float64
dtypes: float64(6)
memory usage: 17.4 KB


### Adding new attributes

In [17]:
# Preview the numeric columns whose positions the attribute adder below relies on
num_data.head()

Unnamed: 0,Cylinders,Displacements,Horsepower,Weight,Acceleration,Model Year
145,4,83.0,61.0,2003.0,19.0,74
151,4,79.0,67.0,2000.0,16.0,74
388,4,156.0,92.0,2585.0,14.5,82
48,6,250.0,88.0,3139.0,14.5,71
114,4,98.0,90.0,2265.0,15.5,73


In [43]:
from sklearn.base import BaseEstimator, TransformerMixin

# Index of the columns given their position in the dataframe.
# These module-level names are read by CustomAttrAdder.transform to pick
# columns out of the array it receives.
acceleration_ind = 4
horsepower_ind = 2
cylinders_ind = 0

#Attributes adder
class CustomAttrAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends ratio attributes to a numeric 2-D array.

    ``acceleration / cylinders`` is always appended as the last column. When
    ``acceleration_on_horsepower`` is true, ``acceleration / horsepower`` is
    appended as well, placed just before the cylinders ratio.

    The column positions are taken from the module-level names
    ``acceleration_ind``, ``horsepower_ind`` and ``cylinders_ind``.

    Parameters
    ----------
    acceleration_on_horsepower : bool, default True
        Whether to add the acceleration / horsepower column.
    """

    def __init__(self, acceleration_on_horsepower = True):
        self.acceleration_on_horsepower = acceleration_on_horsepower

    def fit(self, X, y = None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` with the extra ratio column(s) appended on the right.

        ``X`` may be an array or anything ``np.asarray`` can convert (such as a
        DataFrame); the result is always a NumPy array.
        """
        # Positional slicing like X[:, i] fails on a DataFrame, so convert first.
        # For an ndarray input this is a no-op.
        X = np.asarray(X)
        acceleration = X[:, acceleration_ind]
        acceleration_on_cylinders = acceleration / X[:, cylinders_ind]
        if self.acceleration_on_horsepower:
            acceleration_on_horsepower = acceleration / X[:, horsepower_ind]
            # np.c_ concatenates the arrays column-wise
            return np.c_[X, acceleration_on_horsepower, acceleration_on_cylinders]

        return np.c_[X, acceleration_on_cylinders]

# Try the transformer on the imputed frame and look at the first resulting row
attr_adder = CustomAttrAdder(acceleration_on_horsepower=True)
data_tr_extra_attrs = attr_adder.transform(data_tr.to_numpy())
data_tr_extra_attrs[0]
    
    


array([4.0000000e+00, 8.3000000e+01, 6.1000000e+01, 2.0030000e+03,
       1.9000000e+01, 7.4000000e+01, 3.1147541e-01, 4.7500000e+00])

# Creating Pipeline

In [44]:
from sklearn.pipeline import Pipeline
## Scaling numerical attributes
from sklearn.preprocessing import StandardScaler

numerics = ['float64', 'int64']

# Take the numeric columns from the raw training features (`data`) rather than
# from the already-imputed `data_tr`: this way the pipeline's own imputer step
# is the one that fills the missing Horsepower values, and the cell no longer
# depends on the earlier manual imputation having been run.
# Origin is categorical, so it is excluded explicitly regardless of its dtype.
num_data = data.drop(columns=["Origin"]).select_dtypes(include = numerics)

## Pipeline for numerical attributes
# Impute missing values
# Add attributes
# Scale the data
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy = 'median')),
    ('attrs_adder', CustomAttrAdder()),
    ('std_scaler', StandardScaler())
])

num_data_tr = num_pipeline.fit_transform(num_data)
num_data_tr[0]


array([-0.85657842, -1.07804475, -1.15192977, -1.17220298,  1.21586943,
       -0.54436373,  1.70952741,  1.29565517])

## Transforming numerical and categorical attributes

In [46]:
from sklearn.compose import ColumnTransformer

# Column names routed to each branch of the combined transformer
cat_attrs = ['Origin']
num_attrs = num_data.columns.tolist()

## Complete pipeline: the numerical columns go through num_pipeline,
## the categorical column is one-hot encoded
full_pipeline = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_attrs),
        ('cat', OneHotEncoder(), cat_attrs),
    ]
)

prepared_data = full_pipeline.fit_transform(data)
prepared_data[0]


array([-0.85657842, -1.07804475, -1.15192977, -1.17220298,  1.21586943,
       -0.54436373,  1.70952741,  1.29565517,  1.        ,  0.        ,
        0.        ])