In [1]:


import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

In [3]:


# Load the insurance dataset from its public GitHub mirror and preview the first rows.
DATA_URL = 'https://raw.githubusercontent.com/stedy/Machine-Learning-with-R-datasets/master/insurance.csv'
df = pd.read_csv(DATA_URL)
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:


# Summarise column dtypes and non-null counts to see which features are text and which are numeric.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [5]:
# Count missing values per column.
df.isna().sum()

Unnamed: 0,0
age,0
sex,0
bmi,0
children,0
smoker,0
region,0
charges,0


In [9]:
# Separate the predictors from the target, then hold out 20% of the rows for
# evaluation. The fixed random_state makes the split repeatable.
features = df.drop(columns='charges')
target = df['charges']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=0
)

In [10]:


# Step 1: imputation. Text columns are filled with their most frequent value and
# numeric columns with their median. The transformer emits the categorical
# columns first, followed by the numeric ones.
categorical_cols = ['sex', 'smoker', 'region']
numeric_cols = ['age', 'bmi', 'children']

trf1 = ColumnTransformer(
    transformers=[
        ('cat', SimpleImputer(strategy='most_frequent'), categorical_cols),
        ('num', SimpleImputer(strategy='median'), numeric_cols),
    ],
    remainder='passthrough',
)

In [11]:
# Fit the imputation step on the training split alone and inspect the raw array it returns.
first_step = trf1.fit_transform(X_train)
first_step

array([['male', 'yes', 'southwest', 37.0, 34.1, 4.0],
       ['male', 'no', 'southeast', 18.0, 34.43, 0.0],
       ['female', 'yes', 'northeast', 23.0, 36.67, 2.0],
       ...,
       ['male', 'no', 'southeast', 40.0, 25.08, 0.0],
       ['male', 'no', 'northwest', 19.0, 35.53, 0.0],
       ['female', 'no', 'southwest', 33.0, 18.5, 1.0]], dtype=object)

In [12]:
# Wrap the array in a DataFrame for a readable preview; the columns are labelled by position.
pd.DataFrame(first_step).head()

Unnamed: 0,0,1,2,3,4,5
0,male,yes,southwest,37.0,34.1,4.0
1,male,no,southeast,18.0,34.43,0.0
2,female,yes,northeast,23.0,36.67,2.0
3,male,no,southwest,32.0,35.2,2.0
4,female,no,northeast,58.0,32.395,1.0


In [13]:
# Sanity check: count missing values per column in the imputed output.
pd.DataFrame(first_step).isna().sum()

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0
5,0


In [14]:
# Inspect the fitted sub-transformers, keyed by the names given to the ColumnTransformer.
trf1.named_transformers_

{'cat': SimpleImputer(strategy='most_frequent'),
 'num': SimpleImputer(strategy='median')}

In [15]:


# Step 2: one-hot encode the categorical columns.
# The input is the plain array produced by the imputation step, so columns are
# addressed by position: 0-2 hold sex, smoker and region. drop='first' removes
# one indicator per feature; the remaining (numeric) columns pass through as-is.
# `sparse_output` replaces the `sparse` keyword, which was deprecated in
# scikit-learn 1.2 and removed in 1.4.
trf2 = ColumnTransformer(transformers=[
    ('enc', OneHotEncoder(sparse_output=False, drop='first'), list(range(3))),
], remainder='passthrough')

In [16]:


# Fit the encoder on the imputed array and preview the result: the three
# categorical columns become five indicator columns, followed by the three
# numeric columns.
second_step = trf2.fit_transform(first_step)
pd.DataFrame(second_step).head()



Unnamed: 0,0,1,2,3,4,5,6,7
0,1.0,1.0,0.0,0.0,1.0,37.0,34.1,4.0
1,1.0,0.0,0.0,1.0,0.0,18.0,34.43,0.0
2,0.0,1.0,0.0,0.0,0.0,23.0,36.67,2.0
3,1.0,0.0,0.0,0.0,1.0,32.0,35.2,2.0
4,0.0,0.0,0.0,0.0,0.0,58.0,32.395,1.0


In [17]:


# End-to-end pipeline: impute -> one-hot encode -> scale -> regress.
# Chaining the steps in one Pipeline means a single fit/predict call runs the
# whole preprocessing sequence before the model.
pipe = Pipeline(steps=[
    ('tf1', trf1),
    ('tf2', trf2),
    ('tf3', MinMaxScaler()),  # or StandardScaler, or any other scaler
    # random_state pins the forest's randomness so that re-running the notebook
    # reproduces the same predictions (same seed as the train/test split).
    ('model', RandomForestRegressor(n_estimators=200, random_state=0)),
    # or LinearRegression, SVR, DecisionTreeRegressor, etc
])

In [18]:
# Fit every preprocessing step and the model on the training split only.
pipe.fit(X_train, y_train)




In [19]:
# Predict charges for the held-out rows; the fitted pipeline applies its preprocessing steps before the model.
preds = pipe.predict(X_test)


In [20]:
# Put actual and predicted charges side by side; rows keep y_test's original index labels.
comparison = pd.DataFrame({'original test set': y_test, 'predictions': preds})
comparison

Unnamed: 0,original test set,predictions
578,9724.53000,10520.937169
610,8547.69130,9488.320871
569,45702.02235,44628.971640
1034,12950.07120,13251.578393
198,9644.25250,10167.142421
...,...,...
1084,15019.76005,16438.845298
726,6664.68595,6590.712720
1132,20709.02034,11335.169687
725,40932.42950,42981.284276
