In [3]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns 
sns.set_style('darkgrid')


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline 
from sklearn.ensemble import GradientBoostingRegressor


In [25]:
data= pd.read_csv('Advertising.csv')

In [26]:
data

Unnamed: 0.1,Unnamed: 0,TV,radio,newspaper,sales
0,1,230.1,37.8,69.2,22.1
1,2,44.5,39.3,45.1,10.4
2,3,17.2,45.9,69.3,9.3
3,4,151.5,41.3,58.5,18.5
4,5,180.8,10.8,58.4,12.9
...,...,...,...,...,...
195,196,38.2,3.7,13.8,7.6
196,197,94.2,4.9,8.1,9.7
197,198,177.0,9.3,6.4,12.8
198,199,283.6,42.0,66.2,25.5


In [27]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  200 non-null    int64  
 1   TV          200 non-null    float64
 2   radio       200 non-null    float64
 3   newspaper   200 non-null    float64
 4   sales       200 non-null    float64
dtypes: float64(4), int64(1)
memory usage: 7.9 KB


In [37]:
def preprocess_inputs(df):
    df=df.copy()
    
    missing_target_rows=df[df['sales'].isna()].index
    df= df.drop(missing_target_rows, axis=0).reset_index(drop=True)
    
    for column in ['TV','radio','newspaper']:
        df[column]= df[column].fillna(df[column].mean())
                                                        
    y=df['sales']
    x=df.drop('sales',axis=1)
    
    x_train,x_test,y_train,y_test=train_test_split(x,y, train_size=0.7, shuffle=True, random_state=1)
    
                                                        
    return x_train,x_test,y_train,y_test

In [41]:
x_train,x_test,y_train,y_test= preprocess_inputs(data)
x_train

Unnamed: 0.1,Unnamed: 0,TV,radio,newspaper
116,117,139.2,14.3,25.6
67,68,139.3,14.5,10.2
78,79,5.4,29.9,9.4
42,43,293.6,27.7,1.8
17,18,281.4,39.6,55.8
...,...,...,...,...
133,134,219.8,33.5,45.1
137,138,273.7,28.9,59.7
72,73,26.8,33.0,19.3
140,141,73.4,17.0,12.9


In [42]:
y_train

116    12.2
67     13.4
78      5.3
42     20.7
17     24.4
       ... 
133    19.6
137    20.8
72      8.8
140    10.9
37     14.7
Name: sales, Length: 140, dtype: float64

In [43]:
x_train.isna().sum()

Unnamed: 0    0
TV            0
radio         0
newspaper     0
dtype: int64

In [50]:
x_train['radio'].isna().sum()

0

In [53]:
x_train['radio'].isna().index

Int64Index([116,  67,  78,  42,  17,   5, 127, 105,  48,  66,
            ...
             71, 129, 144, 192,  79, 133, 137,  72, 140,  37],
           dtype='int64', length=140)

In [57]:
x_test

Unnamed: 0.1,Unnamed: 0,TV,radio,newspaper
58,59,210.8,49.6,37.7
40,41,202.5,22.3,31.6
34,35,95.7,1.4,7.4
102,103,280.2,10.1,21.4
184,185,253.8,21.3,30.0
198,199,283.6,42.0,66.2
95,96,163.3,31.6,52.9
4,5,180.8,10.8,58.4
29,30,70.6,16.0,40.8
168,169,215.4,23.6,57.6


# Training 

In [54]:
nominal_transformer= Pipeline(steps=[
    ('onehot', OneHotEncoder(sparse=False))
])

preprocessor= ColumnTransformer(transformers=[
    ('nominal', nominal_transformer, ['newspaper'])
], remainder='passthrough')

model = Pipeline(steps=[
    ('preprocessor',preprocessor),
    ('regressor',GradientBoostingRegressor())
])

In [55]:
model.fit(x_train,y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('nominal',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(sparse=False))]),
                                                  ['newspaper'])])),
                ('regressor', GradientBoostingRegressor())])

# Results

In [74]:
y_pred = model.predict(x_test)

rmse=np.sqrt(np.mean((y_test - y_pred)**2))
r2 = 1-np.sum((y_test-y_pred)**2) / np.sum((y_test-y_test.mean())**2)

print("RMSE: {:.2f}",format(rmse))
print("R^2: {:.4f}",format(r2))

plt.figure(figsize=(10,10))
sns.scatterplot(x=y_pred,y=y_test)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Actual vs. Predicted values")
plt.show()


ValueError: Found unknown categories [0.3, 3.6, 4.0, 10.9, 12.4, 17.0, 17.6, 18.5, 19.1, 19.6, 20.3, 20.5, 22.0, 22.9, 23.4, 27.2, 27.4, 31.3, 32.0, 34.5, 35.1, 35.2, 38.6, 40.8, 41.4, 43.3, 46.0, 47.4, 51.2, 51.4, 56.5, 57.6, 58.4, 58.7, 66.2, 74.2, 79.2, 84.8, 114.0] in column 0 during transform

In [59]:
np.array(y_test)

array([23.8, 16.6,  9.5, 14.8, 17.6, 25.5, 16.9, 12.9, 10.5, 17.1, 14.5,
       11.3, 17.4, 16.7, 13.4, 15.9, 12.9, 12.8,  9.5, 18.4, 10.7, 12.5,
        8.5, 11.5, 11.9, 14.9, 10.1, 18.9, 19.6, 15.9, 23.2, 11.9, 17.3,
       11.7, 20.2, 15.5, 11.5, 11. , 22.3,  7.6,  5.3,  8.7,  6.7, 19. ,
        5.5, 14.6, 14.6, 21.5, 22.6, 19.7, 25.4, 15.2,  6.6, 21.2, 17.4,
       12.6, 12.2,  7.2, 13.4,  9.6])

In [60]:
y_pred

NameError: name 'y_pred' is not defined

In [64]:
rmse=np.sqrt(np.mean((y_test - y_pred)**2))

NameError: name 'y_pred' is not defined

In [66]:
np.sum((y_test-y_test.mean())**2)

1492.6018333333332

In [69]:
r2 = 1-np.sum((y_test-y_pred)**2) / np.sum((y_test-y_test.mean())**2)

NameError: name 'y_pred' is not defined