In [1]:
import numpy as np
import pandas as pd
from random import randint

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt


Create an 10x8 dataframe with random scores from 0 to 5 

In [2]:
ratings = {
    "col1": [randint(0,5) for i in range(10)],
    "col2": [randint(0,5) for i in range(10)],
    "col3": [randint(0,5) for i in range(10)],
    "col4": [randint(0,5) for i in range(10)],
    "col5": [randint(0,5) for i in range(10)],
    "col6": [randint(0,5) for i in range(10)],
    "col7": [randint(0,5) for i in range(10)],
    "col8": [randint(0,5) for i in range(10)],
}

In [44]:
df = pd.DataFrame(ratings, header=True)
df

TypeError: __init__() got an unexpected keyword argument 'header'

In order to add some null values, we choose at randome about 15% to be NaN

In [4]:
np.random.seed(111)
mask = np.random.choice([True, False], size=df.shape, p=[0.15, 0.85])
df = df.mask(mask)    

In [5]:
df

Unnamed: 0,col1,col2,col3,col4,col5,col6,col7,col8
0,5.0,3.0,0.0,1.0,2.0,,,4.0
1,2.0,2.0,3.0,1.0,,4.0,0.0,2.0
2,5.0,,,1.0,2.0,1.0,1.0,5.0
3,0.0,5.0,2.0,,5.0,,5.0,3.0
4,0.0,0.0,4.0,,3.0,2.0,1.0,2.0
5,3.0,3.0,0.0,2.0,3.0,4.0,,1.0
6,0.0,1.0,4.0,0.0,5.0,5.0,,
7,,,,1.0,3.0,3.0,3.0,
8,1.0,1.0,4.0,0.0,0.0,1.0,5.0,3.0
9,,4.0,5.0,3.0,1.0,0.0,1.0,4.0


From here the dataframe is split into training and test sets on an 80/20 basis.

In [6]:
train, test = train_test_split(df, test_size=0.2)

The next step is to calculate the raw averages of the training and test sets. First we will work on the training set.

In [7]:
train

Unnamed: 0,col1,col2,col3,col4,col5,col6,col7,col8
0,5.0,3.0,0.0,1.0,2.0,,,4.0
7,,,,1.0,3.0,3.0,3.0,
4,0.0,0.0,4.0,,3.0,2.0,1.0,2.0
1,2.0,2.0,3.0,1.0,,4.0,0.0,2.0
3,0.0,5.0,2.0,,5.0,,5.0,3.0
2,5.0,,,1.0,2.0,1.0,1.0,5.0
5,3.0,3.0,0.0,2.0,3.0,4.0,,1.0
6,0.0,1.0,4.0,0.0,5.0,5.0,,


In [8]:
# sum the rows, them sum the row sums
# then divide the total by the non-NaN count to obtain the raw avg
train_row_sum = train.sum(axis=1)
df_train_sum = train_row_sum.sum()
train_raw_avg = df_train_sum / train.count().sum()
train_raw_avg

2.3877551020408165

The raw average for the training set is output above. Next we will impute (replace) any NaN values in the training set with this number

In [9]:
train_imputed = train.fillna(train_raw_avg)
train_imputed

Unnamed: 0,col1,col2,col3,col4,col5,col6,col7,col8
0,5.0,3.0,0.0,1.0,2.0,2.387755,2.387755,4.0
7,2.387755,2.387755,2.387755,1.0,3.0,3.0,3.0,2.387755
4,0.0,0.0,4.0,2.387755,3.0,2.0,1.0,2.0
1,2.0,2.0,3.0,1.0,2.387755,4.0,0.0,2.0
3,0.0,5.0,2.0,2.387755,5.0,2.387755,5.0,3.0
2,5.0,2.387755,2.387755,1.0,2.0,1.0,1.0,5.0
5,3.0,3.0,0.0,2.0,3.0,4.0,2.387755,1.0
6,0.0,1.0,4.0,0.0,5.0,5.0,2.387755,2.387755


Lastly an equivalent sized dataframe is created full of raw averages in order to calculate the root mean square error (RMSE).

In [10]:
train_avg = {
    "col1": [train_raw_avg for i in range(8)],
    "col2": [train_raw_avg for i in range(8)],
    "col3": [train_raw_avg for i in range(8)],
    "col4": [train_raw_avg for i in range(8)],
    "col5": [train_raw_avg for i in range(8)],
    "col6": [train_raw_avg for i in range(8)],
    "col7": [train_raw_avg for i in range(8)],
    "col8": [train_raw_avg for i in range(8)],
}

train_raw_avg_df = pd.DataFrame(train_avg)
train_raw_avg_df

Unnamed: 0,col1,col2,col3,col4,col5,col6,col7,col8
0,2.387755,2.387755,2.387755,2.387755,2.387755,2.387755,2.387755,2.387755
1,2.387755,2.387755,2.387755,2.387755,2.387755,2.387755,2.387755,2.387755
2,2.387755,2.387755,2.387755,2.387755,2.387755,2.387755,2.387755,2.387755
3,2.387755,2.387755,2.387755,2.387755,2.387755,2.387755,2.387755,2.387755
4,2.387755,2.387755,2.387755,2.387755,2.387755,2.387755,2.387755,2.387755
5,2.387755,2.387755,2.387755,2.387755,2.387755,2.387755,2.387755,2.387755
6,2.387755,2.387755,2.387755,2.387755,2.387755,2.387755,2.387755,2.387755
7,2.387755,2.387755,2.387755,2.387755,2.387755,2.387755,2.387755,2.387755


Lastly the RMSE for the training set is output below:

In [12]:
training_rmse = sqrt(mean_squared_error(train_imputed, train_raw_avg_df))
training_rmse

1.4557679087277726

Now let's go to work on the test set...

In [13]:
test

Unnamed: 0,col1,col2,col3,col4,col5,col6,col7,col8
9,,4.0,5.0,3.0,1.0,0.0,1.0,4.0
8,1.0,1.0,4.0,0.0,0.0,1.0,5.0,3.0


In [38]:
# my method for finding the raw avgs. Sum the rows, then sum the column
test_row_sum = test.sum(axis=1)
df_test_sum = test_row_sum.sum()
test_raw_avg = df_test_sum / test.count().sum()
test_raw_avg

2.2

As we did with the training set, we impute the raw average over NaNs in the test set

In [15]:
test_imputed = test.fillna(test_raw_avg)
test_imputed

Unnamed: 0,col1,col2,col3,col4,col5,col6,col7,col8
9,2.2,4.0,5.0,3.0,1.0,0.0,1.0,4.0
8,1.0,1.0,4.0,0.0,0.0,1.0,5.0,3.0


Second to last, we create a new dataframe with the raw averages of the test set in order to calcluate the RMSE

In [16]:
test_avg = {
    "col1": [test_raw_avg for i in range(2)],
    "col2": [test_raw_avg for i in range(2)],
    "col3": [test_raw_avg for i in range(2)],
    "col4": [test_raw_avg for i in range(2)],
    "col5": [test_raw_avg for i in range(2)],
    "col6": [test_raw_avg for i in range(2)],
    "col7": [test_raw_avg for i in range(2)],
    "col8": [test_raw_avg for i in range(2)],
}

test_raw_avg_df = pd.DataFrame(test_avg)
test_raw_avg_df

Unnamed: 0,col1,col2,col3,col4,col5,col6,col7,col8
0,2.2,2.2,2.2,2.2,2.2,2.2,2.2,2.2
1,2.2,2.2,2.2,2.2,2.2,2.2,2.2,2.2


Lastly the RMSE is calculated when we subtract the imputed dataframe from the raw average and is outputted below.

In [17]:
test_rmse = sqrt(mean_squared_error(test_imputed, test_raw_avg_df))
test_rmse

1.7392527130926085

In [25]:
train

Unnamed: 0,col1,col2,col3,col4,col5,col6,col7,col8
0,5.0,3.0,0.0,1.0,2.0,,,4.0
7,,,,1.0,3.0,3.0,3.0,
4,0.0,0.0,4.0,,3.0,2.0,1.0,2.0
1,2.0,2.0,3.0,1.0,,4.0,0.0,2.0
3,0.0,5.0,2.0,,5.0,,5.0,3.0
2,5.0,,,1.0,2.0,1.0,1.0,5.0
5,3.0,3.0,0.0,2.0,3.0,4.0,,1.0
6,0.0,1.0,4.0,0.0,5.0,5.0,,


In [29]:
print(train.sum(axis=0), train.count(axis=0))

col1    15.0
col2    14.0
col3    13.0
col4     6.0
col5    23.0
col6    19.0
col7    10.0
col8    17.0
dtype: float64 col1    7
col2    6
col3    6
col4    6
col5    7
col6    6
col7    5
col8    6
dtype: int64


In [68]:
#def get_bias_values(train_df, mean):
    
train_col_bias = (train.sum(axis=0) / train.count(axis=0)) - train_raw_avg
train_col_bias

col1   -0.244898
col2   -0.054422
col3   -0.221088
col4   -1.387755
col5    0.897959
col6    0.778912
col7   -0.387755
col8    0.445578
dtype: float64

In [32]:
train_row_bias = (train.sum(axis=1) / train.count(axis=1)) - test_raw_avg
train_row_bias

0    0.300000
7    0.300000
4   -0.485714
1   -0.200000
3    1.133333
2    0.300000
5    0.085714
6    0.300000
dtype: float64

In [65]:
baseline_train = {
    "col1":[(train_col_bias[0] + i + train_raw_avg) for i in train_row_bias],
    "col2":[(train_col_bias[1] + i + train_raw_avg) for i in train_row_bias],
    "col3":[(train_col_bias[2] + i + train_raw_avg) for i in train_row_bias],
    "col4":[(train_col_bias[3] + i + train_raw_avg) for i in train_row_bias],
    "col5":[(train_col_bias[4] + i + train_raw_avg) for i in train_row_bias],
    "col6":[(train_col_bias[5] + i + train_raw_avg) for i in train_row_bias],
    "col7":[(train_col_bias[6] + i + train_raw_avg) for i in train_row_bias],
    "col8":[(train_col_bias[7] + i + train_raw_avg) for i in train_row_bias],
}

baseline_train_df = pd.DataFrame(baseline_train)
baseline_train_df

Unnamed: 0,col1,col2,col3,col4,col5,col6,col7,col8
0,2.442857,2.633333,2.466667,1.3,3.585714,3.466667,2.3,3.133333
1,2.442857,2.633333,2.466667,1.3,3.585714,3.466667,2.3,3.133333
2,1.657143,1.847619,1.680952,0.514286,2.8,2.680952,1.514286,2.347619
3,1.942857,2.133333,1.966667,0.8,3.085714,2.966667,1.8,2.633333
4,3.27619,3.466667,3.3,2.133333,4.419048,4.3,3.133333,3.966667
5,2.442857,2.633333,2.466667,1.3,3.585714,3.466667,2.3,3.133333
6,2.228571,2.419048,2.252381,1.085714,3.371429,3.252381,2.085714,2.919048
7,2.442857,2.633333,2.466667,1.3,3.585714,3.466667,2.3,3.133333


In [67]:
baseline_train_rmse = sqrt(mean_squared_error(baseline_train_df, train_raw_avg_df))
baseline_train_rmse

0.8405465961035141

In [69]:
test_col_bias = (test.sum(axis=0) / test.count(axis=0)) - test_raw_avg
test_col_bias

col1   -1.2
col2    0.3
col3    2.3
col4   -0.7
col5   -1.7
col6   -1.7
col7    0.8
col8    1.3
dtype: float64

In [70]:
test_row_bias = (test.sum(axis=1) / test.count(axis=1) - test_raw_avg)
test_row_bias

9    0.371429
8   -0.325000
dtype: float64

In [74]:
baseline_test = {
    "col1":[(test_col_bias[0] + i + test_raw_avg) for i in test_row_bias],
    "col2":[(test_col_bias[1] + i + test_raw_avg) for i in test_row_bias],
    "col3":[(test_col_bias[2] + i + test_raw_avg) for i in test_row_bias],
    "col4":[(test_col_bias[3] + i + test_raw_avg) for i in test_row_bias],
    "col5":[(test_col_bias[4] + i + test_raw_avg) for i in test_row_bias],
    "col6":[(test_col_bias[5] + i + test_raw_avg) for i in test_row_bias],
    "col7":[(test_col_bias[6] + i + test_raw_avg) for i in test_row_bias],
    "col8":[(test_col_bias[7] + i + test_raw_avg) for i in test_row_bias],
}


baseline_test_df = pd.DataFrame(baseline_test)
baseline_test_df

Unnamed: 0,col1,col2,col3,col4,col5,col6,col7,col8
0,1.371429,2.871429,4.871429,1.871429,0.871429,0.871429,3.371429,3.871429
1,0.675,2.175,4.175,1.175,0.175,0.175,2.675,3.175


In [75]:
baseline_test_rmse = sqrt(mean_squared_error(baseline_test_df, test_raw_avg_df))
baseline_test_rmse

1.4303181285922346

In [89]:
print(f"Original training set RMSE: {training_rmse}",
     f"\nBaseline predictors training set RMSE: {baseline_train_rmse}")

Original training set RMSE: 1.4557679087277726 
Baseline predictors training set RMSE: 0.8405465961035141


In [90]:
print(f"Original test set RMSE: {test_rmse}",
     f"\nBaseline predictors test set RMSE: {baseline_test_rmse}")

Original test set RMSE: 1.7392527130926085 
Baseline predictors test set RMSE: 1.4303181285922346
