# Week 7 - Predicting Numerical Values

In [3]:
import pandas as pd 

## Load the Cleaned Data

Every column in this dataset is numerical, including the target.

In [4]:
# Read the cleaned dataset from disk, then print a structural summary
# (row count, column names, non-null counts, dtypes) to sanity-check the load.
clean_csv_path = './data/clean_df_small.csv'
data_df = pd.read_csv(clean_csv_path)
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   x0      1000 non-null   float64
 1   x1      1000 non-null   float64
 2   x2      1000 non-null   float64
 3   x3      1000 non-null   float64
 4   x4      1000 non-null   float64
 5   x5      1000 non-null   float64
 6   x6      1000 non-null   float64
 7   x7      1000 non-null   float64
 8   y       1000 non-null   int64  
dtypes: float64(8), int64(1)
memory usage: 70.4 KB


## Set the predictors (X) and the target (y). 

In [5]:
# Feature columns are named x0 .. x7; the target column is 'y'.
predictors = ['x{}'.format(i) for i in range(8)]
target = ['y']

# Selecting with a list of labels (even a one-element list) returns a
# DataFrame, so both X and y are DataFrames here, not Series.
X, y = data_df[predictors], data_df[target]

print('X:', X, '\n')
print('y:', y)

X:             x0         x1         x2        x3         x4         x5  \
0     0.198560  74.425320  67.627745 -3.095111  -6.822327  19.048071   
1   -29.662621  24.320711 -48.205182  1.430339  -6.552206   4.263074   
2    15.493759 -66.160459  50.512903 -2.265792  14.428578   2.509323   
3   -19.837651  33.210943  53.405563  1.079462  11.364251  -1.064581   
4    11.896655 -26.717872 -17.758176  1.692017  21.553537  -5.852097   
..         ...        ...        ...       ...        ...        ...   
995  10.168917 -53.467000   5.798938 -2.070763   6.978812  -9.610800   
996  10.888354 -16.407455 -18.555127 -1.471866   3.357271  17.756201   
997   4.194442  29.487012 -24.583970  1.216510   4.300679 -20.482651   
998  -5.593978 -51.214137 -30.648869 -0.456163  -6.769102 -14.417841   
999  30.962264 -37.445070 -23.288074  1.975351   3.268378 -22.832079   

            x6         x7  
0    -0.362378 -10.699174  
1     6.551412   4.265483  
2    -6.707536   3.820842  
3     9.308857   9.2

## Separate the Data

This splits the data into training and test sets. A single random split is not the most robust way to evaluate a model, but it is good enough for baseline modeling.

In [6]:
from sklearn.model_selection import train_test_split

# Hold out a random subset of rows for testing; the fixed seed keeps the split
# reproducible. test_size=0.25 spells out scikit-learn's default proportion.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

## Fit the Model

This is also known as training the model. K (the number of neighbors, `n_neighbors`) is set arbitrarily to 3 here, and a timer measures how long training takes.

In [7]:
from sklearn.neighbors import KNeighborsRegressor
import time

# K (n_neighbors) is an arbitrary starting value for this baseline, not a tuned one.
knn_algo = KNeighborsRegressor(n_neighbors=3)

# Time only the fit call. perf_counter() is a monotonic, high-resolution clock
# intended for measuring short intervals; time.time() is wall-clock time and
# can jump if the system clock is adjusted while the cell runs.
start = time.perf_counter()
knn_algo.fit(X_train, y_train)
end = time.perf_counter()

print("Elapsed time is  {}".format(end-start))

Elapsed time is  0.0030291080474853516


## Test the Model

Send the test data to the model for prediction. These are samples that were held out from the original data during the train/test split, so the model has not seen them.

In [8]:
# Ask the fitted model for predictions on the held-out test features.
y_pred_knn = knn_algo.predict(X=X_test)

## Combine the Data

This is a "bird's eye" view of the predictions compared to the actual values.

In [9]:
# Build the rounded predictions as their own Series instead of writing a new
# column into y_test: y_test comes out of train_test_split, and mutating it
# can raise SettingWithCopyWarning and leaves the target frame carrying
# predictions for every later cell.
# ravel() flattens the predictions whether predict() returned shape (n,) or (n, 1).
# Reusing y_test's index keeps each prediction aligned with its test row.
y_pred_rounded = pd.Series(
    y_pred_knn.ravel(), index=y_test.index, name='y_pred'
).round(0)

# Side-by-side view: test features, actual target, rounded prediction.
output_df = pd.concat([X_test, y_test[target], y_pred_rounded], axis=1)

# info() prints its summary itself and returns None, so wrapping it in print()
# would only add a stray "None" line to the output.
output_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 250 entries, 993 to 695
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   x0      250 non-null    float64
 1   x1      250 non-null    float64
 2   x2      250 non-null    float64
 3   x3      250 non-null    float64
 4   x4      250 non-null    float64
 5   x5      250 non-null    float64
 6   x6      250 non-null    float64
 7   x7      250 non-null    float64
 8   y       250 non-null    int64  
 9   y_pred  250 non-null    float64
dtypes: float64(9), int64(1)
memory usage: 21.5 KB
None


## Save the Output

In [10]:
# Write the combined output frame to CSV; index=False leaves the DataFrame
# index out of the file.
output_csv_path = './data/output_df.csv'
output_df.to_csv(output_csv_path, index=False)

## Calculate the Mean Squared Error

In [11]:
from sklearn.metrics import mean_squared_error
# MSE between the actual test targets and the raw (unrounded) KNN predictions,
# reported to two decimal places.
mse_knn = mean_squared_error(y_test['y'], y_pred_knn)
print('Mean squared error: ', round(mse_knn, 2))

Mean squared error:  1.87
