# Week 7 - Predicting Numerical Values

In [11]:
import pandas as pd 

## Load the Cleaned Data

The data is entirely numerical, including the target.

In [12]:
# Read the cleaned dataset from disk, then print a column/dtype summary.
data_df = pd.read_csv(filepath_or_buffer='./data/clean_large_df.csv')
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   x0      40000 non-null  float64
 1   x1      40000 non-null  float64
 2   x2      40000 non-null  float64
 3   x3      40000 non-null  float64
 4   x4      40000 non-null  float64
 5   x5      40000 non-null  float64
 6   x6      40000 non-null  float64
 7   x7      40000 non-null  float64
 8   y       40000 non-null  int64  
dtypes: float64(8), int64(1)
memory usage: 2.7 MB


## Set the predictors (X) and the target (y). 

In [13]:
# Feature columns are x0 .. x7; the single target column is y.
predictors = [f'x{i}' for i in range(8)]
# Kept as a one-element list so that y stays a one-column DataFrame
# (later cells index it by column name).
target = ['y']

X = data_df[predictors]
y = data_df[target]

# Show both frames for a quick visual sanity check.
print('X:', X, '\n')
print('y:', y)

X:               x0         x1         x2        x3         x4         x5  \
0       0.198560  74.425320  67.627745 -3.095111  -6.822327  19.048071   
1     -29.662621  24.320711 -48.205182  1.430339  -6.552206   4.263074   
2      15.493759 -66.160459  50.512903 -2.265792  14.428578   2.509323   
3     -19.837651  33.210943  53.405563  1.079462  11.364251  -1.064581   
4      11.896655 -26.717872 -17.758176  1.692017  21.553537  -5.852097   
...          ...        ...        ...       ...        ...        ...   
39995  -8.011289  63.861763  18.135762 -0.328773 -10.857936  14.997781   
39996  21.769395 -88.026423 -27.868860 -3.490932  -9.440718  -1.461733   
39997  18.096631  64.294604  -6.942556  1.563421   2.903092  -6.435025   
39998   3.457479 -29.455176 -22.391388  0.667397 -32.301045   9.979818   
39999  -7.258538  13.095298 -18.692615  2.321107  20.899080  10.305206   

              x6         x7  
0      -0.362378 -10.699174  
1       6.551412   4.265483  
2      -6.707536  

## Separate the Data

This splits the data into training and test sets. It is not the best way to do so, but it is good enough for baseline modeling.

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing (the same fraction scikit-learn uses
# when no size is given) and fix the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

## Fit the Model

Fitting is also known as training the model. K (`n_neighbors`) is set arbitrarily, and a timer records how long training takes.

In [15]:
from sklearn.neighbors import KNeighborsRegressor
import time

# K-nearest-neighbours regressor; K=3 is an arbitrary baseline choice, not a tuned value.
knn_algo = KNeighborsRegressor(n_neighbors=3)

# Time the fit with perf_counter(): it is a monotonic, high-resolution clock
# meant for measuring short intervals, whereas time.time() follows the wall
# clock and can be adjusted while the cell is running.
start = time.perf_counter()
knn_algo.fit(X_train, y_train)
end = time.perf_counter()

# Elapsed training time in seconds.
print("Elapsed time is  {}".format(end-start))

Elapsed time is  0.08078193664550781


## Test the Model

Send the unseen samples (i.e., the test data set aside from the original data during the train/test split) to the model.

In [16]:
# Ask the fitted model for predictions on the held-out test features.
y_pred_knn = knn_algo.predict(X=X_test)

## Combine the Data

This is a "bird's eye" view of the predictions compared to the actual values.

In [17]:
# Attach the predictions, rounded to whole numbers, as a new 'y_pred' column.
# .assign() builds a new frame instead of writing into the one returned by the
# split (in-place column assignment there can raise SettingWithCopyWarning),
# and re-running this cell simply overwrites the column with the same values.
# .ravel() flattens the predictions in case they come back as a single-column
# 2-D array (the model was fit on a one-column DataFrame target).
y_test = y_test.assign(y_pred=y_pred_knn.ravel().round(0))

# Side-by-side view: test features, actual target and rounded prediction.
output_df = pd.concat([X_test, y_test], axis=1)

# info() prints its summary itself and returns None, so it must not be wrapped
# in print() (that emitted a stray "None" line after the summary).
output_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 12836 to 5315
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   x0      10000 non-null  float64
 1   x1      10000 non-null  float64
 2   x2      10000 non-null  float64
 3   x3      10000 non-null  float64
 4   x4      10000 non-null  float64
 5   x5      10000 non-null  float64
 6   x6      10000 non-null  float64
 7   x7      10000 non-null  float64
 8   y       10000 non-null  int64  
 9   y_pred  10000 non-null  float64
dtypes: float64(9), int64(1)
memory usage: 859.4 KB
None


## Save the Output

In [18]:
# Persist the combined features / actuals / predictions table, without the row index.
output_df.to_csv(path_or_buf='./data/output_large_df.csv', index=False)

## Calculate the Mean Squared Error

In [19]:
from sklearn.metrics import mean_squared_error

# Score the raw (unrounded) KNN predictions against the actual target values;
# only the reported figure is rounded, to two decimal places.
mse_knn = mean_squared_error(y_test['y'], y_pred_knn)
print('Mean squared error: ', round(mse_knn, 2))

Mean squared error:  1.56
