<img width="200" src="https://raw.githubusercontent.com/lukwies/mid-bootcamp-project/main/data/img/bikes.png">

---


# Bikesharing in Seoul / Prediction

---

### Sources

 * Data: https://archive.ics.uci.edu/ml/datasets/Seoul+Bike+Sharing+Demand
 * Image: https://global.chinadaily.com.cn/a/201801/25/WS5a69cab3a3106e7dcc136a6d.html

---

### Tasks

 

In [1]:
import pandas as pd
import numpy as np
import yaml
import pickle
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler, PowerTransformer, MinMaxScaler
from sklearn.neighbors import KNeighborsRegressor

#### Load YAML config file

In [2]:
# Read the project configuration (paths, split settings, ...) from params.yaml.
# The encoding is given explicitly so the file is decoded the same way on
# every platform instead of depending on the locale's default encoding.
with open('../params.yaml', encoding='utf-8') as config_file:
    config = yaml.safe_load(config_file)

#### Load cleaned dataset

In [3]:
# Path of the cleaned dataset is taken from the YAML config loaded above.
csv_cleaned_path = config['data']['csv_cleaned']
df = pd.read_csv(csv_cleaned_path)

#### Let's drop the columns we don't need for prediction (date, daytime, seasons, functioning_day, temperature_type, solar_radiation)

In [4]:
# Columns that are not used as model features in this notebook.
unused_columns = [
    'date',
    'daytime',
    'seasons',
    'functioning_day',
    'temperature_type',
    'solar_radiation',
]
df = df.drop(columns=unused_columns)

### Do X/y-split

In [5]:
# The rental count is the prediction target; every other column is a feature.
target_column = 'rented_bike_count'
y = df[target_column]
X = df.drop(columns=[target_column])

### Do train/test-split

In [6]:
# Hold out a test set. Test size and random seed are read from the config
# so the split is controlled from params.yaml rather than hard-coded here.
split_cfg = config['testsplit']
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=split_cfg['test_size'],
    random_state=split_cfg['random_state'],
)

# Split both sets into numerical and categorical columns.
# Every non-numerical column is treated as categorical (instead of selecting
# only `object` dtype), so a feature stored as e.g. category, bool or a
# dedicated string dtype is not silently left out of both groups.
X_train_num = X_train.select_dtypes(include=np.number)
X_train_cat = X_train.select_dtypes(exclude=np.number)

X_test_num = X_test.select_dtypes(include=np.number)
X_test_cat = X_test.select_dtypes(exclude=np.number)

In [7]:
# Preview the first rows of the numerical training features.
X_train_num.head(n=5)

Unnamed: 0,month,hour,weekday,temperature,humidity,wind_speed,visibility,rainfall,snowfall
1444,1,4,1,-11.1,50,1.2,1986,0.0,0.0
1652,2,20,2,-5.8,44,2.1,1994,0.0,0.0
7496,10,8,1,11.4,66,0.8,1991,0.0,0.0
1893,2,21,5,-2.3,38,2.3,2000,0.0,0.0
3880,5,16,4,19.1,54,3.2,542,0.0,0.0


In [8]:
# Preview the first rows of the categorical training features.
X_train_cat.head(n=5)

Unnamed: 0,holiday
1444,No
1652,No
7496,Yes
1893,Yes
3880,No


<br>

### Apply OneHotEncoder on categorical columns

In [9]:
# Fit the encoder on the training categories only; the test set is encoded
# with the categories learned from the training data.
ohe = OneHotEncoder(drop='first').fit(X_train_cat)
encoded_columns = ohe.get_feature_names_out()

# The encoder returns a sparse matrix; convert it to a dense array and wrap
# it in a DataFrame that keeps the original row index, so it can later be
# concatenated with the scaled numerical columns.
X_train_cat_enc = pd.DataFrame(
    ohe.transform(X_train_cat).toarray(),
    columns=encoded_columns,
    index=X_train_cat.index,
)
X_test_cat_enc = pd.DataFrame(
    ohe.transform(X_test_cat).toarray(),
    columns=encoded_columns,
    index=X_test_cat.index,
)

#### Store OneHotEncoder to file

In [10]:
# Persist the fitted encoder at the path configured in params.yaml.
encoder_path = config['encoder']['onehot']
with open(encoder_path, mode='wb') as encoder_file:
    pickle.dump(ohe, encoder_file, protocol=pickle.HIGHEST_PROTOCOL)

<br>

### Apply StandardScaler to numerical columns

In [11]:
# Fit the scaler on the training data only, then apply it to both sets.
stdScaler = StandardScaler().fit(X_train_num)

# transform() returns a plain array; rebuild DataFrames with the original
# column names and row index.
X_train_num_scaled = pd.DataFrame(
    stdScaler.transform(X_train_num),
    columns=X_train_num.columns,
    index=X_train_num.index,
)
X_test_num_scaled = pd.DataFrame(
    stdScaler.transform(X_test_num),
    columns=X_test_num.columns,
    index=X_test_num.index,
)

#### Store StandardScaler to file

In [12]:
# Persist the fitted StandardScaler at the path configured in params.yaml.
standard_scaler_path = config['scaler']['standard']
with open(standard_scaler_path, mode='wb') as scaler_file:
    pickle.dump(stdScaler, scaler_file, protocol=pickle.HIGHEST_PROTOCOL)

<br>

### Apply MinMaxScaler to the standardized numerical columns

In [13]:
# The MinMaxScaler is fitted on the *standardized* training columns, i.e. it
# is stacked on top of the StandardScaler output.
# NOTE: this cell overwrites X_train_num_scaled / X_test_num_scaled. Re-run
# the StandardScaler cell before re-running this one, otherwise the scaler is
# fitted on data that has already been min-max scaled.
minmax = MinMaxScaler().fit(X_train_num_scaled)

X_train_num_scaled = pd.DataFrame(
    minmax.transform(X_train_num_scaled),
    columns=X_train_num.columns,
    index=X_train_num.index,
)
X_test_num_scaled = pd.DataFrame(
    minmax.transform(X_test_num_scaled),
    columns=X_test_num.columns,
    index=X_test_num.index,
)

#### Store MinMaxScaler to file

In [14]:
# Persist the fitted MinMaxScaler at the path configured in params.yaml.
minmax_scaler_path = config['scaler']['minmax']
with open(minmax_scaler_path, mode='wb') as scaler_file:
    pickle.dump(minmax, scaler_file, protocol=pickle.HIGHEST_PROTOCOL)

#### Build final dataframes

In [15]:
# Join encoded categorical and scaled numerical columns side by side.
# Both parts carry the original row index, so rows are aligned on it.
train_parts = [X_train_cat_enc, X_train_num_scaled]
test_parts = [X_test_cat_enc, X_test_num_scaled]

X_train_final = pd.concat(train_parts, axis='columns')
X_test_final = pd.concat(test_parts, axis='columns')

<br>

### Apply KNeighborsRegressor

In [16]:
# Number of neighbours used for each prediction.
n_neighbors = 5

# weights='distance': closer neighbours contribute more to the prediction.
knn = KNeighborsRegressor(n_neighbors=n_neighbors, weights='distance')
knn.fit(X_train_final, y_train)

#### Store KNN model to file

In [17]:
# Persist the fitted KNN model at the path configured in params.yaml.
knn_model_path = config['model']['KNN']
with open(knn_model_path, mode='wb') as model_file:
    pickle.dump(knn, model_file, protocol=pickle.HIGHEST_PROTOCOL)

<br>

### Validate model

In [18]:
# Predictions on both sets; they are reused by the plots further down.
y_train_pred, y_test_pred = (
    knn.predict(features) for features in (X_train_final, X_test_final)
)

# Model score (plotted as R2 later in this notebook) on both sets.
# NOTE(review): the train score is computed on the same rows the model was
# fitted on, so it mainly serves as a sanity check; the test score is the
# one that reflects performance on unseen data.
train_score, test_score = (
    knn.score(features, target)
    for features, target in ((X_train_final, y_train), (X_test_final, y_test))
)

In [19]:
# Report both scores, one per line.
print(f'Train score: {train_score}',
      f'Test score: {test_score}',
      sep='\n')

Train score: 1.0
Test score: 0.7591879129374568


In [None]:
# Predicted vs. actual rentals for the train and the test set.
# Points on the red diagonal are perfect predictions.
fig, axes = plt.subplots(1, 2, figsize=(8, 4))

panels = [
    ('Prediction on train set', y_train, y_train_pred),
    ('Prediction on test set', y_test, y_test_pred),
]
for panel_ax, (title, y_true, y_pred) in zip(axes, panels):
    sns.scatterplot(x=y_true, y=y_pred, ax=panel_ax)

    # Draw the identity line directly from its two end points. Using
    # sns.lineplot(x=y, y=y) for this would aggregate the values per x and
    # estimate confidence intervals just to draw a straight line.
    lo, hi = y_true.min(), y_true.max()
    panel_ax.plot([lo, hi], [lo, hi], color='red')

    panel_ax.set(title=title, xlabel='y', ylabel='y-predicted')

plt.tight_layout()
plt.show()

In [None]:
# Real vs. predicted rentals as a function of temperature.
# X_train_final / X_test_final hold the scaled features, so the x-axis is in
# scaled units and is labelled accordingly.
feature = 'temperature'
panels = [
    (X_train_final, y_train, 'Real bike rentals'),
    (X_train_final, y_train_pred, 'Predicted train bike rentals'),
    (X_test_final, y_test_pred, 'Predicted test bike rentals'),
]

fig, axes = plt.subplots(1, 3)
for panel_ax, (features, rentals, ylabel) in zip(axes, panels):
    sns.lineplot(data=features, x=feature, y=rentals, ax=panel_ax)
    # Same y-range on every panel so the curves are directly comparable.
    panel_ax.set(xlabel=f'{feature} (scaled)', ylabel=ylabel, ylim=(0, 3000))
    panel_ax.grid()

plt.tight_layout()
plt.show()

In [None]:
# Real vs. predicted rentals as a function of the hour of day.
# X_train_final / X_test_final hold the scaled features, so the x-axis is in
# scaled units and is labelled accordingly.
feature = 'hour'
panels = [
    (X_train_final, y_train, 'Real bike rentals'),
    (X_train_final, y_train_pred, 'Predicted train bike rentals'),
    (X_test_final, y_test_pred, 'Predicted test bike rentals'),
]

fig, axes = plt.subplots(1, 3)
for panel_ax, (features, rentals, ylabel) in zip(axes, panels):
    sns.lineplot(data=features, x=feature, y=rentals, ax=panel_ax)
    # Same y-range on every panel so the curves are directly comparable.
    panel_ax.set(xlabel=f'{feature} (scaled)', ylabel=ylabel, ylim=(0, 3000))
    panel_ax.grid()

plt.tight_layout()
plt.show()

In [None]:
# Sweep the number of neighbours K and record the test-set score for each.
# The K range is defined once so the loop, the plotted x-values and the axis
# ticks cannot drift apart.
# NOTE(review): K is compared on the test set here, so the best score is an
# optimistic estimate; consider cross-validation on the training set instead.
# NOTE(review): these models use the default neighbour weighting, unlike the
# stored model above (weights='distance') -- confirm this is intended.
k_values = range(2, 30)

scores = []
for k in k_values:
    model = KNeighborsRegressor(n_neighbors=k)
    model.fit(X_train_final, y_train)

    score = model.score(X_test_final, y_test)
    scores.append(score)
    print(f'Neighbours={k} score={score}')

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(k_values, scores, color='blue', linestyle='dashed',
        marker='o', markerfacecolor='red', markersize=10)
ax.set(title='R2-scores vs. K Value', xlabel='K', ylabel='R2')
ax.set_xticks(list(k_values))
plt.show()