In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import StandardScaler

import copy
import seaborn as sns
import tensorflow as tf
from sklearn.linear_model import LinearRegression

In [None]:
# Short column names applied to the frame after the unused columns are removed
dataset_cols = [
    "bike_count", "hour", "temp", "humidity", "wind", "visibility",
    "dew_pt_temp", "radiation", "rain", "snow", "functional",
]
# Read the raw CSV and discard the date / holiday / season columns
df = pd.read_csv("data/SeoulBikeData.csv").drop(columns=["Date", "Holiday", "Seasons"])
df.columns = dataset_cols

In [None]:
# Encode the "Yes"/other flag as 1/0
df["functional"] = df["functional"].eq("Yes").astype(int)
# Keep only the rows recorded at hour 12, then drop the now-constant hour column
df = df[df["hour"] == 12].drop(columns=["hour"])
df.head()

In [None]:
# One scatter plot per column after the first, each against the bike count
for label in df.columns[1:]:
    plt.scatter(x=df[label], y=df["bike_count"])
    plt.xlabel(label)
    plt.ylabel("Bike count at Noon")
    plt.title(label)
    plt.show()

In [None]:
# The scatter plots above suggest these columns are not very useful, so drop them
df = df.drop(columns=["visibility", "functional", "wind"])
df.head()

In [None]:
# Shuffle once with a fixed seed so the 60/20/20 split is reproducible across
# kernel restarts, then slice positionally (np.split on a DataFrame relies on
# the deprecated DataFrame.swapaxes path).
df_shuffled = df.sample(frac=1, random_state=42)
train_end = int(0.6 * len(df_shuffled))
val_end = int(0.8 * len(df_shuffled))
train = df_shuffled.iloc[:train_end]
val = df_shuffled.iloc[train_end:val_end]
test = df_shuffled.iloc[val_end:]

In [None]:
def get_xy(dataframe, y_label, x_labels=None):
    """Split a DataFrame into feature and target arrays.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source frame; it is not modified.
    y_label : str
        Name of the target column.
    x_labels : str, sequence of str, or None
        Feature column(s) to use. ``None`` selects every column except
        ``y_label``. A single string is treated as a one-column selection.

    Returns
    -------
    data : numpy.ndarray
        ``X`` and ``Y`` stacked horizontally, target in the last column.
    X : numpy.ndarray
        Feature matrix, always 2-D with one column per selected feature.
    Y : numpy.ndarray
        Target values as a column vector of shape ``(n_rows, 1)``.
    """
    if x_labels is None:
        feature_cols = [c for c in dataframe.columns if c != y_label]
    elif isinstance(x_labels, str):
        # A bare column name would otherwise be indexed character by character
        feature_cols = [x_labels]
    else:
        feature_cols = list(x_labels)

    # Selecting with a list always yields a 2-D array, so the single-feature
    # case needs no special reshape. Copies keep the outputs independent of
    # the caller's frame without deep-copying the entire DataFrame.
    X = dataframe[feature_cols].to_numpy(copy=True)
    Y = dataframe[y_label].to_numpy(copy=True).reshape(-1, 1)
    data = np.hstack((X, Y))
    return data, X, Y

In [None]:
# Temperature-only feature/target arrays for each of the three splits
(
    (_, X_train_temp, y_train_temp),
    (_, X_val_temp, y_val_temp),
    (_, X_test_temp, y_test_temp),
) = (get_xy(split, "bike_count", x_labels=["temp"]) for split in (train, val, test))

In [None]:
# Ordinary least squares on the single temperature feature
temp_reg = LinearRegression()
print(X_train_temp.shape)
temp_reg.fit(X_train_temp, y_train_temp)

In [None]:
# Fitted slope and intercept of the temperature-only model
print(temp_reg.coef_, temp_reg.intercept_)

In [None]:
# R^2 of the temperature-only model on the held-out test split
temp_reg.score(X_test_temp, y_test_temp)

In [None]:
# Training points with the fitted regression line drawn over a fixed temperature grid
plt.scatter(X_train_temp, y_train_temp, color="blue", label='Data')
x = np.array(tf.linspace(-20, 40, 100)).reshape(-1, 1)
plt.plot(x, temp_reg.predict(x), color="red", linewidth=3, label="Fit")
plt.legend()
plt.xlabel("Temp")
plt.ylabel("Number of bikes")
plt.title("Bikes vs temp")
plt.show()

# Multiple Linear Regression

In [None]:
# Preview the frame whose columns after the first are used as the feature set below
df.head()

In [None]:
# Feature/target arrays for each split using every column after the first as a feature
(
    (_, X_train_all, y_train_all),
    (_, X_val_all, y_val_all),
    (_, X_test_all, y_test_all),
) = (get_xy(split, "bike_count", x_labels=df.columns[1:]) for split in (train, val, test))

In [None]:
# Ordinary least squares on the full feature matrix
all_reg = LinearRegression()
all_reg.fit(X_train_all, y_train_all)

In [None]:
# Score of the multi-feature linear model on the test split
all_reg.score(X_test_all, y_test_all)

In [None]:
# Test-split predictions from the multi-feature linear model
y_pred_lr = all_reg.predict(X_test_all)


# Regression with neural net

In [None]:
# Normalize the temperature input; the statistics are learned from the
# flattened training temperatures
temp_normalizer = tf.keras.layers.Normalization(input_shape=(1,), axis=None)
temp_normalizer.adapt(X_train_temp.reshape(-1))

In [None]:
# Normalization followed by a single Dense(1) unit with no activation
temp_nn_model = tf.keras.Sequential([temp_normalizer, tf.keras.layers.Dense(1)])
temp_nn_model.compile(
    loss="mean_squared_error",
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.1),
)

In [None]:
# Train on the 2-D (n, 1) feature matrix as-is so the training input has the
# same rank as the validation input and matches the input_shape=(1,) declared
# on the normalizer; flattening it to 1-D relied on Keras re-expanding the
# array implicitly.
history = temp_nn_model.fit(
    X_train_temp, y_train_temp,
    validation_data=(X_val_temp, y_val_temp),
    epochs=1000,
    verbose=0,
)

In [None]:
def plot_loss(history):
    """Plot the training and validation loss curves recorded in ``history``.

    ``history`` must expose a ``history`` mapping with ``'loss'`` and
    ``'val_loss'`` entries, one value per epoch.
    """
    for series in ('loss', 'val_loss'):
        plt.plot(history.history[series], label=series)
    plt.xlabel('Epoch')
    plt.ylabel('MSE')
    plt.legend()
    plt.grid(True)
    plt.show()

# Loss curves for the single-neuron temperature model trained above
plot_loss(history)

In [None]:
# Training points with the single-neuron model's predictions over a fixed temperature grid
plt.scatter(X_train_temp, y_train_temp, color="blue", label='Data')
x = np.array(tf.linspace(-20, 40, 100)).reshape(-1, 1)
plt.plot(x, temp_nn_model.predict(x), color="red", linewidth=3, label="Fit")
plt.legend()
plt.xlabel("Temp")
plt.ylabel("Number of bikes")
plt.title("Bikes vs temp")
plt.show()

# Neural Net

In [None]:
# Fresh normalization layer for the deeper temperature-only network, adapted
# on the flattened training temperatures
temp_normalizer = tf.keras.layers.Normalization(input_shape=(1,), axis=None)
temp_normalizer.adapt(X_train_temp.reshape(-1))

In [None]:
# Two hidden ReLU layers of 32 units on top of the normalized temperature.
# NOTE(review): the output unit also uses ReLU, which clamps predictions to >= 0
# but gives zero gradient whenever its pre-activation is negative - confirm this
# is intended rather than a linear output.
hidden_units = 32
nn_model = tf.keras.Sequential(
    [
        temp_normalizer,
        tf.keras.layers.Dense(hidden_units, activation='relu'),
        tf.keras.layers.Dense(hidden_units, activation='relu'),
        tf.keras.layers.Dense(1, activation='relu'),
    ]
)

nn_model.compile(
    loss="mean_squared_error",
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
)

In [None]:
# Silent 1000-epoch training run; validation loss is tracked every epoch
history = nn_model.fit(
    X_train_temp,
    y_train_temp,
    epochs=1000,
    verbose=0,
    validation_data=(X_val_temp, y_val_temp),
)

In [None]:
# Loss curves for the temperature-only neural net trained above
plot_loss(history)

In [None]:
# Training points with the neural net's predictions over a fixed temperature grid
plt.scatter(X_train_temp, y_train_temp, color="blue", label='Data')
x = np.array(tf.linspace(-20, 40, 100)).reshape(-1, 1)
plt.plot(x, nn_model.predict(x), color="red", linewidth=3, label="Fit")
plt.legend()
plt.xlabel("Temp")
plt.ylabel("Number of bikes")
plt.title("Bikes vs temp")
plt.show()

In [None]:
# Per-feature normalization (axis=-1). The input width is taken from the
# training matrix instead of being hard-coded: after the column drops earlier
# in the notebook only 6 feature columns remain, so a literal 9 does not match
# the data the layer is adapted on.
all_normalizer = tf.keras.layers.Normalization(
    input_shape=(X_train_all.shape[1],),
    axis=-1,
)
all_normalizer.adapt(X_train_all)

In [None]:
# Same architecture as the temperature-only net, fed by the multi-feature
# normalizer; this rebinds nn_model to the new network.
# NOTE(review): the output unit uses ReLU, which clamps predictions to >= 0 but
# gives zero gradient whenever its pre-activation is negative - confirm intended.
hidden_units = 32
nn_model = tf.keras.Sequential(
    [
        all_normalizer,
        tf.keras.layers.Dense(hidden_units, activation='relu'),
        tf.keras.layers.Dense(hidden_units, activation='relu'),
        tf.keras.layers.Dense(1, activation='relu'),
    ]
)

nn_model.compile(
    loss="mean_squared_error",
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
)

In [None]:
# Silent 1000-epoch training run on all features; validation loss is tracked every epoch
history = nn_model.fit(
    X_train_all,
    y_train_all,
    epochs=1000,
    verbose=0,
    validation_data=(X_val_all, y_val_all),
)

In [None]:
# Loss curves for the multi-feature neural net trained above
plot_loss(history)

In [None]:
# Test-split predictions from both models, used for the MSE comparison below
y_pred_lr = all_reg.predict(X_test_all)
y_pred_nn = nn_model.predict(X_test_all)

In [None]:
def MSE(y_pred, y_real):
    """Return the mean squared error between predictions and true values.

    Parameters
    ----------
    y_pred, y_real : array-like
        Predicted and true values. Lists are accepted. When the shapes differ
        only by length-1 axes (e.g. ``(n,)`` versus ``(n, 1)``) both are
        squeezed first, so the difference is taken element-wise instead of
        being broadcast into an ``(n, n)`` matrix.

    Returns
    -------
    numpy.floating
        Mean of the squared element-wise differences.
    """
    pred = np.asarray(y_pred)
    real = np.asarray(y_real)
    if pred.shape != real.shape:
        squeezed_pred, squeezed_real = np.squeeze(pred), np.squeeze(real)
        # Align only when the mismatch is purely singleton axes; any other
        # mismatch keeps numpy's normal broadcasting (or its error).
        if squeezed_pred.shape == squeezed_real.shape:
            pred, real = squeezed_pred, squeezed_real
    return np.square(pred - real).mean()

In [None]:
# Test MSE: linear regression on the first line, neural net on the second
print(
    MSE(y_pred_lr, y_test_all),
    MSE(y_pred_nn, y_test_all),
    sep="\n",
)

In [None]:
# Predicted vs. true bike counts for both models on equal-aspect axes;
# the red diagonal marks perfect predictions.
ax = plt.axes(aspect="equal")
plt.scatter(y_test_all, y_pred_lr, label="Lin Reg Preds")
plt.scatter(y_test_all, y_pred_nn, label="NN Preds")
plt.xlabel("True Values")
plt.ylabel("Predictions")
lims = [0, 1800]
plt.xlim(lims)
plt.ylim(lims)
plt.legend()
plt.plot(lims, lims, c="red")