In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
data=pd.read_csv("/kaggle/input/bikeshare-data/bike_share.csv")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, KFold, cross_val_score
%matplotlib inline 

In [None]:
data.head(10)

In [None]:
data.info()

In [None]:
data.duplicated().sum()

In [None]:
# drop_duplicates() returns a new frame, so the result must be rebound to keep
# the de-duplicated rows. The index is reset so that later positional row
# lookups (e.g. the K-fold split indices) stay aligned with the row labels.
data = data.drop_duplicates().reset_index(drop=True)
data.shape

In [None]:
data.isnull().sum()

In [None]:
data.describe().T

In [None]:
sns.pairplot(data)

In [None]:
# Annotated heatmap of the pairwise column correlations, drawn on an
# explicitly created 20x15 figure.
fig, ax = plt.subplots(figsize=(20, 15))
sns.heatmap(data.corr(), annot=True, ax=ax)

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
X = data.drop(columns="count")

In [None]:
# Variance inflation factor for every predictor column of X, one row per feature.
# NOTE(review): no intercept column is added to X before the VIF call — confirm this is intended.
vif_data = pd.DataFrame({
    "feature": X.columns,
    "VIF": [
        variance_inflation_factor(X.values, col_idx)
        for col_idx in range(X.shape[1])
    ],
})

print(vif_data)

In [None]:
# atemp and temp are highly collinear (see the VIF table above), so temp is dropped from the predictors.

In [None]:
X = data.drop(columns=["count","temp"])

In [None]:
# Recompute the variance inflation factor of each column of the current X.
vif_scores = [
    variance_inflation_factor(X.values, position)
    for position, _ in enumerate(X.columns)
]
vif_data = pd.DataFrame({"feature": X.columns}).assign(VIF=vif_scores)

print(vif_data)

In [None]:
X = data.drop(columns=["count","temp"])

In [None]:
y = data["count"]

In [None]:
model = LinearRegression()

In [None]:
model.fit(X,y)

In [None]:
model.coef_


In [None]:
model.intercept_

In [None]:
y_pred = model.predict(X)

In [None]:
y_pred

In [None]:
plt.scatter(data["count"],y_pred, color='black')
plt.show()

In [None]:
print("MSE : ",mean_squared_error(y,y_pred))

In [None]:
print("MAE : ",mean_absolute_error(y,y_pred))

In [None]:
print("RMSE : ",np.sqrt(mean_squared_error(y,y_pred)))

In [None]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error of predictions, in percent.

    Both inputs are converted to NumPy arrays. The result is the mean of
    |(y_true - y_pred) / y_true| multiplied by 100.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_errors = (actual - predicted) / actual
    return np.abs(relative_errors).mean() * 100

In [None]:
print("MAPE : ",mean_absolute_percentage_error(y,y_pred))

In [None]:
def linear_best_fit(X,y):
    """Fit a LinearRegression on (X, y) and print its errors on that same data.

    The model is both fitted and scored on the arguments passed in, so the
    printed MSE, MAE, RMSE and MAPE are in-sample errors for whatever split
    is supplied. Nothing is returned.

    MAPE comes from mean_absolute_percentage_error, defined in an earlier
    cell; the identical second definition that used to follow this function
    was removed so only one definition of that name exists.
    """
    model = LinearRegression()
    model.fit(X, y)
    y_pred = model.predict(X)

    # MSE is computed once and reused for the RMSE line.
    mse = mean_squared_error(y, y_pred)
    print("MSE : ", mse)
    print("MAE : ", mean_absolute_error(y, y_pred))
    print("RMSE : ", np.sqrt(mse))
    print("MAPE : ", mean_absolute_percentage_error(y, y_pred))


In [None]:
linear_best_fit(X,y)

In [None]:
# Scatter of actual counts (x) against the predictions held in y_pred (y).
fig, ax = plt.subplots()
ax.set_xlabel("Actual bike count")
ax.set_ylabel("Predicted bike count")

ax.scatter(y, y_pred)

In [None]:
error = y-y_pred
sns.displot(error)

In [None]:
error = y-y_pred

In [None]:
plt.scatter( y_pred, error)

In [None]:
X

In [None]:
y

In [None]:
X_train, X_test, y_train, y_test  = train_test_split(X,y,test_size = 0.3, random_state=90)

In [None]:
X_train.head()


In [None]:
X_test.head()

In [None]:
y_train

In [None]:
y_test

In [None]:
linear_best_fit(X_train,y_train)

In [None]:
# Score the held-out rows with a model fitted on the training split only.
# linear_best_fit(X_test, y_test) would refit a new model on the test rows and
# report in-sample error, which says nothing about generalization.
# NOTE: test figures recorded from an earlier refit-on-test run need refreshing.
holdout_model = LinearRegression().fit(X_train, y_train)
holdout_pred = holdout_model.predict(X_test)
holdout_mse = mean_squared_error(y_test, holdout_pred)
print("MSE : ", holdout_mse)
print("MAE : ", mean_absolute_error(y_test, holdout_pred))
print("RMSE : ", np.sqrt(holdout_mse))
print("MAPE : ", mean_absolute_percentage_error(y_test, holdout_pred))

In [None]:
# train MAPE : 4.2
# test MAPE : 3.5

In [None]:
kf = KFold(n_splits=10)

In [None]:
# Cross-validate the linear model with the kf splitter, collecting the test
# RMSE of every fold in test_result.
test_result = []
# kf.split yields positional row indices, so rows are picked with .iloc rather
# than label-based lookup; enumerate(start=1) numbers each fold in the printout
# (the counter was previously never incremented).
for i, (train_index, test_index) in enumerate(kf.split(X), start=1):
    train_X, train_y = X.iloc[train_index], y.iloc[train_index]
    test_X, test_y = X.iloc[test_index], y.iloc[test_index]

    model = LinearRegression()
    model.fit(train_X, train_y)

    train_pred = model.predict(train_X)
    test_pred = model.predict(test_X)

    # These are root-mean-squared errors, matching the printed labels.
    train_rmse = np.sqrt(mean_squared_error(train_y, train_pred))
    test_rmse = np.sqrt(mean_squared_error(test_y, test_pred))
    print(i, "Fold")
    print("Train_RMSE = ", train_rmse)
    print("Test RMSE = ", test_rmse)

    test_result.append(test_rmse)
    

In [None]:
np.mean(test_result)

In [None]:
np.std(test_result)

In [None]:
# Mean RMSE over 10 cross-validation folds. The scorer name requests negated
# RMSE values, so the absolute value is taken before averaging.
model = LinearRegression()
neg_rmse_per_fold = cross_val_score(model, X, y, cv=10, scoring='neg_root_mean_squared_error')
np.abs(neg_rmse_per_fold).mean()

In [None]:
import sklearn
sklearn.__version__

In [None]:
# List the names accepted by the `scoring=` argument.
# sklearn.metrics.SCORERS is no longer available in newer scikit-learn
# releases; get_scorer_names() is the supported replacement. The fallback
# keeps the cell working on versions that predate get_scorer_names().
import sklearn.metrics
try:
    scorer_names = sklearn.metrics.get_scorer_names()
except AttributeError:
    scorer_names = sklearn.metrics.SCORERS.keys()
sorted(scorer_names)

In [None]:
X_train, X_test, y_train, y_test  = train_test_split(X,y,test_size = 0.3, random_state=90)

In [None]:
model = LinearRegression()
model.fit(X_train, y_train)

In [None]:
train_pred = model.predict(X_train)

In [None]:
test_pred = model.predict(X_test)

In [None]:
np.sqrt(mean_squared_error(y_train, train_pred))

In [None]:
np.sqrt(mean_squared_error(y_test, test_pred))

In [None]:
# Relative gap (%) between the train and test RMSE, recomputed from the fitted
# predictions instead of literals pasted from a previous run's output.
# The literal version used the larger value as both minuend and denominator, so
# the same arrangement is kept here regardless of which split has the larger RMSE.
# NOTE(review): the pasted literals were presumably the two RMSE outputs above — confirm.
train_rmse = np.sqrt(mean_squared_error(y_train, train_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, test_pred))
larger_rmse, smaller_rmse = max(train_rmse, test_rmse), min(train_rmse, test_rmse)
((larger_rmse - smaller_rmse) / larger_rmse) * 100

In [None]:
r2 = r2_score(y_train, train_pred)

In [None]:
r2

In [None]:
# n = number of training rows, p = number of predictor columns.
n, p = X_train.shape

In [None]:
X_train.shape

In [None]:
r2