<a href="https://colab.research.google.com/github/mygoal-javadeveloper/Dphi_Machine-Learning_Datathon_and_Assignments/blob/master/hour_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#importing libraries 
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [2]:
#reading the dataset from hour.csv (it is split into train and test sets later in the notebook)
hour_data  = pd.read_csv('hour.csv')

In [3]:
#shape of the dataset as (number of rows, number of columns)
hour_data.shape

(17379, 17)

In [4]:
#listing the labels of all the columns in the dataset
hour_data.columns

Index(['instant', 'dteday', 'season', 'yr', 'mnth', 'hr', 'holiday', 'weekday',
       'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed',
       'casual', 'registered', 'cnt'],
      dtype='object')

In [5]:
#giving the abbreviated columns readable names; rename() returns a new frame that is
#bound back to hour_data instead of mutating the frame with inplace=True
hour_data = hour_data.rename(columns={
    'dteday': 'date',
    'yr': 'year',
    'mnth': 'month',
    'hr': 'hour',
    'hum': 'humidity',
    'cnt': 'totalcount',
})

In [6]:
#checking the column labels after the rename
hour_data.columns

Index(['instant', 'date', 'season', 'year', 'month', 'hour', 'holiday',
       'weekday', 'workingday', 'weathersit', 'temp', 'atemp', 'humidity',
       'windspeed', 'casual', 'registered', 'totalcount'],
      dtype='object')

In [7]:
#first 15 rows of the dataset
hour_data.head(15)

Unnamed: 0,instant,date,season,year,month,hour,holiday,weekday,workingday,weathersit,temp,atemp,humidity,windspeed,casual,registered,totalcount
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1
5,6,2011-01-01,1,0,1,5,0,6,0,2,0.24,0.2576,0.75,0.0896,0,1,1
6,7,2011-01-01,1,0,1,6,0,6,0,1,0.22,0.2727,0.8,0.0,2,0,2
7,8,2011-01-01,1,0,1,7,0,6,0,1,0.2,0.2576,0.86,0.0,1,2,3
8,9,2011-01-01,1,0,1,8,0,6,0,1,0.24,0.2879,0.75,0.0,1,7,8
9,10,2011-01-01,1,0,1,9,0,6,0,1,0.32,0.3485,0.76,0.0,8,6,14


In [8]:
#brief overview of the dataset: number of rows and columns, each column's name and dtype,
#its non-null count, and the memory usage
hour_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17379 entries, 0 to 17378
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     17379 non-null  int64  
 1   date        17379 non-null  object 
 2   season      17379 non-null  int64  
 3   year        17379 non-null  int64  
 4   month       17379 non-null  int64  
 5   hour        17379 non-null  int64  
 6   holiday     17379 non-null  int64  
 7   weekday     17379 non-null  int64  
 8   workingday  17379 non-null  int64  
 9   weathersit  17379 non-null  int64  
 10  temp        17379 non-null  float64
 11  atemp       17379 non-null  float64
 12  humidity    17379 non-null  float64
 13  windspeed   17379 non-null  float64
 14  casual      17379 non-null  int64  
 15  registered  17379 non-null  int64  
 16  totalcount  17379 non-null  int64  
dtypes: float64(4), int64(12), object(1)
memory usage: 2.3+ MB


In [9]:
#number of null values in each column
hour_data.isnull().sum()

instant       0
date          0
season        0
year          0
month         0
hour          0
holiday       0
weekday       0
workingday    0
weathersit    0
temp          0
atemp         0
humidity      0
windspeed     0
casual        0
registered    0
totalcount    0
dtype: int64

In [10]:
#number of rows that are duplicates of an earlier row
hour_data.duplicated().sum()

0

In [11]:
#summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
hour_data.describe()

Unnamed: 0,instant,season,year,month,hour,holiday,weekday,workingday,weathersit,temp,atemp,humidity,windspeed,casual,registered,totalcount
count,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0
mean,8690.0,2.50164,0.502561,6.537775,11.546752,0.02877,3.003683,0.682721,1.425283,0.496987,0.475775,0.627229,0.190098,35.676218,153.786869,189.463088
std,5017.0295,1.106918,0.500008,3.438776,6.914405,0.167165,2.005771,0.465431,0.639357,0.192556,0.17185,0.19293,0.12234,49.30503,151.357286,181.387599
min,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.02,0.0,0.0,0.0,0.0,0.0,1.0
25%,4345.5,2.0,0.0,4.0,6.0,0.0,1.0,0.0,1.0,0.34,0.3333,0.48,0.1045,4.0,34.0,40.0
50%,8690.0,3.0,1.0,7.0,12.0,0.0,3.0,1.0,1.0,0.5,0.4848,0.63,0.194,17.0,115.0,142.0
75%,13034.5,3.0,1.0,10.0,18.0,0.0,5.0,1.0,2.0,0.66,0.6212,0.78,0.2537,48.0,220.0,281.0
max,17379.0,4.0,1.0,12.0,23.0,1.0,6.0,1.0,4.0,1.0,1.0,1.0,0.8507,367.0,886.0,977.0


In [12]:
#for every column: its name, the number of distinct values it holds, then a blank line
for column_name, column_values in hour_data.items():
  #dropna=False counts a missing value as one distinct value
  distinct_count = column_values.nunique(dropna=False)
  print(column_name, distinct_count, '', sep='\n')

instant
17379

date
731

season
4

year
2

month
12

hour
24

holiday
2

weekday
7

workingday
2

weathersit
4

temp
50

atemp
65

humidity
89

windspeed
30

casual
322

registered
776

totalcount
869



In [13]:
#disabled experiment: label-encoding the 'date' column (the column is dropped in the next cell instead)
#le = LabelEncoder()
#hour_data['date'] = le.fit_transform(hour_data['date'])

In [14]:
#removing the columns that are not used as model features; drop() returns a new frame
#that is bound back to hour_data instead of mutating the frame with inplace=True
#NOTE(review): in the previewed rows casual + registered equals totalcount, so those two
#presumably have to go to avoid leaking the target - confirm
#NOTE(review): the continuous weather readings (temp, atemp, humidity, windspeed) are
#removed as well - confirm that this is intended
hour_data = hour_data.drop(columns=[
    'instant',
    'date',
    'temp',
    'atemp',
    'humidity',
    'windspeed',
    'casual',
    'registered',
])

In [15]:
#confirming which columns remain after the drop
hour_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17379 entries, 0 to 17378
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   season      17379 non-null  int64
 1   year        17379 non-null  int64
 2   month       17379 non-null  int64
 3   hour        17379 non-null  int64
 4   holiday     17379 non-null  int64
 5   weekday     17379 non-null  int64
 6   workingday  17379 non-null  int64
 7   weathersit  17379 non-null  int64
 8   totalcount  17379 non-null  int64
dtypes: int64(9)
memory usage: 1.2 MB


In [16]:
#separating the target column (y) from the remaining predictor columns (X)
y = hour_data.loc[:, 'totalcount']
X = hour_data.drop(columns='totalcount')

In [17]:
#holding out 30% of the rows as the test set; random_state pins the shuffle so that
#the split is the same on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=5,
)

In [18]:
#creating the linear regression model with its default parameters
lr = LinearRegression()

In [19]:
#training the model on the training split
lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [20]:
#predicting the target for the held-out test split
pred = lr.predict(X_test)

In [21]:
#score of the model on the test split (the value equals the r2_score computed from pred in the next cell)
lr.score(X_test, y_test)

0.2772427003502226

In [22]:
#R^2 of the stored predictions against the true test targets
r2_score(y_test, pred)

0.2772427003502226

In [23]:
#evaluating the performance of the model on the test split
#each metric is computed once; RMSE reuses the MSE value instead of recomputing it
mae = mean_absolute_error(y_test, pred)
mse = mean_squared_error(y_test, pred)
rmse = np.sqrt(mse)
#printed one value per line, in the order MAE, MSE, RMSE
for metric_value in (mae, mse, rmse):
    print(metric_value)

117.16129011241753
24585.42941746526
156.7974152129596
