In [16]:
#Step 1: Data Preparation
#!pip install pandas numpy matplotlib seaborn scikit-learn

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

# Load the dataset
df = pd.read_csv("C:\\Users\\PuMishra\\Desktop\\day.csv")

# Display the first few rows
df.head()


Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,01-01-2018,1,0,1,0,6,0,2,14.110847,18.18125,80.5833,10.749882,331,654,985
1,2,02-01-2018,1,0,1,0,0,0,2,14.902598,17.68695,69.6087,16.652113,131,670,801
2,3,03-01-2018,1,0,1,0,1,1,1,8.050924,9.47025,43.7273,16.636703,120,1229,1349
3,4,04-01-2018,1,0,1,0,2,1,1,8.2,10.6061,59.0435,10.739832,108,1454,1562
4,5,05-01-2018,1,0,1,0,3,1,1,9.305237,11.4635,43.6957,12.5223,82,1518,1600


In [17]:
#Step 2: Data Cleaning and Preparation
# Convert season, yr, mnth, weekday, weathersit to categorical
df['season'] = df['season'].map({1: 'spring', 2: 'summer', 3: 'fall', 4: 'winter'})
df['weathersit'] = df['weathersit'].map({1: 'Clear', 2: 'Mist + Cloudy', 3: 'Light Snow/Rain', 4: 'Heavy Rain/Snow'})
df['yr'] = df['yr'].map({0: '2018', 1: '2019'})
df['mnth'] = df['mnth'].astype('category')
df['weekday'] = df['weekday'].astype('category')

# Check for missing values
df.isnull().sum()


instant       0
dteday        0
season        0
yr            0
mnth          0
holiday       0
weekday       0
workingday    0
weathersit    0
temp          0
atemp         0
hum           0
windspeed     0
casual        0
registered    0
cnt           0
dtype: int64

In [18]:
#Step 3: Feature Engineering
# Create dummy variables for categorical columns
df = pd.get_dummies(df, columns=['season', 'weathersit', 'mnth', 'weekday'], drop_first=True)


In [19]:
#Step 4: Splitting the Data
# Define the target variable and features
X = df.drop(['cnt', 'dteday', 'casual', 'registered'], axis=1)
y = df['cnt']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [20]:
#Step 5: Model Building
# Create a linear regression model
lr = LinearRegression()

# Train the model on the training data
lr.fit(X_train, y_train)

# Predict on the test set
y_pred = lr.predict(X_test)


In [21]:
#Step 6: Model Evaluation
# Calculate R-squared score
r2 = r2_score(y_test, y_pred)
print(f"R-squared score on test set: {r2}")

# Display coefficients
coefficients = pd.DataFrame(lr.coef_, X.columns, columns=['Coefficient'])
coefficients.sort_values(by='Coefficient', ascending=False)


R-squared score on test set: 0.8628948404753912


Unnamed: 0,Coefficient
yr,4444.235609
mnth_9,2383.194123
mnth_10,2166.131222
mnth_12,1987.930058
mnth_11,1668.582957
mnth_8,1527.776076
mnth_5,1251.661198
mnth_6,1206.57071
mnth_7,1002.973083
mnth_3,933.167357
