In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the bike dataset from a CSV path given relative to the working directory.
df = pd.read_csv("bike_cleaned_data.csv")

In [14]:
# Print the frame's index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8760 entries, 0 to 8759
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   day                      8760 non-null   object 
 1   date                     8760 non-null   int64  
 2   month                    8760 non-null   int64  
 3   year                     8760 non-null   int64  
 4   Rented Bike Count        8760 non-null   int64  
 5   Hour                     8760 non-null   int64  
 6   Temperature(°C)          8760 non-null   float64
 7   Humidity(%)              8760 non-null   int64  
 8   Wind speed (m/s)         8760 non-null   float64
 9   Visibility (10m)         8760 non-null   int64  
 10  Solar Radiation (MJ/m2)  8760 non-null   float64
 11  Rainfall(mm)             8760 non-null   float64
 12  Snowfall (cm)            8760 non-null   float64
 13  Seasons                  8760 non-null   object 
 14  Holiday                 

In [6]:
# Show the exact column labels (several contain spaces, units and parentheses).
df.columns

Index(['day', 'date', 'month', 'year', 'Rented Bike Count', 'Hour',
       'Temperature(°C)', 'Humidity(%)', 'Wind speed (m/s)',
       'Visibility (10m)', 'Solar Radiation (MJ/m2)', 'Rainfall(mm)',
       'Snowfall (cm)', 'Seasons', 'Holiday', 'Functioning Day'],
      dtype='object')

In [9]:
# Outlier screen for 'Rented Bike Count' using the 1.5 * IQR rule.
rented_counts = df['Rented Bike Count']
Q1, Q3 = rented_counts.quantile(0.25), rented_counts.quantile(0.75)
IQR = Q3 - Q1

# Fences sit 1.5 * IQR below the first quartile and above the third quartile
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Keep the rows whose count falls strictly outside either fence.
# NOTE(review): the `_sqft` suffix does not describe this column; the name is
# kept because a later cell reads `outliers_sqft`.
is_outlier = rented_counts.lt(lower_bound) | rented_counts.gt(upper_bound)
outliers_sqft = df.loc[is_outlier]

# How many rows were flagged, and summary statistics of their counts
num_outliers = len(outliers_sqft)
outliers_sqft_stats = outliers_sqft['Rented Bike Count'].describe()

num_outliers, outliers_sqft_stats

(158,
 count     158.000000
 mean     2698.873418
 std       304.303962
 min      2377.000000
 25%      2454.250000
 50%      2596.000000
 75%      2881.250000
 max      3556.000000
 Name: Rented Bike Count, dtype: float64)

In [13]:
# Show the 10 flagged rows with the highest 'Rented Bike Count'.
outliers_sqft.sort_values('Rented Bike Count',ascending=False).head(10)

Unnamed: 0,day,date,month,year,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day
4818,Tuesday,19,6,2018,3556,18,24.1,57,2.9,1301,0.56,0.0,0.0,Summer,No Holiday,Yes
4866,Thursday,21,6,2018,3418,18,27.8,43,3.0,1933,1.35,0.0,0.0,Summer,No Holiday,Yes
4650,Tuesday,12,6,2018,3404,18,24.9,53,3.6,2000,1.28,0.0,0.0,Summer,No Holiday,Yes
4842,Wednesday,20,6,2018,3384,18,27.0,55,3.1,1246,1.26,0.0,0.0,Summer,No Holiday,Yes
4458,Monday,4,6,2018,3380,18,24.4,48,1.9,1998,0.56,0.0,0.0,Summer,No Holiday,Yes
4890,Friday,22,6,2018,3365,18,29.3,27,3.4,1977,1.24,0.0,0.0,Summer,No Holiday,Yes
4554,Friday,8,6,2018,3309,18,26.2,54,2.2,1183,0.88,0.0,0.0,Summer,No Holiday,Yes
6810,Monday,10,9,2018,3298,18,25.9,42,1.1,2000,0.48,0.0,0.0,Autumn,No Holiday,Yes
6978,Monday,17,9,2018,3277,18,25.3,56,2.8,1992,0.54,0.0,0.0,Autumn,No Holiday,Yes
6858,Wednesday,12,9,2018,3256,18,27.0,44,1.4,2000,0.62,0.0,0.0,Autumn,No Holiday,Yes


In [15]:
def outlier_check_all(df, columns=None, iqr_multiplier=1.5, top_n=10):
    """Print an IQR-based outlier report for numerical columns of ``df``.

    For each inspected column, a row is flagged as an outlier when its value
    lies strictly below ``Q1 - iqr_multiplier * IQR`` or strictly above
    ``Q3 + iqr_multiplier * IQR``, where Q1/Q3 are the 0.25/0.75 quantiles.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.
    columns : iterable of column labels, optional
        Columns to report on. When omitted, every ``int64``/``float64``
        column is inspected. Pass an explicit list to skip columns for which
        the IQR rule is not meaningful (e.g. calendar fields such as a year).
    iqr_multiplier : float, default 1.5
        Width of the fences, in multiples of the IQR.
    top_n : int, default 10
        Number of largest-valued outlier rows printed per column.

    Returns
    -------
    None
        The report is written to standard output.
    """
    if columns is None:
        # Default selection: all int64/float64 columns
        columns = df.select_dtypes(include=['int64', 'float64']).columns

    for column in columns:
        values = df[column]
        Q1 = values.quantile(0.25)
        Q3 = values.quantile(0.75)
        IQR = Q3 - Q1

        # Define bounds for outliers
        lower_bound = Q1 - iqr_multiplier * IQR
        upper_bound = Q3 + iqr_multiplier * IQR

        # Identify outliers (strict comparisons: values on a fence are kept)
        outliers = df[(values < lower_bound) | (values > upper_bound)]

        # Display the number of outliers and some statistics
        num_outliers = outliers.shape[0]
        outliers_stats = outliers[column].describe()
        top_outliers = outliers.sort_values(column, ascending=False).head(top_n)

        print(f"Column: {column}")
        print(f"Number of Outliers: {num_outliers}")
        print(f"Outliers Statistics:\n{outliers_stats}")
        print(f"Top {top_n} Outliers in {column}:\n{top_outliers}")
        print("\n" + "="*50 + "\n")

# Run the IQR outlier report on the loaded dataframe (prints one section per numerical column)
outlier_check_all(df)


Column: date
Number of Outliers: 0
Outliers Statistics:
count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: date, dtype: float64
Top 10 Outliers in date:
Empty DataFrame
Columns: [day, date, month, year, Rented Bike Count, Hour, Temperature(°C), Humidity(%), Wind speed (m/s), Visibility (10m), Solar Radiation (MJ/m2), Rainfall(mm), Snowfall (cm), Seasons, Holiday, Functioning Day]
Index: []


Column: month
Number of Outliers: 0
Outliers Statistics:
count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: month, dtype: float64
Top 10 Outliers in month:
Empty DataFrame
Columns: [day, date, month, year, Rented Bike Count, Hour, Temperature(°C), Humidity(%), Wind speed (m/s), Visibility (10m), Solar Radiation (MJ/m2), Rainfall(mm), Snowfall (cm), Seasons, Holiday, Functioning Day]
Index: []


Column: year
Number of Outliers: 744
Outliers Statistics:
count     744.0
mean     2017.0