## Import Libraries

In [46]:
# Standard library
import os
import re
import sys

# Third-party
import numpy as np
import pandas as pd
import seaborn as sns

# Folder that receives the cleaned dataset. It is created up front so that the
# export step at the end of the notebook cannot fail on a missing directory.
output_dir = "../Data/ProcessedData"
os.makedirs(output_dir, exist_ok=True)

## Import Data

In [47]:
# Read the raw Seoul bike CSV. The file is decoded as latin1, which renders the
# degree sign in headers such as "Temperature(°C)" correctly in the output.
raw_csv_path = "../Data/RawData/SeoulBikeData.csv"
data = pd.read_csv(raw_csv_path, encoding='latin1')
data

Unnamed: 0,Date,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(°C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day
0,01/12/2017,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
1,01/12/2017,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
2,01/12/2017,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,Winter,No Holiday,Yes
3,01/12/2017,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
4,01/12/2017,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,Winter,No Holiday,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,30/11/2018,1003,19,4.2,34,2.6,1894,-10.3,0.0,0.0,0.0,Autumn,No Holiday,Yes
8756,30/11/2018,764,20,3.4,37,2.3,2000,-9.9,0.0,0.0,0.0,Autumn,No Holiday,Yes
8757,30/11/2018,694,21,2.6,39,0.3,1968,-9.9,0.0,0.0,0.0,Autumn,No Holiday,Yes
8758,30/11/2018,712,22,2.1,41,1.0,1859,-9.8,0.0,0.0,0.0,Autumn,No Holiday,Yes


## Data Cleaning

In [48]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

Unnamed: 0,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(°C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm)
count,8760.0,8760.0,8760.0,8760.0,8760.0,8760.0,8760.0,8760.0,8760.0,8760.0
mean,704.602055,11.5,12.882922,58.226256,1.724909,1436.825799,4.073813,0.569111,0.148687,0.075068
std,644.997468,6.922582,11.944825,20.362413,1.0363,608.298712,13.060369,0.868746,1.128193,0.436746
min,0.0,0.0,-17.8,0.0,0.0,27.0,-30.6,0.0,0.0,0.0
25%,191.0,5.75,3.5,42.0,0.9,940.0,-4.7,0.0,0.0,0.0
50%,504.5,11.5,13.7,57.0,1.5,1698.0,5.1,0.01,0.0,0.0
75%,1065.25,17.25,22.5,74.0,2.3,2000.0,14.8,0.93,0.0,0.0
max,3556.0,23.0,39.4,98.0,7.4,2000.0,27.2,3.52,35.0,8.8


In [49]:
# Show the raw column labels, which still carry unit suffixes such as "(°C)".
print(data.columns)


Index(['Date', 'Rented Bike Count', 'Hour', 'Temperature(°C)', 'Humidity(%)',
       'Wind speed (m/s)', 'Visibility (10m)', 'Dew point temperature(°C)',
       'Solar Radiation (MJ/m2)', 'Rainfall(mm)', 'Snowfall (cm)', 'Seasons',
       'Holiday', 'Functioning Day'],
      dtype='object')


In [50]:
# Same labels as a plain Python list, printed on a single line.
print(list(data.columns))

['Date', 'Rented Bike Count', 'Hour', 'Temperature(°C)', 'Humidity(%)', 'Wind speed (m/s)', 'Visibility (10m)', 'Dew point temperature(°C)', 'Solar Radiation (MJ/m2)', 'Rainfall(mm)', 'Snowfall (cm)', 'Seasons', 'Holiday', 'Functioning Day']


In [51]:
# Inspect the dtype pandas inferred for each column; the text columns show up as object.
print(data.dtypes)

Date                          object
Rented Bike Count              int64
Hour                           int64
Temperature(°C)              float64
Humidity(%)                    int64
Wind speed (m/s)             float64
Visibility (10m)               int64
Dew point temperature(°C)    float64
Solar Radiation (MJ/m2)      float64
Rainfall(mm)                 float64
Snowfall (cm)                float64
Seasons                       object
Holiday                       object
Functioning Day               object
dtype: object


In [None]:
# Normalise the column labels: drop any parenthesised unit suffix such as
# "(°C)", "(cm)" or "(%)", trim the whitespace left behind, then swap the
# remaining spaces for underscores so the names are easy to reference.
unit_suffix = re.compile(r'\(.*?\)')
data.columns = pd.Index(
    [unit_suffix.sub('', label).strip() for label in data.columns]
).str.replace(' ', '_')

print("✅ Cleaned Columns:")
print(data.columns.tolist())

✅ Cleaned Columns:
['Date', 'Rented_Bike_Count', 'Hour', 'Temperature', 'Humidity', 'Wind_speed', 'Visibility', 'Dew_point_temperature', 'Solar_Radiation', 'Rainfall', 'Snowfall', 'Seasons', 'Holiday', 'Functioning_Day']


In [53]:
# Trim stray whitespace from every column label and replace inner spaces with
# underscores; labels that are already in that form pass through unchanged.
data = data.rename(columns=lambda label: label.strip().replace(" ", "_"))

In [54]:
# Convert the day/month/year date strings (e.g. "01/12/2017") to datetime values.
parsed_dates = pd.to_datetime(data['Date'], format='%d/%m/%Y')
data['Date'] = parsed_dates

In [55]:
# Integer codes for the three categorical columns:
#   Holiday:         No Holiday -> 0, Holiday -> 1
#   Functioning_Day: No -> 0, Yes -> 1
#   Seasons:         Winter -> 1, Spring -> 2, Summer -> 3, Autumn -> 4
category_codes = {
    'Holiday': {'No Holiday': 0, 'Holiday': 1},
    'Functioning_Day': {'Yes': 1, 'No': 0},
    'Seasons': {'Winter': 1, 'Spring': 2, 'Summer': 3, 'Autumn': 4},
}

for column, codes in category_codes.items():
    # Missing values are left alone here; they are handled by the dropna() step.
    present = data[column].dropna()

    # Series.map turns every value that is not a key of the dict into NaN.
    # Re-running this cell on already-encoded data would therefore blank the
    # whole column, and the later dropna() would silently delete every row.
    # Skip columns that already hold nothing but the target codes.
    if not present.empty and present.isin(list(codes.values())).all():
        continue

    # For the same reason, fail loudly on an unexpected label instead of
    # letting it turn into NaN and disappear from the dataset.
    unexpected = sorted(set(present.unique()) - set(codes), key=str)
    if unexpected:
        raise ValueError(f"Unexpected values in '{column}': {unexpected}")

    data[column] = data[column].map(codes)


In [None]:
# Count the missing values per column, then drop every row that contains any.
missing_counts = data.isna().sum()
print(missing_counts)
data = data.dropna(axis=0, how='any')


Date                     0
Rented_Bike_Count        0
Hour                     0
Temperature              0
Humidity                 0
Wind_speed               0
Visibility               0
Dew_point_temperature    0
Solar_Radiation          0
Rainfall                 0
Snowfall                 0
Seasons                  0
Holiday                  0
Functioning_Day          0
dtype: int64


In [None]:
# Remove exact duplicate rows (the first occurrence is kept), then print the
# frame summary so the schema and non-null counts recorded as this cell's
# output are actually produced when the notebook is re-run.
data = data.drop_duplicates()
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8760 entries, 0 to 8759
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   Date                   8760 non-null   datetime64[ns]
 1   Rented_Bike_Count      8760 non-null   int64         
 2   Hour                   8760 non-null   int64         
 3   Temperature            8760 non-null   float64       
 4   Humidity               8760 non-null   int64         
 5   Wind_speed             8760 non-null   float64       
 6   Visibility             8760 non-null   int64         
 7   Dew_point_temperature  8760 non-null   float64       
 8   Solar_Radiation        8760 non-null   float64       
 9   Rainfall               8760 non-null   float64       
 10  Snowfall               8760 non-null   float64       
 11  Seasons                8760 non-null   int64         
 12  Holiday                8760 non-null   int64         
 13  Fun

In [58]:
# Confirm the final column names after cleaning.
print(list(data.columns))

['Date', 'Rented_Bike_Count', 'Hour', 'Temperature', 'Humidity', 'Wind_speed', 'Visibility', 'Dew_point_temperature', 'Solar_Radiation', 'Rainfall', 'Snowfall', 'Seasons', 'Holiday', 'Functioning_Day']


In [59]:
# Confirm the final dtypes: Date is now datetime64 and the encoded categorical columns are int64.
print(data.dtypes)

Date                     datetime64[ns]
Rented_Bike_Count                 int64
Hour                              int64
Temperature                     float64
Humidity                          int64
Wind_speed                      float64
Visibility                        int64
Dew_point_temperature           float64
Solar_Radiation                 float64
Rainfall                        float64
Snowfall                        float64
Seasons                           int64
Holiday                           int64
Functioning_Day                   int64
dtype: object


In [60]:
# Destination of the cleaned dataset inside the processed-data folder.
output_file = os.path.join(output_dir, "SeoulBikeData_cleaned.csv")

# Write the frame as UTF-8 CSV, leaving out the row index.
data.to_csv(path_or_buf=output_file, encoding='utf-8', index=False)

print(f" Cleaned data saved successfully at: {output_file}")

 Cleaned data saved successfully at: ../Data/ProcessedData\SeoulBikeData_cleaned.csv
