In [72]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [73]:
task_2a = pd.read_csv('task2a.csv')
task_2b = pd.read_csv('task2b.csv')

In [74]:
joined_bike = pd.merge(task_2a, task_2b, on='timestamp', how='left')
joined_bike['timestamp'] = pd.to_datetime(joined_bike['timestamp'],errors = 'coerce')
df = joined_bike.drop(columns=['feels_like_temp'])
len(df)

17414

In [75]:
df.isna().sum()

timestamp          10568
new_bike_shares        0
temperature            0
humidity               0
wind_speed             0
is_weekend             0
season_code            0
dtype: int64

In [76]:
df[df.duplicated()]

Unnamed: 0,timestamp,new_bike_shares,temperature,humidity,wind_speed,is_weekend,season_code
3445,NaT,990,15.5,61.0,19.5,0,0
11287,NaT,47,6.0,93.0,7.0,0,0
14144,NaT,74,17.0,88.0,8.0,0,1
14411,NaT,725,15.5,74.5,9.0,0,1


## Remove outliers

### Method 1: using z-score

In [77]:
df['temperature'].describe()

count    17414.000000
mean        12.468091
std          5.571818
min         -1.500000
25%          8.000000
50%         12.500000
75%         16.000000
max         34.000000
Name: temperature, dtype: float64

In [78]:
from scipy.stats import zscore

df['z_score'] = zscore(df['temperature'])

# Define threshold for outliers (e.g., z_score > 3 or z_score < -3)
threshold = 3 

# Filter rows where absolute z-score exceeds the threshold
df_filtered = df[abs(df['z_score']) <= threshold].copy()
df_filtered.drop(columns=['z_score'], inplace=True)
len(df_filtered)

17369

### Method 2: using inter-quartile range (IQR)

In [79]:
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1

# Define the lower and upper bounds for outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Filter out rows with outlier values in any column
outliers_iqr = ((df < lower_bound) | (df > upper_bound)).any(axis=1)
df_filtered_iqr = df[~outliers_iqr]

len(df_filtered_iqr)

16408