In [None]:

# Bike Sharing Analysis: Weather Impact on Daily Rentals
# Using train/test split from Kaggle UCI Bike Rental Dataset
# Author: [Your Name]
# Date: 2025-06-09

# %% [markdown]
# # Bike Sharing Data Analysis
# **Objective:** Assess how weather conditions influence daily bike rentals using the provided train/test split.

# %%
# ---
# Step 0: Imports and Configuration
# ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from itertools import combinations

# Display settings
pd.set_option('display.max_columns', None)
%matplotlib inline

# %% [markdown]
# ## 1. Data Loading & Preprocessing

# %%
# Read the Kaggle train and test CSVs from the working directory
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
print(f"Train shape: {df_train.shape}, Test shape: {df_test.shape}")

# Tag every row with its origin so the split can be recovered after concatenation
df_train['set'] = 'train'
df_test['set'] = 'test'
# If the test file has no target column, add an all-NaN 'cnt' so both frames share it
if 'cnt' not in df_test.columns:
    df_test['cnt'] = np.nan

# Stack both frames so preprocessing is applied once, uniformly
df = pd.concat([df_train, df_test], ignore_index=True)
print(f"Combined shape: {df.shape}")

# Per-column count of missing values ('cnt' is NaN for test rows when it was added above)
print("Missing per column:\n", df.isna().sum())

# Weather code -> human-readable description
weather_map = {
    1: 'Clear/Few clouds',
    2: 'Mist/Cloudy',
    3: 'Light Snow/Rain',
    4: 'Heavy Rain/Snow',
}

# Add the description column, make the weather code categorical, and parse the date column
df = df.assign(
    weathersit_desc=df['weathersit'].map(weather_map),
    weathersit=df['weathersit'].astype('category'),
    dteday=pd.to_datetime(df['dteday']),
)

# IQR-based outlier check, computed on the training rows only
df_tr = df.loc[df['set'].eq('train')]
Q1 = df_tr['cnt'].quantile(0.25)
Q3 = df_tr['cnt'].quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR
outliers = df_tr.loc[df_tr['cnt'].lt(lower) | df_tr['cnt'].gt(upper)]
print(f"Training outliers: {len(outliers)}")

# %% [markdown]
# ## 2. Statistical Analysis on Training Data

# %%
# Aggregate rental statistics per weather code, using training rows only.
# 'weathersit' is a categorical whose categories were derived from the combined
# train+test frame, so observed=True keeps only the codes that actually occur in
# the training rows. Without it, a code present only in the test rows can show up
# as a 0-day group with NaN statistics and leak NaN into the percentages below.
grouped = df_tr.groupby('weathersit', observed=True).agg(
    avg_cnt=('cnt', 'mean'),
    median_cnt=('cnt', 'median'),
    std_cnt=('cnt', 'std'),
    days=('cnt', 'size'),
).reset_index()
grouped['weather'] = grouped['weathersit'].map(weather_map)

# Pairwise percent change in mean rentals for every pair of weather codes,
# expressed relative to the first code of the pair ('from').
avgs = dict(zip(grouped['weathersit'], grouped['avg_cnt']))
pct_list = [
    {
        'from': weather_map[w1],
        'to': weather_map[w2],
        'pct_change': (avgs[w2] - avgs[w1]) / avgs[w1] * 100,
    }
    for w1, w2 in combinations(grouped['weathersit'], 2)
]
pct_df = pd.DataFrame(pct_list)

print(grouped)
print(pct_df)

# %% [markdown]
# ## 3. Data Visualization (Training)

# %%
# Bar chart of average rentals by weather
fig, ax = plt.subplots(figsize=(9,5))
bars = ax.bar(grouped['weather'], grouped['avg_cnt'], edgecolor='k')
ax.set_title('Avg Daily Rentals by Weather (Train)')
ax.set_xlabel('Weather')
ax.set_ylabel('Avg Rentals')
# Extra headroom so the two-line label above the tallest bar stays inside the axes
ax.margins(y=0.2)
# Baseline for the percentage labels: mean rentals for weather code 1 (clear)
clear_val = grouped.loc[grouped['weathersit']==1,'avg_cnt'].iloc[0]
for bar in bars:
    h = bar.get_height()
    # A weather code with no training rows may have a NaN mean; there is nothing to label
    if not np.isfinite(h):
        continue
    # Offset the label in points rather than data units so its placement does not
    # depend on the magnitude of the rental counts
    ax.annotate(
        f"{h:.0f}\n({(h-clear_val)/clear_val*100:+.1f}%)",
        xy=(bar.get_x()+bar.get_width()/2, h),
        xytext=(0, 4), textcoords='offset points',
        ha='center', va='bottom',
    )
plt.tight_layout(); plt.show()

# Scatter plots: temp and humidity vs cnt (train), one figure per feature
for feat in ['temp','hum']:
    feat_fig, feat_ax = plt.subplots(figsize=(6,4))
    feat_ax.scatter(df_tr[feat], df_tr['cnt'], alpha=0.5)
    feat_ax.set_title(f"{feat.capitalize()} vs Rentals (Train)")
    feat_ax.set_xlabel(feat.capitalize())
    feat_ax.set_ylabel('Rentals')
    feat_fig.tight_layout(); plt.show()

# %% [markdown]
# ## 4. Correlation Analysis on Training

# %%
# Numeric weather features plus the rental count target
num_cols = [
    'temp',
    'atemp',
    'hum',
    'windspeed',
    'cnt',
]
# Correlation of each listed column with 'cnt' on the training rows, strongest positive first
corr_matrix = df_tr.loc[:, num_cols].corr()
corr = corr_matrix.loc[:, 'cnt'].sort_values(ascending=False)
print(corr)

# %% [markdown]
# ## 5. Interpretation & Recommendations

# %% [markdown]
# **Summary:** Clear weather has the highest average rentals, and rentals drop significantly as the weather deteriorates. Temperature correlates positively with rentals; humidity correlates negatively.
#
# **Next Steps:** Train a model on `train.csv` to predict `cnt` for `test.csv`, incorporating the weather features.
