In [None]:

import pandas as pd

# Read the raw CSV into `df`; the path is relative to the working directory.
raw_csv_path = 'Uber-Jan-Feb-FOIL.csv'
df = pd.read_csv(raw_csv_path)

# Show the first rows as the cell's output.
df.head()


In [None]:

# Summarize dtypes and non-null counts. DataFrame.info() writes the report
# itself and returns None, so wrapping it in print() would emit a stray "None".
df.info()

# Normalize header names first (lowercase, spaces -> underscores) so that every
# column lookup below uses the same spelling the rest of the notebook expects.
df.columns = df.columns.str.lower().str.replace(' ', '_', regex=False)

# Parse the date column; values that cannot be parsed become NaT and are
# removed by the dropna() step below.
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# Drop rows missing any key field, then drop exact duplicate rows. `df` is
# rebound to the result rather than mutated with inplace=True.
rows_before = len(df)
df = (
    df
    .dropna(subset=['dispatching_base_number', 'date', 'active_vehicles', 'trips'])
    .drop_duplicates()
)

# Report how many rows the cleaning removed so data loss is never silent.
print(f"Removed {rows_before - len(df)} incomplete or duplicate rows.")

df.head()


In [None]:

# Build the derived columns in a single assign() call and rebind `df`.
df = df.assign(
    # Weekday name taken from the 'date' column.
    day_of_week=lambda frame: frame['date'].dt.day_name(),
    # Integer code per dispatching base, from pandas' categorical codes.
    base_encoded=lambda frame: frame['dispatching_base_number'].astype('category').cat.codes,
    # Kept under the name 'is_peak', but the value is a weekend indicator:
    # True when the weekday is Saturday or Sunday. No time-of-day is involved.
    is_peak=lambda frame: frame['day_of_week'].isin(['Saturday', 'Sunday']),
)

df.head()


In [None]:

# Descriptive statistics as produced by DataFrame.describe().
summary_stats = df.describe()
print(summary_stats)

# Most frequent value of every column (numeric and non-numeric alike);
# the first row of mode() holds the top mode for each column.
column_modes = df.mode(numeric_only=False)
print(column_modes.iloc[0])


In [None]:

import matplotlib.pyplot as plt
import seaborn as sns

# Histogram (with KDE overlay) of the 'trips' column.
hist_fig, hist_ax = plt.subplots(figsize=(10, 5))
sns.histplot(df['trips'], bins=30, kde=True, ax=hist_ax)
hist_ax.set_title('Distribution of Trips')
hist_ax.set_xlabel('Number of Trips')
hist_ax.set_ylabel('Frequency')
plt.show()

# Scatter of trips against active vehicles, coloured by dispatching base.
scatter_fig, scatter_ax = plt.subplots(figsize=(10, 5))
sns.scatterplot(
    data=df,
    x='active_vehicles',
    y='trips',
    hue='dispatching_base_number',
    ax=scatter_ax,
)
scatter_ax.set_title('Trips vs Active Vehicles')
plt.show()

# Mean trips per weekday, reindexed so the bars run Monday through Sunday
# instead of groupby's alphabetical order.
weekday_order = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
avg_trips = df.groupby('day_of_week')['trips'].mean().reindex(weekday_order)

bar_fig, bar_ax = plt.subplots(figsize=(8, 4))
avg_trips.plot(kind='bar', title='Average Trips per Day', ax=bar_ax)
bar_ax.set_ylabel('Trips')
plt.show()


In [None]:

# Write the processed frame to CSV without the index column, then confirm.
output_csv_path = 'enhanced_uber_fares.csv'
df.to_csv(output_csv_path, index=False)
print("✅ Data exported successfully.")
