# Exploratory Data Analysis (EDA)

This notebook reads taxi data from an S3 bucket and performs basic exploratory data analysis.

In [None]:
# Import required libraries
import pandas as pd
import boto3
from io import StringIO
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Name the bucket that holds the taxi data, then create the S3 client
# that the following cells use to talk to it
bucket_name = 'taxi-demo-data-bucket'
s3_client = boto3.client('s3')

In [None]:
# List every object in the bucket to see the available data files.
# A single list_objects_v2 call returns at most 1,000 keys, so keep
# requesting pages for as long as the service reports the listing as truncated.
response = s3_client.list_objects_v2(Bucket=bucket_name)

page = response
while page.get('IsTruncated'):
    page = s3_client.list_objects_v2(
        Bucket=bucket_name,
        ContinuationToken=page['NextContinuationToken'],
    )
    # Fold later pages into the first response so that response['Contents']
    # holds the complete listing. The other metadata fields in `response`
    # still describe the first page only.
    response.setdefault('Contents', []).extend(page.get('Contents', []))

# 'Contents' is absent from the response when the bucket has no objects
if 'Contents' in response:
    print("Available files in bucket:")
    for obj in response['Contents']:
        print(f"- {obj['Key']} ({obj['Size']} bytes)")
else:
    print("No files found in bucket")

In [None]:
# Load the first CSV object from the bucket listing into a DataFrame.
# Note: Set file_key directly if you already know which file you want.
csv_files = [
    obj['Key']
    for obj in response.get('Contents', [])
    if obj['Key'].endswith('.csv')
]

if not csv_files:
    # Nothing to load; later cells check for None before touching df
    print("No CSV files found in bucket")
    df = None
else:
    # "First" means first in the order the listing returned the keys
    file_key = csv_files[0]
    print(f"Reading file: {file_key}")

    # Fetch the object; its payload is exposed under the 'Body' key
    obj = s3_client.get_object(Bucket=bucket_name, Key=file_key)

    # Parse the payload straight into a pandas DataFrame
    df = pd.read_csv(obj['Body'])

    print(f"Successfully loaded {len(df)} rows and {len(df.columns)} columns")

In [None]:
# Overview of the loaded dataset: shape, column names, a sample, and info()
if df is not None:
    print("Dataset Shape:", df.shape)

    # Header and column list are emitted on separate lines via sep="\n"
    print("\nColumn Names:", df.columns.tolist(), sep="\n")

    print("\nFirst 5 rows:")
    display(df.head(5))

    # info() prints its own report, so it is not wrapped in print()
    print("\nDataset Info:")
    df.info()

In [None]:
# Summary statistics for the loaded dataset
if df is not None:
    print("Descriptive Statistics:")
    summary_stats = df.describe()
    display(summary_stats)

In [None]:
# Report how many values are missing in each column
if df is not None:
    print("Missing Values:")
    missing_values = df.isnull().sum()

    # Keep only the columns that actually have gaps
    columns_with_missing = missing_values[missing_values > 0]

    if columns_with_missing.empty:
        # Avoid printing an empty Series before the all-clear message
        print("No missing values found!")
    else:
        print(columns_with_missing)