# In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from pathlib import Path
import logging
from datetime import datetime

# Root logging setup: INFO and above, each record prefixed with its
# timestamp and level. The module-level logger is shared by the code below.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

class COVIDDataAnalyzer:
    """Load a COVID-19 CSV dataset and chart it for a handful of countries.

    The CSV must provide the columns ``date``, ``location``, ``total_cases``
    and ``total_deaths``. ``total_vaccinations`` and ``iso_code`` are needed
    only by the vaccination plot and the choropleth map respectively.

    Attributes:
        data_path: Path of the CSV file read by :meth:`load_data`.
        df: The cleaned ``DataFrame``; ``None`` until :meth:`load_data`
            succeeds.
        countries: ``location`` values kept and plotted.
    """

    # Used when the caller does not supply a country list. A tuple, so the
    # default cannot be mutated through one instance and leak into others.
    DEFAULT_COUNTRIES = ('Kenya', 'India', 'United States')

    def __init__(self, data_path='owid-covid-data.csv', countries=None):
        """Store configuration; no file access happens here.

        Args:
            data_path: Location of the CSV file.
            countries: Iterable of ``location`` values to analyse. ``None``
                selects :attr:`DEFAULT_COUNTRIES`.
        """
        self.data_path = data_path
        self.df = None
        self.countries = list(
            self.DEFAULT_COUNTRIES if countries is None else countries
        )

    def load_data(self):
        """Load and preprocess the COVID-19 dataset.

        Steps: parse ``date``, keep only the configured countries, drop rows
        lacking a valid date / ``total_cases`` / ``total_deaths``, sort by
        country and date, then forward-fill remaining numeric gaps *within
        each country*.

        Returns:
            ``True`` on success. ``False`` if anything went wrong (the error
            is logged with its traceback); ``self.df`` is left untouched in
            that case, so a failed load never exposes a half-cleaned frame.
        """
        try:
            if not Path(self.data_path).exists():
                raise FileNotFoundError(f"Data file {self.data_path} not found")

            logger.info("Loading data from %s", self.data_path)
            df = pd.read_csv(self.data_path)

            # Basic data cleaning
            df['date'] = pd.to_datetime(df['date'], errors='coerce')
            df = df[df['location'].isin(self.countries)]

            # Drop rows with missing critical values. Unparseable dates
            # (coerced to NaT above) cannot be placed on a time axis either.
            df = df.dropna(subset=['date', 'total_cases', 'total_deaths'])

            # Chronological order per country, so the forward-fill below
            # always carries the most recent *earlier* observation.
            df = df.sort_values(['location', 'date'])

            # Fill missing numeric values per country. A frame-wide ffill
            # would copy the last value of one country into the leading gaps
            # of the next one.
            numeric_cols = list(df.select_dtypes(include=['number']).columns)
            if numeric_cols:
                df[numeric_cols] = df.groupby('location')[numeric_cols].ffill()
        except Exception as exc:
            # Deliberately broad: callers rely on the boolean result instead
            # of handling exceptions themselves.
            logger.exception("Error loading data: %s", exc)
            return False

        self.df = df
        logger.info("Data loaded successfully. Shape: %s", self.df.shape)
        return True

    def _plot_metric(self, column, title, ylabel, description,
                     save_path=None, log_scale=False):
        """Draw one line per configured country for ``column`` over time.

        Args:
            column: Name of the DataFrame column to plot on the y axis.
            title: Figure title.
            ylabel: Y-axis label.
            description: Short word used in log messages
                (``"Saved <description> plot to ..."``).
            save_path: File to write the figure to; if falsy the figure is
                shown interactively instead.
            log_scale: Use a logarithmic y axis with minor grid lines.
        """
        if self.df is None:
            logger.error("Data not loaded. Call load_data() first.")
            return
        if column not in self.df.columns:
            logger.error("Column '%s' not found in data; skipping %s plot.",
                         column, description)
            return

        fig, ax = plt.subplots(figsize=(14, 7))
        try:
            for country in self.countries:
                subset = self.df[self.df['location'] == country]
                subset = subset.sort_values('date')
                ax.plot(subset['date'], subset[column],
                        label=country, linewidth=2)

            ax.set_title(title, fontsize=16)
            ax.set_xlabel('Date', fontsize=12)
            ax.set_ylabel(ylabel, fontsize=12)
            if log_scale:
                ax.set_yscale('log')  # Using log scale for better visualization
                ax.grid(True, which="both", ls="--")
            else:
                ax.grid(True, ls="--")
            ax.legend(title='Country', fontsize=10)
            fig.tight_layout()

            if save_path:
                fig.savefig(save_path, dpi=300, bbox_inches='tight')
                logger.info("Saved %s plot to %s", description, save_path)
            else:
                plt.show()
        finally:
            # Release the figure even when plotting or saving raised.
            plt.close(fig)

    def plot_cases_over_time(self, save_path=None):
        """Plot total COVID-19 cases over time (log-scaled y axis).

        Args:
            save_path: Output image path; if omitted the plot is shown.
        """
        self._plot_metric(
            'total_cases',
            title='Total COVID-19 Cases Over Time',
            ylabel='Total Cases (log scale)',
            description='cases',
            save_path=save_path,
            log_scale=True,
        )

    def plot_vaccination_progress(self, save_path=None):
        """Plot vaccination progress over time.

        Args:
            save_path: Output image path; if omitted the plot is shown.
        """
        self._plot_metric(
            'total_vaccinations',
            title='Vaccination Progress Over Time',
            ylabel='Total Vaccinations',
            description='vaccination',
            save_path=save_path,
        )

    def create_choropleth_map(self, save_path=None):
        """Create interactive choropleth map of total cases.

        Each country contributes its own most recent row, so a country whose
        data ends before the overall latest date still appears on the map.
        The title shows the overall latest date.

        Args:
            save_path: HTML file to write; if omitted the figure is shown.
        """
        if self.df is None:
            logger.error("Data not loaded. Call load_data() first.")
            return

        latest_date = self.df['date'].max()
        if pd.isna(latest_date):
            # Empty frame or no valid dates: nothing to map, and
            # NaT.strftime() would raise while building the title.
            logger.error("No dated rows available; cannot create choropleth map.")
            return

        latest_df = self.df.sort_values('date').groupby('location').tail(1)

        # Only offer hover columns that actually exist in the data.
        hover_cols = [col for col in ("total_deaths", "total_vaccinations")
                      if col in latest_df.columns]

        fig = px.choropleth(
            latest_df,
            locations="iso_code",
            color="total_cases",
            hover_name="location",
            hover_data=hover_cols,
            color_continuous_scale=px.colors.sequential.Plasma,
            title=f"Total COVID-19 Cases by Country (as of {latest_date.strftime('%Y-%m-%d')})",
        )

        if save_path:
            fig.write_html(save_path)
            logger.info("Saved choropleth map to %s", save_path)
        else:
            fig.show()

    def generate_report(self, output_dir='output'):
        """Generate all visualizations and save to output directory.

        Writes two PNG plots and one HTML map, each suffixed with the current
        timestamp. Does nothing (beyond logging an error) when no data has
        been loaded.

        Args:
            output_dir: Directory for the generated files; created together
                with any missing parent directories.
        """
        if self.df is None:
            logger.error("Data not loaded. Call load_data() first.")
            return

        Path(output_dir).mkdir(parents=True, exist_ok=True)

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        cases_path = f"{output_dir}/covid_cases_{timestamp}.png"
        vaccines_path = f"{output_dir}/vaccinations_{timestamp}.png"
        map_path = f"{output_dir}/covid_map_{timestamp}.html"

        self.plot_cases_over_time(save_path=cases_path)
        self.plot_vaccination_progress(save_path=vaccines_path)
        self.create_choropleth_map(save_path=map_path)

        logger.info("Report generated in %s directory", output_dir)

# Script entry point: build the analyzer with its defaults and produce the
# report only when the dataset could be loaded.
if __name__ == "__main__":
    analyzer = COVIDDataAnalyzer()
    data_ready = analyzer.load_data()
    if data_ready:
        analyzer.generate_report()

# Output:
# 2025-05-13 23:17:15,209 - INFO - Loading data from owid-covid-data.csv
# 2025-05-13 23:17:17,400 - INFO - Data loaded successfully. Shape: (5022, 67)
# 2025-05-13 23:17:18,785 - INFO - Saved cases plot to output/covid_cases_20250513_231717.png
# 2025-05-13 23:17:19,741 - INFO - Saved vaccination plot to output/vaccinations_20250513_231717.png
# 2025-05-13 23:17:21,508 - INFO - Saved choropleth map to output/covid_map_20250513_231717.html
# 2025-05-13 23:17:21,509 - INFO - Report generated in output directory
