In [1]:
import pandas as pd
import os

# Absolute location of the ministry-name mapping CSV (created by the ministry matching code)
file_path = "/Users/vvmohith/Desktop/PROJECT/dataset-final/advanced_matched_ministries_05-06_to_15-16.csv"

# Guard first: without the CSV there is nothing to load, so just tell the user what to do
if not os.path.exists(file_path):
    print(f"File not found: {file_path}")
    print("You need to create this file first by running the ministry matching code")
else:
    ministry_mapping = pd.read_csv(file_path)
    print(f"Successfully loaded mapping file with {len(ministry_mapping)} rows")

    # The distinct values of the 'Matched_Ministry_15_16' column (the most recent
    # naming convention) form the standard ministry list
    standard_ministries = set(ministry_mapping['Matched_Ministry_15_16'].unique())
    print(f"Found {len(standard_ministries)} standardized ministry names")

Successfully loaded mapping file with 85 rows
Found 75 standardized ministry names


In [2]:
import pandas as pd
import numpy as np
import os
import glob
import re

# Function to clean ministry names - reusing your existing function
def clean_ministry_name(name):
    """Normalise a raw ministry label into a bare, lowercase name.

    Missing values (anything ``pd.isna`` reports as NA) become the empty
    string. Otherwise the value is stringified, lower-cased and stripped of:
    a leading "<digits>." numbering, the phrases "ministry of " and
    "department of ", possessive "'s" and stray apostrophes, and any
    parenthesised text. "&" is spelled out as "and" and runs of whitespace
    collapse to single spaces.
    """
    if pd.isna(name):
        return ""

    # Lower-case first, then drop a leading "12. "-style list number
    cleaned = re.sub(r'^\d+\.\s*', '', str(name).lower())

    # Literal substitutions; the order matters ("'s" must go before the bare "'")
    for old, new in (
        ("ministry of ", ""),
        ("department of ", ""),
        ('&', 'and'),
        ("'s", ""),
        ("'", ""),
    ):
        cleaned = cleaned.replace(old, new)

    # Parenthesised remarks are dropped (non-greedy, so each pair is handled separately)
    cleaned = re.sub(r'\(.*?\)', '', cleaned)

    # NOTE: ministry-specific name mappings are not applied in this function

    # Collapse whitespace left behind by the removals above
    return re.sub(r'\s+', ' ', cleaned).strip()

# Base directory: root of the dataset tree. create_budget_time_series() lists the
# year folders inside it, looks for "complete_ministry_mapping.csv" in it, and
# writes "standardized_budget_time_series.csv" back into it.
# NOTE(review): this is a machine-specific absolute path; it has to be edited
# before the notebook can run on another machine.
base_dir = "/Users/vvmohith/Desktop/PROJECT/dataset-final"

# Create a time series dataset from available budget files
def create_budget_time_series(data_dir=None):
    """Create a standardized budget dataset with consistent ministry names.

    If ``complete_ministry_mapping.csv`` exists in the data directory it is
    loaded as-is. Otherwise one CSV per fiscal-year folder (folders whose name
    starts with ``NN-NN``) is read, cleaned and outer-merged on ``Ministry``.

    For every fiscal-year column, named either ``Budget_NN-NN`` (built from
    the year folders) or bare ``NN-NN`` (as found in the cached mapping file),
    a ``Year_20NN`` copy keyed by the starting calendar year is added.

    Parameters
    ----------
    data_dir : str, optional
        Root directory of the dataset. Defaults to the module-level
        ``base_dir``.

    Returns
    -------
    pandas.DataFrame or None
        The combined table, or ``None`` when no mapping file exists and no
        yearly CSV could be processed.

    Side effects
    ------------
    Prints progress messages and writes the result to
    ``standardized_budget_time_series.csv`` inside the data directory.
    """
    root = base_dir if data_dir is None else data_dir

    # Find all year folders
    years = sorted(
        folder for folder in os.listdir(root)
        if os.path.isdir(os.path.join(root, folder))
        and re.match(r'\d{2}-\d{2}', folder)
    )
    print(f"Found {len(years)} year folders")

    # Load the complete ministry mapping if available
    mapping_file = os.path.join(root, "complete_ministry_mapping.csv")
    if os.path.exists(mapping_file):
        master_df = pd.read_csv(mapping_file)
        print(f"Loaded existing ministry mapping with {len(master_df)} ministries")
    else:
        # We'll need to build it from scratch
        print("No existing mapping found. Creating new standardized dataset...")

        all_data = []
        for year_folder in years:
            year_data = _load_year_budget(root, year_folder)
            if year_data is not None:
                all_data.append(year_data)

        if not all_data:
            print("No data processed. Exiting.")
            return None

        # Merge all years on ministry name.
        # NOTE(review): if two rows of one year clean to the same 'Ministry'
        # value, the outer merge multiplies those rows — confirm the yearly
        # files have unique names after cleaning.
        master_df = all_data[0]
        for year_df in all_data[1:]:
            master_df = pd.merge(master_df, year_df, on='Ministry', how='outer')

    # Convert fiscal year columns to calendar years. Collect first, assign
    # afterwards, so the frame is not modified while its columns are scanned.
    calendar_columns = {}
    for col in master_df.columns:
        start_year = _fiscal_start_year(col)
        if start_year is not None:
            # First matching column wins if two columns map to the same year
            calendar_columns.setdefault(f'Year_{start_year}', col)

    # Add calendar year columns
    for year_col, fiscal_col in calendar_columns.items():
        master_df[year_col] = master_df[fiscal_col]

    # Save the standardized dataset
    output_path = os.path.join(root, "standardized_budget_time_series.csv")
    master_df.to_csv(output_path, index=False)
    print(f"Saved standardized budget time series to {output_path}")

    return master_df


def _fiscal_start_year(column_name):
    """Return the starting calendar year for a 'Budget_05-06' or '05-06' column, else None."""
    matched = re.fullmatch(r'(?:Budget_)?(\d{2})-\d{2}', str(column_name))
    if matched is None:
        return None
    # Two-digit fiscal years are interpreted as 20NN
    return int('20' + matched.group(1))


def _load_year_budget(root, year_folder):
    """Read and clean one fiscal year's budget CSV; return None if missing or unreadable."""
    year_path = os.path.join(root, year_folder)
    # glob order is arbitrary; sort so the chosen file is the same on every run
    csv_files = sorted(glob.glob(os.path.join(year_path, f"{year_folder}*.csv")))

    if not csv_files:
        print(f"No CSV found for {year_folder}, skipping")
        return None

    # Prefer files with "_with_share" if available
    share_files = [f for f in csv_files if '_with_share' in os.path.basename(f)]
    file_path = share_files[0] if share_files else csv_files[0]
    budget_name = f'Budget_{year_folder}'

    try:
        df = pd.read_csv(file_path)

        # By position: first column holds the ministry name, second the budget
        ministry_col = df.columns[0]
        budget_col = df.columns[1]

        year_data = pd.DataFrame({
            'Ministry': df[ministry_col].apply(clean_ministry_name),
            # Strip thousands separators before numeric conversion; bad values become NaN
            budget_name: pd.to_numeric(
                df[budget_col].astype(str).str.replace(',', '', regex=False),
                errors='coerce'
            )
        })

        # Remove grand totals and NaN budgets
        is_total = year_data['Ministry'].str.contains('grand total|total', case=False, na=False)
        year_data = year_data[~is_total].dropna(subset=[budget_name])
    except Exception as e:
        # Best effort: one bad file must not abort the whole build
        print(f"Error processing {file_path}: {e}")
        return None

    print(f"Processed {year_folder}: Found {len(year_data)} ministries")
    return year_data

# Run the function
budget_df = create_budget_time_series()

# Display results
if budget_df is not None:
    print("\nFirst few rows of standardized budget data:")
    print(budget_df.head())

    # Calculate completeness over the fiscal-year columns. These are named
    # 'Budget_NN-NN' when built from the year folders, but bare 'NN-NN' when the
    # cached mapping file was loaded, so both spellings are accepted.
    fiscal_cols = [
        col for col in budget_df.columns
        if str(col).startswith('Budget_') or re.fullmatch(r'\d{2}-\d{2}', str(col))
    ]
    if fiscal_cols:
        # Share of non-missing cells, averaged per column and then across columns
        completeness = budget_df[fiscal_cols].notna().mean().mean() * 100
        print(f"\nData completeness: {completeness:.2f}%")
    else:
        # Avoid printing "nan%" when no fiscal-year column is recognised
        print("\nData completeness: n/a (no fiscal-year columns found)")

Found 19 year folders
Loaded existing ministry mapping with 76 ministries
Saved standardized budget time series to /Users/vvmohith/Desktop/PROJECT/dataset-final/standardized_budget_time_series.csv

First few rows of standardized budget data:
                         Base_Ministry    05-06    06-07    07-08    09-10  \
0  Agricultural Research and Education  1942.00  2160.00  2460.00  3241.40   
1        Animal Husbandry and Dairying   710.69   819.86   955.00  1206.28   
2            Agro and Rural Industries   946.63  1054.63  1199.63   590.00   
3                        Atomic Energy  4995.86  5505.08  6130.00  7773.00   
4        Chemicals and Petro-Chemicals   913.25   169.40   257.00   265.15   

     10-11    12-13    13-14     15-16     16-17     18-19     19-20  \
0  3818.05  5392.00  5729.17   6320.00   6620.00   7800.00   8078.76   
1  1398.70  2009.37  2110.16   1227.43   1339.51   3100.00   3100.00   
2   566.14   698.44   991.08   1155.71   1213.49   1669.52   1497.94   
3