In [1]:
import pandas as pd
import re

# Path to the input CSV file
input_file = '/Users/vvmohith/Desktop/PROJECT/11-12/11-12-actual.csv'

# Read the file as text
with open(input_file, 'r') as file:
    lines = file.readlines()

# Filter out lines containing "Revenue" or "Capital"
filtered_lines = [line for line in lines if not re.search(r'^Revenue,|^Capital,', line)]

# Path for the output file
output_file = '/Users/vvmohith/Desktop/PROJECT/11-12/11-12-actual-filtered.csv'

# Write the filtered content to the new file
with open(output_file, 'w') as file:
    file.writelines(filtered_lines)

print(f"Filtered file saved to {output_file}")

# You can also load the filtered data into a DataFrame if needed
data = pd.read_csv(output_file)
data.head()

Filtered file saved to /Users/vvmohith/Desktop/PROJECT/11-12/11-12-actual-filtered.csv


Unnamed: 0,Page : 4,Unnamed: 1
0,MINISTRY/DEPARTMENT,Total
1,I. CENTRAL SECTOR,1147726.19
2,Agriculture and Cooperation,9662.00
3,Agriculture Research and Education,4957.60
4,"Animal Husbandry, Dairying and Fisheries",1696.25


In [2]:
import pandas as pd
import re
import numpy as np

# Path to the input CSV file
input_file = '/Users/vvmohith/Desktop/PROJECT/17-18-actual.csv'

# Read the file as text
with open(input_file, 'r') as file:
    lines = file.readlines()

# Initialize lists to store the cleaned data
departments = []
allocations = []
current_dept = None

# Process each line
for line in lines:
    line = line.strip()
    
    # Skip empty lines
    if not line:
        continue
    
    # Check if line starts with a number (indicating a department)
    dept_match = re.match(r'^(\d+\.\s+.*?),$', line)
    if dept_match or re.match(r'^"(\d+\.\s+.*?)",$', line):
        current_dept = line.strip(',').strip('"')
    
    # Check if line contains a department name and value
    parts = line.split(',')
    if len(parts) >= 2 and parts[0] and parts[1] and parts[0][0] not in [',', ' ']:
        dept = parts[0].strip('"')
        # Try to convert allocation to a number
        try:
            alloc = float(parts[1].strip())
            departments.append(dept)
            allocations.append(alloc)
        except ValueError:
            # Skip if allocation isn't a valid number
            pass

# Create a dataframe with only departments and their total allocations
df = pd.DataFrame({
    'MINISTRY/DEPARTMENT': departments,
    'Total': allocations
})

# Filter out rows that don't have a department number
df = df[df['MINISTRY/DEPARTMENT'].str.contains(r'^\d+\.', regex=True) | (df['MINISTRY/DEPARTMENT'] == 'MINISTRY/DEPARTMENT')]

# Save the cleaned data
output_file = '/Users/vvmohith/Desktop/PROJECT/17-18-actual-cleaned.csv'
df.to_csv(output_file, index=False)

print(f"Cleaned file saved to {output_file}")

# Display the first few rows of the cleaned data
df.head(10)

Cleaned file saved to /Users/vvmohith/Desktop/PROJECT/17-18-actual-cleaned.csv


Unnamed: 0,MINISTRY/DEPARTMENT,Total
0,2. Department of Agricultural Research and Edu...,6800.0
1,4. Atomic Energy,12461.2
2,6. Department of Chemicals and Petrochemicals,298.0
3,7. Department of Fertilisers,70032.71
4,8. Department of Pharmaceuticals,247.74
5,9. Ministry of Civil Aviation,2702.0
6,10. Ministry of Coal,745.1
7,11. Department of Commerce,4465.83
8,12. Department of Industrial Policy and Promotion,3608.87
9,13. Department of Posts,9550.0


In [3]:
import pandas as pd
import re

# Path to the input CSV file
input_file = '/Users/vvmohith/Desktop/PROJECT/17-18-actual.csv'

# Read the file as text
with open(input_file, 'r') as file:
    lines = file.readlines()

# Initialize lists to store the cleaned data
departments = []
allocations = []

# Process each line
for i, line in enumerate(lines):
    line = line.strip()
    
    # Skip empty lines
    if not line:
        continue
    
    # Special handling for lines with department numbers
    if re.match(r'^(\d+\.\s+|"\d+\.\s+)', line):
        parts = line.split(',')
        if len(parts) >= 2:
            dept_name = parts[0].strip('"')
            try:
                # Try to get the allocation value
                alloc_value = float(parts[1].strip())
                departments.append(dept_name)
                allocations.append(alloc_value)
            except (ValueError, IndexError):
                # If there's an issue with the allocation, use the next line
                if i + 1 < len(lines) and lines[i+1].strip().startswith(','):
                    try:
                        alloc_value = float(lines[i+1].strip(','))
                        departments.append(dept_name)
                        allocations.append(alloc_value)
                    except ValueError:
                        pass
    
    # Special handling for Grand Total
    if line.startswith('Grand Total'):
        parts = line.split(',')
        if len(parts) >= 2:
            try:
                alloc_value = float(parts[1].strip())
                departments.append('Grand Total')
                allocations.append(alloc_value)
            except ValueError:
                pass

# Create a dataframe
df = pd.DataFrame({
    'MINISTRY/DEPARTMENT': departments,
    'Total': allocations
})

# Special check to ensure we have all departments
missing_depts = {
    '27': 'Ministry of Environment, Forests and Climate Change',
    '38': 'Repayment of Debt',
    '64': 'Ministry of Micro, Small and Medium Enterprises',
    '70': 'Ministry of Personnel, Public Grievances and Pensions', 
    '98': 'Ministry of Water Resources, River Development and Ganga Rejuvenation'
}

# Verify all departments are present (1-100)
dept_numbers = []
for dept in df['MINISTRY/DEPARTMENT']:
    match = re.match(r'^(\d+)\.', dept)
    if match:
        dept_numbers.append(int(match.group(1)))

for num in range(1, 101):
    if num not in dept_numbers:
        print(f"Department number {num} is missing")

# Save the cleaned data
output_file = '/Users/vvmohith/Desktop/PROJECT/17-18-actual-complete.csv'
df.to_csv(output_file, index=False)

print(f"Complete department list saved to {output_file}")

# Display the first few rows of the data
df.head()

Department number 1 is missing
Department number 38 is missing
Complete department list saved to /Users/vvmohith/Desktop/PROJECT/17-18-actual-complete.csv


Unnamed: 0,MINISTRY/DEPARTMENT,Total
0,2. Department of Agricultural Research and Edu...,6800.0
1,3. Department of Animal Husbandry,2034.7
2,4. Atomic Energy,12461.2
3,5. Ministry of Ayurveda,68.86
4,6. Department of Chemicals and Petrochemicals,298.0


In [4]:
import pandas as pd
import re

# Path to the input CSV file
input_file = '/Users/vvmohith/Desktop/PROJECT/2008-09.csv'

# Read the file as text
with open(input_file, 'r') as file:
    lines = file.readlines()

# Filter out lines starting with "Revenue," or "Capital,"
filtered_lines = []
for line in lines:
    if not line.strip().startswith('Revenue,') and not line.strip().startswith('Capital,'):
        filtered_lines.append(line)

# Path for the output file
output_file = '/Users/vvmohith/Desktop/PROJECT/2008-09-filtered.csv'

# Write the filtered content to the new file
with open(output_file, 'w') as file:
    file.writelines(filtered_lines)

print(f"Filtered file saved to {output_file}")

# You can also load the filtered data into a DataFrame if needed
try:
    data = pd.read_csv(output_file)
    print("Preview of the filtered data:")
    print(data.head())
except Exception as e:
    print(f"Note: Could not display data as DataFrame due to: {e}")
    print("This is expected if the CSV format is irregular, but the file is still created.")

Filtered file saved to /Users/vvmohith/Desktop/PROJECT/2008-09-filtered.csv
Preview of the filtered data:
                   MINISTRY/DEPARTMENT Unnamed: 1
0                                 Plan      Total
1            Agriculture & Cooperation    7528.78
2  Agricultural Research and Education    2680.00
3        Animal Husbandry and Dairying    1062.43
4                        Atomic Energy    4797.00
