# Data Preprocessing

In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import  fisk

In [11]:
# Load the two raw survey extracts from Stata files.
# pd.read_stata already returns a DataFrame when neither `iterator` nor
# `chunksize` is requested, so no extra pd.DataFrame(...) wrap is needed.
income_data = pd.read_stata("income_survey.dta")
vehicle_data = pd.read_stata("vehicle_survey.dta")

..........

Income Data

..........

In [12]:
# Keep only the three columns used below. They are picked by position, so
# positions 0, 16 and 39 must hold HH_ID, TOTAL_MEM_IN_HH and TOTAL_INCOME
# (the names this cell refers to afterwards).
income = income_data.iloc[:, [0, 16, 39]].copy()
# Drop rows whose income equals -99 (presumably a missing-value code -- TODO confirm).
income = income[income['TOTAL_INCOME'] != -99]

# Obtain unique household IDs and count occurrences
unique_hh_ids = income['HH_ID'].unique()
repetition_count = income['HH_ID'].value_counts()

# Calculate total income sum and household size
total_income_sum = income.groupby('HH_ID')['TOTAL_INCOME'].sum()
hh_size = income.groupby('HH_ID')['TOTAL_MEM_IN_HH'].sum()

# Build the table from the HH_ID-indexed Series only, so pandas aligns every
# column on the household ID. `unique_hh_ids` is a plain array in order of
# first appearance; using it as a column would place it positionally and could
# pair an ID with another household's figures.
income_df = pd.DataFrame({
    'occurrence_count': repetition_count,
    'Total Income Sum': total_income_sum,
    'household_size': hh_size
})
# Take the ID from the aligned index so it always matches its own row.
income_df.insert(0, 'id', income_df.index)

# At this point 'household_size' still holds the per-household SUM of
# TOTAL_MEM_IN_HH over all rows, so this is 12 * (summed income / summed members).
income_df['per_capita_income'] = (income_df['Total Income Sum'] * 12) / income_df['household_size']
# Convert the summed member count into the average member count per row.
income_df['household_size'] = income_df['household_size'] / income_df['occurrence_count']

# Save the modified DataFrame to a CSV file
income_df = income_df[['id', 'per_capita_income', 'household_size']]
# index=True also writes the HH_ID index, which duplicates the 'id' column;
# kept so the layout of output_1.csv does not change for existing readers.
income_df.to_csv('output_1.csv', index=True)

..........

Car Stock Data

..........

In [16]:
# Keep the household ID and the two "units owned" columns. They are picked by
# position, so positions 0, 22 and 23 must hold HH_ID and the two
# NO_OF_UNITS_OWNED_AS_OF_NOW_AS_* columns referenced below.
four_wheeler = vehicle_data.iloc[:, [0, 22, 23]].copy()

unique_hh_ids = four_wheeler['HH_ID'].unique()
repetition_count = four_wheeler['HH_ID'].value_counts()
# Per-household totals; this cell treats ..._AS_9 as scooters and ..._AS_8 as cars.
total_scooter_count = four_wheeler.groupby('HH_ID')['NO_OF_UNITS_OWNED_AS_OF_NOW_AS_9'].sum()
total_car_count = four_wheeler.groupby('HH_ID')['NO_OF_UNITS_OWNED_AS_OF_NOW_AS_8'].sum()

# Build the table from the HH_ID-indexed Series only, so pandas aligns every
# column on the household ID. `unique_hh_ids` is a plain array in order of
# first appearance; using it as a column would place it positionally and could
# pair an ID with another household's counts.
four_wheeler_df = pd.DataFrame({
    'total_car_count': total_car_count,
    'total_scooter_count': total_scooter_count,
    'occurrence_count': repetition_count
})
# Take the ID from the aligned index and keep it as the first column.
four_wheeler_df.insert(0, 'id', four_wheeler_df.index)

# NOTE(review): this zeroes the car count of every household that owns at
# least one scooter -- confirm this exclusive-ownership rule is intended.
four_wheeler_df.loc[four_wheeler_df['total_scooter_count'] > 0, 'total_car_count'] = 0

# Save the modified DataFrame to a CSV file
# index=True also writes the HH_ID index, which duplicates the 'id' column;
# kept so the layout of output_3.csv does not change for existing readers.
four_wheeler_df.to_csv('output_3.csv', index=True)

..........

Scooter Stock Data

..........

In [18]:
# Keep the household ID and the two "units owned" columns. They are picked by
# position, so positions 0, 22 and 23 must hold HH_ID and the two
# NO_OF_UNITS_OWNED_AS_OF_NOW_AS_* columns referenced below.
two_wheeler = vehicle_data.iloc[:, [0, 22, 23]].copy()

unique_hh_ids = two_wheeler['HH_ID'].unique()
repetition_count = two_wheeler['HH_ID'].value_counts()
# Per-household totals; this cell treats ..._AS_9 as scooters and ..._AS_8 as cars.
total_scooter_count = two_wheeler.groupby('HH_ID')['NO_OF_UNITS_OWNED_AS_OF_NOW_AS_9'].sum()
total_car_count = two_wheeler.groupby('HH_ID')['NO_OF_UNITS_OWNED_AS_OF_NOW_AS_8'].sum()

# Build the table from the HH_ID-indexed Series only, so pandas aligns every
# column on the household ID. `unique_hh_ids` is a plain array in order of
# first appearance; using it as a column would place it positionally and could
# pair an ID with another household's counts.
two_wheeler_df = pd.DataFrame({
    'total_car_count': total_car_count,
    'total_scooter_count': total_scooter_count,
    'occurrence_count': repetition_count
})
# Take the ID from the aligned index and keep it as the first column.
two_wheeler_df.insert(0, 'id', two_wheeler_df.index)

# NOTE(review): this zeroes the scooter count of every household that owns at
# least one car -- confirm this exclusive-ownership rule is intended.
two_wheeler_df.loc[two_wheeler_df['total_car_count'] > 0, 'total_scooter_count'] = 0

# Save the modified DataFrame to a CSV file
# index=True also writes the HH_ID index, which duplicates the 'id' column;
# kept so the layout of output_2.csv does not change for existing readers.
two_wheeler_df.to_csv('output_2.csv', index=True)