In [None]:
import pandas as pd 
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import random

# Read the destination-trip tables for the control and treatment groups
control_group_df = pd.read_csv(r"E:\Propensity score matching analysis\TAZ_destination.csv")
treatment_group_df = pd.read_csv(r"E:\STP_PWoD\Data\TAZ_destination_PwoD.csv")
for group_label, group_df in (('control', control_group_df), ('treatment', treatment_group_df)):
    print(f'persons in {group_label} group:', group_df['hhmemberid'].nunique())

# Number of control-group trips recorded in each county (CO_NAME)
value_counts = control_group_df['CO_NAME'].value_counts()
for name, count in zip(value_counts.index, value_counts.tolist()):
    print(f"{name}: {count}")

# Covariates used for the propensity model ('hh_income' is currently left out)
relevant_columns = ['age', 'gender', 'education', 'licensed', 'num_veh', 'workers', 'hhsize', 'emp_text', 'CO_NAME']

# Show the distinct values of every covariate, first for control, then for treatment
for heading, group_df in (
    ("Unique values in Control Group:", control_group_df),
    ("\nUnique values in Treatment Group:", treatment_group_df),
):
    print(heading)
    for col in relevant_columns:
        print(f"{col}: {group_df[col].unique()}")

# Employment categories and counties that occur in the control group
control_emp_values = control_group_df['emp_text'].unique()
control_county_values = control_group_df['CO_NAME'].unique()

# Keep only treatment rows whose employment category AND county both occur in the control group
emp_seen_in_control = treatment_group_df['emp_text'].isin(control_emp_values)
county_seen_in_control = treatment_group_df['CO_NAME'].isin(control_county_values)
treatment_group_df = treatment_group_df[emp_seen_in_control & county_seen_in_control]

# # Filter out rows where 'hh_income' is -1 in both datasets
# control_group_df = control_group_df[control_group_df['hh_income'] != -1]
# treatment_group_df = treatment_group_df[treatment_group_df['hh_income'] != -1]

# Keep only the matching covariates. The explicit .copy() detaches each frame
# from the frame it was sliced from, so the column assignments below modify
# these frames directly instead of triggering SettingWithCopyWarning.
control_group_df = control_group_df[relevant_columns].copy()
treatment_group_df = treatment_group_df[relevant_columns].copy()

# Label-encode every covariate that is not already numeric.
# - is_numeric_dtype (rather than `dtype == object`) also catches string and
#   category dtypes, which would otherwise reach the model un-encoded.
# - Each encoder is fitted on the values of BOTH groups: an encoder fitted on
#   the control group alone makes transform() raise ValueError as soon as the
#   treatment group contains a category the control group lacks. When the
#   treatment categories are a subset of the control categories the resulting
#   codes are identical to fitting on the control group only.
le = LabelEncoder()
label_encoders = {}  # column name -> fitted encoder, kept so codes can be decoded later
for col in relevant_columns:
    if pd.api.types.is_numeric_dtype(control_group_df[col]):
        continue
    le = LabelEncoder()
    le.fit(pd.concat([control_group_df[col], treatment_group_df[col]]))
    control_group_df[col] = le.transform(control_group_df[col])
    treatment_group_df[col] = le.transform(treatment_group_df[col])
    label_encoders[col] = le

# Preparing the data for propensity score estimation: treatment rows are
# labelled group=1, control rows group=0. The index is intentionally NOT reset,
# so every row keeps the label it had when its CSV was read.
combined_df = pd.concat([treatment_group_df.assign(group=1), control_group_df.assign(group=0)])
X = combined_df[relevant_columns]
y = combined_df['group']

# Checking the balance before matching
print("Mean values of covariates before matching:")
print(combined_df.groupby('group').mean())

def calculate_standardized_difference(df1, df2, covariates):
    """Compute the standardized mean difference of each covariate between two groups.

    For every covariate, the difference of the group means (``df1`` minus
    ``df2``) is divided by the pooled standard deviation
    ``sqrt((std1**2 + std2**2) / 2)``, where ``std1``/``std2`` are the pandas
    default (sample) standard deviations of the column in each frame.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        The two groups to compare; both must contain every name in ``covariates``.
    covariates : iterable of str
        Column names to evaluate.

    Returns
    -------
    pandas.DataFrame
        One row per covariate with the columns ``'Covariate'`` and
        ``'Standardized Difference'``. If the pooled standard deviation is zero
        (the covariate is constant in both groups) the difference is ``0.0``
        when the means agree and a signed infinity otherwise.
    """
    results = []
    for covariate in covariates:
        mean_diff = df1[covariate].mean() - df2[covariate].mean()
        std1 = df1[covariate].std()
        std2 = df2[covariate].std()
        pooled_std = ((std1**2 + std2**2) / 2)**0.5
        if pooled_std == 0:
            # Avoid 0/0 -> NaN (and the accompanying runtime warning) for a
            # covariate that does not vary in either group.
            if mean_diff == 0:
                std_diff = 0.0
            else:
                std_diff = float('inf') if mean_diff > 0 else float('-inf')
        else:
            std_diff = mean_diff / pooled_std
        results.append({'Covariate': covariate, 'Standardized Difference': std_diff})
    # Explicit columns keep the output schema stable even when `covariates` is empty
    return pd.DataFrame(results, columns=['Covariate', 'Standardized Difference'])

# # Assuming you have two dataframes df1 and df2, and a list of covariates
# covariates = relevant_columns
# standardized_differences = calculate_standardized_difference(control_group_df, treatment_group_df, covariates)
# print('****************************************')
# print('Before matching standardized_differences',standardized_differences)
# print('****************************************')

# Logistic regression model for propensity score: column 1 of predict_proba is
# the estimated probability that a row belongs to group 1 (treatment).
log_reg_model = LogisticRegression()
log_reg_model.fit(X, y)
combined_df['propensity_score'] = log_reg_model.predict_proba(X)[:, 1]

# Separating the treatment and control groups
treatment_df = combined_df[combined_df['group'] == 1]
control_df = combined_df[combined_df['group'] == 0]

# Relaxed criteria for matching
CALIPER_STD_FRACTION = 0.50   # caliper width as a fraction of the propensity-score std
MAX_MATCHES_PER_CONTROL = 5   # treatment rows drawn for each control row
caliper = CALIPER_STD_FRACTION * combined_df['propensity_score'].std()

# For each control row, draw (with replacement) up to MAX_MATCHES_PER_CONTROL
# treatment rows whose score lies within +/- caliper of the control score.
# The sampled frames are collected as DataFrames, not as lists of raw values,
# so each matched row keeps its original index label and column dtypes. The
# index is what the later lookup into the full treatment file relies on;
# rebuilding the frame from plain lists would replace it with 0..N-1.
matched_samples = []
for index, control_score in control_df['propensity_score'].items():
    potential_matches = treatment_df[
        treatment_df['propensity_score'].between(control_score - caliper, control_score + caliper)
    ]
    if potential_matches.empty:
        continue
    # Seeding with the control row's index label keeps the draw reproducible
    matched_samples.append(
        potential_matches.sample(
            n=min(MAX_MATCHES_PER_CONTROL, len(potential_matches)),
            replace=True,
            random_state=index,
        )
    )

if matched_samples:
    expanded_matched_treatment_df = pd.concat(matched_samples)
else:
    # No control row had a treatment row inside the caliper: empty frame, same columns
    expanded_matched_treatment_df = treatment_df.iloc[0:0]

# Combining matched treatment and control groups for balance check
matched_df = pd.concat([expanded_matched_treatment_df, control_df])

# Checking the balance: compare the mean of the covariates in the matched groups
balance_check = matched_df.groupby('group').mean()
print(balance_check)

# Resulting dataset size
print("Size of Expanded Treatment Group:", len(expanded_matched_treatment_df))
print("Size of Control Group:", len(control_df))

In [None]:
# Re-read the untouched treatment file so that every original column is available again
original_treatment_df = pd.read_csv(r"E:\STP_PWoD\Data\TAZ_destination_PwoD.csv")

# Pull the matched rows out of the full file by position.
# NOTE(review): this lookup is only correct if expanded_matched_treatment_df.index
# holds the row positions of this CSV (i.e. the original indices were preserved
# through the matching step) - confirm before trusting the export.
matched_row_positions = expanded_matched_treatment_df.index
full_data_matched_treatment_df = original_treatment_df.iloc[matched_row_positions]

# Persist the full-column matched treatment rows and report how many distinct persons they cover
full_data_matched_treatment_df.to_csv(r"E:\STP_PWoD\Data\TAZ_destination_PwoD_PSM1.csv", index=False)
matched_person_count = full_data_matched_treatment_df['hhmemberid'].nunique()
print('persons in treatment group:', matched_person_count)

# First few rows, displayed for verification
full_data_matched_treatment_df.head()

# standardized_differences_aftermatch = calculate_standardized_difference(control_group_df, full_data_matched_treatment_df, covariates)
# print('****************************************')
# print('After matching standardized_differences',standardized_differences_aftermatch)
# print('****************************************')

In [None]:
# Number of matched treatment rows per county (CO_NAME), most frequent first
value_counts = full_data_matched_treatment_df['CO_NAME'].value_counts()

for name, count in zip(value_counts.index, value_counts.tolist()):
    print(f"{name}: {count}")

#### Keeping only the hptripids that appear in both the origin and destination datasets

In [None]:
# Read the matched origin and destination trip tables
TAZ_origin_df = pd.read_csv(r'E:\Propensity score matching analysis\TAZ_origin_PwoD_PSM.csv')
TAZ_destination_df = pd.read_csv(r'E:\Propensity score matching analysis\TAZ_destination_PwoD_PSM.csv')

# Distinct trip ids present in each table, and the ids the two tables share
unique_origin = set(TAZ_origin_df['hptripid'].unique())
unique_destination = set(TAZ_destination_df['hptripid'].unique())
common_hptripid = unique_origin & unique_destination
num_common_hptripid = len(common_hptripid)

# Restrict both tables to the shared trip ids
origin_is_common = TAZ_origin_df['hptripid'].isin(common_hptripid)
destination_is_common = TAZ_destination_df['hptripid'].isin(common_hptripid)
TAZ_origin_common = TAZ_origin_df[origin_is_common]
TAZ_destination_common = TAZ_destination_df[destination_is_common]

# Write the restricted tables to CSV files in the current working directory
TAZ_origin_common.to_csv('TAZ_origin_PwoD_PSM1.csv', index=False)
TAZ_destination_common.to_csv('TAZ_destination_PwoD_PSM1.csv', index=False)

# Report how many trip ids the two tables have in common
print(f"Number of common hptripid in both files: {num_common_hptripid}")