In [None]:
import random

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

# Function to compute propensity scores
def compute_propensity_scores(df, relevant_columns):
    """Estimate a propensity score for every row of ``df``.

    A ``LogisticRegression`` with default settings is fitted using
    ``df[relevant_columns]`` as features and ``df['group']`` as the target,
    and is then applied back to the same rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column named in ``relevant_columns`` as well as a
        ``group`` column.
    relevant_columns : list
        Labels of the covariate columns fed to the model.

    Returns
    -------
    array
        One value per row of ``df`` (same order): column 1 of
        ``predict_proba``, i.e. the predicted probability of the second
        entry of the fitted model's ``classes_``.
    """
    features = df[relevant_columns]
    classifier = LogisticRegression().fit(features, df['group'])
    # Only the second probability column is kept.
    return classifier.predict_proba(features)[:, 1]

def propensity_score_matching(control, treatment, caliper, n_matches=5, random_state=None):
    """Match control rows to treatment rows with a similar propensity score.

    Control rows are visited in order. For each one, the treatment rows whose
    ``propensity_score`` lies within ``caliper`` of the control row's score
    (both bounds inclusive) and that have not been selected yet are the
    candidates; up to ``n_matches`` of them are drawn at random. A treatment
    row is never selected more than once.

    Parameters
    ----------
    control : pandas.DataFrame
        Control rows; must have a numeric ``propensity_score`` column.
    treatment : pandas.DataFrame
        Treatment rows; must have a numeric ``propensity_score`` column.
    caliper : float
        Maximum absolute score difference allowed for a match.
    n_matches : int, default 5
        Maximum number of treatment rows drawn per control row.
    random_state : int or None, default None
        Seed for the random draws. ``None`` draws from NumPy's global random
        state, so results then follow ``np.random.seed`` if one was set.

    Returns
    -------
    pandas.DataFrame
        The selected treatment rows, grouped in the order their control rows
        were visited. When nothing matches, an empty frame that still carries
        the columns of ``treatment`` is returned, so column lookups on the
        result keep working.
    """
    scores = treatment['propensity_score'].to_numpy(dtype=float)
    # Rows are tracked by position rather than by index label, so repeated
    # labels in ``treatment`` cannot cause an unselected row to be excluded.
    available = np.ones(len(scores), dtype=bool)
    sampler = np.random if random_state is None else np.random.default_rng(random_state)

    picked = []
    for control_score in control['propensity_score'].to_numpy(dtype=float):
        candidates = np.flatnonzero(
            available
            & (scores >= control_score - caliper)
            & (scores <= control_score + caliper)
        )
        if candidates.size == 0:
            continue
        chosen = sampler.choice(candidates, size=min(n_matches, candidates.size), replace=False)
        available[chosen] = False  # without replacement across control rows
        picked.extend(chosen.tolist())

    # One positional slice at the end instead of growing a frame inside the loop.
    return treatment.iloc[np.asarray(picked, dtype=np.intp)]

# Load the datasets
control_group_df = pd.read_csv(fr"E:\Propensity score matching analysis\TAZ_destination.csv")
treatment_group_df = pd.read_csv(fr"E:\STP_PWoD\Data\TAZ_destination_PwoD.csv")

# (disabled) Remove null or NA values from CO_NAME column
# control_group_df.dropna(subset=['CO_NAME'], inplace=True)
# treatment_group_df.dropna(subset=['CO_NAME'], inplace=True)

# Unique values for emp_text column in control_group_df
control_emp_values = control_group_df['emp_text'].unique()
# Keep only treatment rows whose emp_text value also occurs in the control group
treatment_group_df = treatment_group_df[treatment_group_df['emp_text'].isin(control_emp_values)]

# (disabled) Unique values for CO_NAME column in control_group_df
# control_CO_values = control_group_df['CO_NAME'].unique()
# (disabled) Keep only treatment rows whose CO_NAME value is in control_CO_values
# treatment_group_df = treatment_group_df[treatment_group_df['CO_NAME'].isin(control_CO_values)]

# Keep one row per hhmemberid (drop_duplicates keeps the first occurrence by default).
# .copy() makes each frame independent of the frame it was filtered from, so the
# column assignments below do not write into a slice.
control_group_df = control_group_df.drop_duplicates(subset=['hhmemberid']).copy()
treatment_group_df = treatment_group_df.drop_duplicates(subset=['hhmemberid']).copy()

# define categorical variables
cat = ['age', 'gender', 'education', 'licensed', 'emp_text']
# define continuous variables
con = ['num_veh', 'workers', 'hhsize']

# Define relevant columns for the analysis
relevant_columns = cat + con

# Encode categorical variables (only columns stored as strings/objects in the control data)
le = LabelEncoder()
for col in cat:
    if control_group_df[col].dtype == object:
        # Fit on the values of BOTH groups: an encoder fitted on the control
        # column alone raises ValueError in transform() as soon as the
        # treatment column holds a category the control column lacks.
        # When the treatment values are a subset of the control values the
        # resulting codes are the same as with a control-only fit.
        le.fit(pd.concat([control_group_df[col], treatment_group_df[col]]))
        control_group_df[col] = le.transform(control_group_df[col])
        treatment_group_df[col] = le.transform(treatment_group_df[col])

# Assign group labels (0 = control, 1 = treatment)
control_group_df = control_group_df.assign(group=0)
treatment_group_df = treatment_group_df.assign(group=1)

# Combine subsets for propensity score computation
combined_subset = pd.concat([control_group_df, treatment_group_df])
combined_subset['propensity_score'] = compute_propensity_scores(combined_subset, relevant_columns)

# Split the combined subset back into control and treatment with propensity scores
control_subset = combined_subset[combined_subset['group'] == 0]
treatment_subset = combined_subset[combined_subset['group'] == 1]

# Define caliper as 0.25 standard deviations of the propensity score
caliper = 0.25 * combined_subset['propensity_score'].std()

# The matching step draws treatment rows at random; seed NumPy's global random
# state so that re-running the notebook selects the same rows.
# NOTE(review): this also affects any later code that relies on np.random.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

matched_subset = propensity_score_matching(control_subset, treatment_subset, caliper)

display(matched_subset)

In [None]:
# Standard deviation (pandas' default sample std) of the propensity scores in
# matched_subset, i.e. the treatment rows returned by propensity_score_matching
propensity_score_std = matched_subset['propensity_score'].std()
print("Standard Deviation of Propensity Scores in the Matched Dataset:", propensity_score_std)

In [None]:
### Restrict the original treatment file to the hhmemberid values that were matched
# Re-read the unfiltered treatment data from disk
initial_treatment = pd.read_csv(fr"E:\STP_PWoD\Data\TAZ_destination_PwoD.csv")
# Inner join on hhmemberid: only rows whose hhmemberid occurs in matched_subset survive
matching_rows = initial_treatment.merge(
    matched_subset[['hhmemberid']],
    how='inner',
    on='hhmemberid',
)
print('number of trips after matching the person data is:', matching_rows.shape[0])
# Write the retained rows to a CSV file (index column omitted)
matching_rows.to_csv(fr"E:\STP_PWoD\Data\TAZ_destination_PwoD_PSM1.csv", index=False)

In [None]:
#### Keep only the hptripid values shared by the origin and destination datasets
# Read both PSM1 files.
# NOTE(review): the origin PSM1 file is not written anywhere in the cells shown here;
# presumably it is produced by another notebook -- confirm before running.
TAZ_origin_df = pd.read_csv(fr"E:\STP_PWoD\Data\TAZ_origin_PwoD_PSM1.csv")
TAZ_destination_df = pd.read_csv(fr"E:\STP_PWoD\Data\TAZ_destination_PwoD_PSM1.csv")

# Distinct hptripid values per file
unique_origin = set(TAZ_origin_df['hptripid'].unique())
print("Number of unique hptripid in origin:", len(unique_origin))
unique_destination = set(TAZ_destination_df['hptripid'].unique())
print("Number of unique hptripid in destination:", len(unique_destination))

# hptripid values present in both files
common_hptripid = unique_origin & unique_destination
num_common_hptripid = len(common_hptripid)

# Reduce each file to the rows whose hptripid is shared
origin_is_common = TAZ_origin_df['hptripid'].isin(common_hptripid)
destination_is_common = TAZ_destination_df['hptripid'].isin(common_hptripid)
TAZ_origin_common = TAZ_origin_df.loc[origin_is_common]
TAZ_destination_common = TAZ_destination_df.loc[destination_is_common]
# Write the filtered origin and destination data as PSM2 CSV files
TAZ_origin_common.to_csv(fr"E:\STP_PWoD\Data\TAZ_origin_PwoD_PSM2.csv", index=False)
TAZ_destination_common.to_csv(fr"E:\STP_PWoD\Data\TAZ_destination_PwoD_PSM2.csv", index=False)
# Report how many hptripid values the two files share
print("Number of common hptripid in both files:", num_common_hptripid)

# Count distinct hhmemberid values in the final (PSM-matched, common-hptripid) files
# Read the PSM2 files back from disk
TAZ_origin_df1 = pd.read_csv(fr"E:\STP_PWoD\Data\TAZ_origin_PwoD_PSM2.csv")
TAZ_destination_df1 = pd.read_csv(fr"E:\STP_PWoD\Data\TAZ_destination_PwoD_PSM2.csv")
unique_origin1 = set(TAZ_origin_df1['hhmemberid'].unique())
print("Number of unique hhmemberid in origin:", len(unique_origin1))
unique_destination2 = set(TAZ_destination_df1['hhmemberid'].unique())
print("Number of unique hhmemberid in destination:", len(unique_destination2))