In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
import random

# Function to compute propensity scores
def compute_propensity_scores(df, relevant_columns):
    """Estimate propensity scores with a default logistic regression.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column in ``relevant_columns`` plus a ``'group'``
        column that serves as the classification target.
    relevant_columns : list of str
        Names of the covariate columns fed to the model.

    Returns
    -------
    numpy.ndarray
        One value per row of ``df``: the second column of ``predict_proba``,
        i.e. the predicted probability of the second class in the fitted
        model's ``classes_`` (group 1 when the labels are 0/1).
    """
    features = df[relevant_columns]
    target = df['group']
    # The model is fitted and scored on the same rows (in-sample probabilities).
    classifier = LogisticRegression().fit(features, target)
    return classifier.predict_proba(features)[:, 1]

def propensity_score_matching(control, treatment, caliper, max_matches=5, random_state=None):
    """Draw treatment rows that fall within a propensity-score caliper of each control row.

    Control rows are visited in order. For each one, up to ``max_matches``
    treatment rows that have not been used yet and whose ``propensity_score``
    lies in ``[control_score - caliper, control_score + caliper]`` (inclusive)
    are sampled at random. A treatment row is used at most once; rows are
    tracked by their index label.

    Parameters
    ----------
    control : pandas.DataFrame
        Rows to match against; must have a ``'propensity_score'`` column.
    treatment : pandas.DataFrame
        Candidate rows; must have a ``'propensity_score'`` column.
    caliper : float
        Maximum absolute difference in propensity score for a valid match.
    max_matches : int, default 5
        Upper bound on the number of treatment rows drawn per control row.
    random_state : int or None, default None
        Seed for the sampling. ``None`` keeps the draws non-deterministic;
        an integer makes the whole matching run reproducible.

    Returns
    -------
    pandas.DataFrame
        The selected treatment rows in selection order. If nothing matches,
        an empty frame that still carries ``treatment``'s columns.

    Raises
    ------
    ValueError
        If ``max_matches`` is smaller than 1.
    """
    # Local import: the notebook's import cell does not load numpy.
    import numpy as np

    if max_matches < 1:
        raise ValueError("max_matches must be at least 1")

    # One generator shared by all draws, so a single seed fixes the full run.
    rng = np.random.RandomState(random_state)

    treatment_scores = treatment['propensity_score']
    used_indices = set()
    selections = []

    # Only the score is needed from each control row, so iterate the column
    # directly instead of materialising every row with iterrows().
    for control_score in control['propensity_score']:
        within_caliper = treatment_scores.between(control_score - caliper,
                                                  control_score + caliper)
        candidates = treatment[within_caliper & ~treatment.index.isin(used_indices)]
        if candidates.empty:
            continue

        chosen = candidates.sample(n=min(max_matches, len(candidates)),
                                   replace=False, random_state=rng)
        selections.append(chosen)
        # Remember the drawn rows so later control rows cannot reuse them.
        used_indices.update(chosen.index)

    if not selections:
        # Keep the column layout so callers can still select columns by name.
        return treatment.iloc[0:0].copy()

    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(selections)

# Load the datasets (absolute, machine-specific paths)
control_group_df = pd.read_csv(r"E:\Propensity score matching analysis\TAZ_origin.csv")
treatment_group_df = pd.read_csv(r"E:\STP_PWoD\Data\TAZ_origin_PwoD.csv")

# # Remove null or NA values from CO_NAME column
# control_group_df.dropna(subset=['CO_NAME'], inplace=True)
# treatment_group_df.dropna(subset=['CO_NAME'], inplace=True)

# Unique values for emp_text column in control_group_df
control_emp_values = control_group_df['emp_text'].unique()
# Keep only treatment rows whose emp_text value also occurs in the control data
treatment_group_df = treatment_group_df[treatment_group_df['emp_text'].isin(control_emp_values)]

# # Unique values for CO_NAME column in control_group_df
# control_CO_values = control_group_df['CO_NAME'].unique()
# # Filtering treatment_group_df to have only those values in control_CO_values
# treatment_group_df = treatment_group_df[treatment_group_df['CO_NAME'].isin(control_CO_values)]

# Keep one row (the first encountered) per 'hhmemberid'
control_group_df = control_group_df.drop_duplicates(subset=['hhmemberid'])
treatment_group_df = treatment_group_df.drop_duplicates(subset=['hhmemberid'])

# define categorical variables
cat=['age', 'gender', 'education', 'licensed', 'emp_text']
# define continuous variables
con=['num_veh', 'workers', 'hhsize']

# Define relevant columns for the analysis
relevant_columns = cat+con

# Encode categorical variables as integer codes.
# The encoder is fitted on the labels of BOTH groups: only emp_text was
# filtered above, so any other categorical column may hold a treatment label
# that never occurs in the control data, and LabelEncoder.transform raises
# ValueError on labels it was not fitted on. When the treatment labels are a
# subset of the control labels, the resulting codes are the same as with a
# control-only fit.
le = LabelEncoder()
for col in cat:
    if control_group_df[col].dtype == object:
        le.fit(pd.concat([control_group_df[col], treatment_group_df[col]]))
        control_group_df[col] = le.transform(control_group_df[col])
        treatment_group_df[col] = le.transform(treatment_group_df[col])

# Assign group labels (0 = control, 1 = treatment)
control_group_df = control_group_df.assign(group=0)
treatment_group_df = treatment_group_df.assign(group=1)

# Combine subsets for propensity score computation
combined_subset = pd.concat([control_group_df, treatment_group_df])
combined_subset['propensity_score'] = compute_propensity_scores(combined_subset, relevant_columns)

# Split the combined subset back into control and treatment with propensity scores
control_subset = combined_subset[combined_subset['group'] == 0]
treatment_subset = combined_subset[combined_subset['group'] == 1]

# Define caliper as 0.25 standard deviations of the propensity score
caliper = 0.25 * combined_subset['propensity_score'].std()

# NOTE: the matching draws treatment rows at random, so the matched set
# differs between runs unless the sampling is seeded.
matched_subset = propensity_score_matching(control_subset, treatment_subset, caliper)

display(matched_subset)
# Export the matched treatment rows as CSV
matched_subset.to_csv(r"E:\STP_PWoD\Data\TAZ_PwoD_PSM_hhmember.csv", index=False)

Unnamed: 0,hptripid,tripID,hhmemberid,o_purp_t,depart_hhm,mode_t,hhsize,workers,hh_income,num_veh,...,d_CoTAZID_v30,ACRES,DEVACRES,DEVPBLEPCT,X,Y,CO_NAME,CITY_NAME,group,propensity_score
22717,W59245FU.01.01,1,W59245FU.01,home,600,auto,7,1,-1,2,...,110160,80.420980,80.420980,1.0,418233.184712,4.544694e+06,DAVIS,Layton,1,0.994008
3033,M19607WR.01.01,1,M19607WR.01,home,1050,auto,3,3,-1,3,...,490359,206.599606,206.599606,1.0,429023.318978,4.474159e+06,UTAH,Lehi,1,0.995202
25132,W67581HT.01.01,1,W67581HT.01,home,1800,auto,2,2,3,2,...,570279,165.725154,165.725154,1.0,420098.405715,4.561620e+06,WEBER,Ogden,1,0.995978
26926,W74129NS.01.01,1,W74129NS.01,home,810,auto,2,2,4,2,...,350124,97.783061,97.783061,1.0,429192.027786,4.507498e+06,SALT LAKE,Salt Lake City,1,0.995822
16610,W38073MU.01.01,1,W38073MU.01,home,640,auto,1,1,5,2,...,350729,85.055080,85.055080,1.0,425655.822985,4.504744e+06,SALT LAKE,South Salt Lake,1,0.990316
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27312,W75664TR.02.01,1,W75664TR.02,home,755,auto,2,1,4,1,...,350501,111.135645,111.135645,1.0,429165.308640,4.506920e+06,SALT LAKE,Salt Lake City,1,0.990423
23430,W61733SZ.01.01,1,W61733SZ.01,home,745,auto,2,2,5,2,...,350272,43.265705,43.265705,1.0,426845.851127,4.513154e+06,SALT LAKE,Salt Lake City,1,0.995822
23729,W62747DT.02.01,1,W62747DT.02,home,700,auto,3,2,-1,4,...,570396,409.555912,409.555912,1.0,419789.516914,4.540563e+06,DAVIS,Kaysville,1,0.993281
5155,M25782NC.05.01,1,M25782NC.05,home,945,auto,6,1,1,5,...,490725,31.125857,31.125857,1.0,445147.585421,4.454523e+06,UTAH,Provo,1,0.995234


In [2]:
# Spread of the propensity scores among the matched treatment rows
matched_scores = matched_subset['propensity_score']
propensity_score_std = matched_scores.std()
print("Standard Deviation of Propensity Scores in the Matched Dataset:", propensity_score_std)

Standard Deviation of Propensity Scores in the Matched Dataset: 0.11174607603567492


In [3]:
### Restrict the original treatment files to the matched hhmemberid values
# Re-read the treatment origin and destination tables from disk
initial_treatment_orig = pd.read_csv(fr"E:\STP_PWoD\Data\TAZ_origin_PwoD.csv")
initial_treatment_dest = pd.read_csv(fr"E:\STP_PWoD\Data\TAZ_destination_PwoD.csv")

# Inner-join each table with the matched ids so only matched persons' rows remain
matched_member_ids = matched_subset[['hhmemberid']]
matching_rows_orig = initial_treatment_orig.merge(matched_member_ids, on='hhmemberid', how='inner')
matching_rows_dest = initial_treatment_dest.merge(matched_member_ids, on='hhmemberid', how='inner')
print('number of trips after matching the person data is:', matching_rows_orig.shape[0])

# Write both filtered tables to CSV
matching_rows_orig.to_csv(fr"E:\STP_PWoD\Data\TAZ_origin_PwoD_PSM1.csv", index=False)
matching_rows_dest.to_csv(fr"E:\STP_PWoD\Data\TAZ_destination_PwoD_PSM1.csv", index=False)

number of trips after matching the person data is: 1923


In [4]:
#### Find the hptripid values shared by the origin and destination files
# Read back the two PSM1 files
TAZ_origin_df = pd.read_csv(fr"E:\STP_PWoD\Data\TAZ_origin_PwoD_PSM1.csv")
TAZ_destination_df = pd.read_csv(fr"E:\STP_PWoD\Data\TAZ_destination_PwoD_PSM1.csv")

# Distinct hptripid values per file
origin_trip_ids = TAZ_origin_df['hptripid'].unique()
unique_origin = set(origin_trip_ids)
print("Number of unique hptripid in origin:", len(unique_origin))
destination_trip_ids = TAZ_destination_df['hptripid'].unique()
unique_destination = set(destination_trip_ids)
print("Number of unique hptripid in destination:", len(unique_destination))

# Ids present in both files, and how many there are
common_hptripid = unique_origin & unique_destination
num_common_hptripid = len(common_hptripid)


Number of unique hptripid in origin: 1923
Number of unique hptripid in destination: 1923


In [5]:

# Keep only the rows whose hptripid occurs in both files
origin_is_common = TAZ_origin_df['hptripid'].isin(common_hptripid)
destination_is_common = TAZ_destination_df['hptripid'].isin(common_hptripid)
TAZ_origin_common = TAZ_origin_df.loc[origin_is_common]
TAZ_destination_common = TAZ_destination_df.loc[destination_is_common]

# Save the filtered origin and destination tables as CSV files
TAZ_origin_common.to_csv(fr"E:\STP_PWoD\Data\TAZ_origin_PwoD_PSM2.csv", index=False)
TAZ_destination_common.to_csv(fr"E:\STP_PWoD\Data\TAZ_destination_PwoD_PSM2.csv", index=False)

# Report how many trip ids the two files share
print("Number of common hptripid in both files:", num_common_hptripid)

# Count distinct hhmemberid values in the final files
# (after PSM matching and after keeping only the common hptripid values)
TAZ_origin_df1 = pd.read_csv(fr"E:\STP_PWoD\Data\TAZ_origin_PwoD_PSM2.csv")
TAZ_destination_df1 = pd.read_csv(fr"E:\STP_PWoD\Data\TAZ_destination_PwoD_PSM2.csv")

origin_member_ids = TAZ_origin_df1['hhmemberid'].unique()
unique_origin1 = set(origin_member_ids)
print("Number of unique hhmemberid in origin:", len(unique_origin1))

destination_member_ids = TAZ_destination_df1['hhmemberid'].unique()
unique_destination2 = set(destination_member_ids)
print("Number of unique hhmemberid in destination:", len(unique_destination2))

Number of common hptripid in both files: 1923
Number of unique hhmemberid in origin: 730
Number of unique hhmemberid in destination: 730
