In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, roc_auc_score
from sklearn.preprocessing import label_binarize

# Model Development

In [None]:
from google.colab import drive

# Attach Google Drive so files under MyDrive are readable from this runtime.
drive.mount('/content/drive')

# Load df_clean.csv from the project folder and preview its first five rows.
path = '/content/drive/MyDrive/MLProject/data/df_clean.csv'
df = pd.read_csv(filepath_or_buffer=path)
df.head(n=5)

Mounted at /content/drive


Unnamed: 0,accommodates,bathrooms,cleaning_fee,host_has_profile_pic,host_identity_verified,host_response_rate,instant_bookable,number_of_reviews,bedrooms,days_between_reviews,...,property_type_Other,property_type_Townhouse,room_type_Private room,room_type_Shared room,cancellation_policy_moderate,cancellation_policy_strict,neighbourhood_group_Low Frequency,neighbourhood_group_Medium Frequency,rating_bucket,text
0,0.0579,-0.328183,1,1,1,-0.025213,0,-0.593703,-0.223636,-0.785774,...,0,0,0,0,0,1,0,1,2,"Beautiful brownstone 1-bedroom Beautiful, sunl..."
1,2.236346,-0.328183,1,1,0,0.42424,1,-0.478216,2.558629,-0.748493,...,0,0,0,0,0,1,0,0,1,Superb 3BR Apt Located Near Times Square Enjoy...
2,-0.486712,-0.328183,1,1,1,0.42424,0,1.716046,-0.223636,1.700277,...,0,0,1,0,0,1,0,0,1,Large East Village Bedroom To Let! This is a b...
3,-0.486712,0.949421,1,1,1,0.42424,0,0.994249,-0.223636,0.554378,...,0,0,1,0,0,1,0,1,0,THE LIBRARY LOUNGE Cozy room in my big private...
4,1.691735,-0.328183,1,1,1,0.42424,0,-0.247241,2.558629,-0.485564,...,0,0,0,0,0,1,0,1,2,JFK LUXURIOUS APARTMENT My place is close to J...


In [None]:
df.columns

Index(['accommodates', 'bathrooms', 'cleaning_fee', 'host_has_profile_pic',
       'host_identity_verified', 'host_response_rate', 'instant_bookable',
       'number_of_reviews', 'bedrooms', 'days_between_reviews',
       'price_per_capacity', 'host_days_active', 'distance_to_times_square',
       'room_capacity', 'price_bucket', 'wireless_internet', 'kitchen',
       'heating', 'essentials', 'air_conditioning', 'smoke_detector', 'tv',
       'hangers', 'carbon_monoxide_detector', 'shampoo',
       'property_type_Condominium', 'property_type_House',
       'property_type_Loft', 'property_type_Other', 'property_type_Townhouse',
       'room_type_Private room', 'room_type_Shared room',
       'cancellation_policy_moderate', 'cancellation_policy_strict',
       'neighbourhood_group_Low Frequency',
       'neighbourhood_group_Medium Frequency', 'rating_bucket', 'text'],
      dtype='object')

In [None]:
from os import rename  # NOTE(review): looks unused in the visible notebook - confirm before removing

# price_bucket plays the role of the treatment from here on, so give it that name.
rename_dict = dict(price_bucket='treatment')
df = df.rename(rename_dict, axis='columns')

In [None]:
# Every column other than the treatment and the outcome, in its current order.
columns = list(filter(lambda name: name not in ('treatment', 'rating_bucket'), df.columns))

# Target layout: data fields first, then 'treatment', with 'rating_bucket' last.
new_column_order = columns + ['treatment', 'rating_bucket']

# Re-select the columns in that order.
df = df.loc[:, new_column_order]

In [None]:
# Remove price_per_capacity from the feature table.
# Rebinding with errors='ignore' (instead of inplace=True) keeps this cell safe to
# re-run: a second execution is a no-op rather than a KeyError on the missing column.
df = df.drop(columns=['price_per_capacity'], errors='ignore')

# Build Counterfactual Estimation Models

## Part 1: Create Treatment 0 Model (Low Price)

In [None]:
# Rows that received treatment 0 (the low-price bucket).
treatment0 = df.loc[df['treatment'].eq(0)]

In [None]:
# 'treatment' is constant (0) within this subset, so it is removed from the frame.
treatment0 = treatment0.drop('treatment', axis=1)

## Tabular Data Development

In [None]:
# Features: everything except the target column (the 'text' column is still included here).
X = treatment0.drop(columns=['rating_bucket'])

# Target: the rating bucket of each treatment-0 row.
y_treatment0 = treatment0.loc[:, 'rating_bucket']

In [None]:
# Tabular-only baseline: all feature columns except the free-text column.
X_tabular = X.drop('text', axis=1)

# Three-class XGBoost classifier.
xgb_tabular = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=5,
    eta=0.1,
    num_class=3,
    objective='multi:softmax',
)
xgb_tabular.fit(X_tabular, y_treatment0)

# Predictions are made on the same rows the model was fitted on,
# so every metric printed below is an in-sample (training) figure.
y_pred_tabular = xgb_tabular.predict(X_tabular)
accuracy_tabular = accuracy_score(y_true=y_treatment0, y_pred=y_pred_tabular)
print(f"Accuracy (Tabular Only): {accuracy_tabular}")

# Per-class precision / recall / F1 table.
print("\nClassification Report:")
print(classification_report(y_true=y_treatment0, y_pred=y_pred_tabular))

# Macro averages: each of the three classes counts equally.
precision_tabular = precision_score(y_true=y_treatment0, y_pred=y_pred_tabular, average='macro')
recall_tabular = recall_score(y_true=y_treatment0, y_pred=y_pred_tabular, average='macro')
print(f"Precision (Tabular Only): {precision_tabular}")
print(f"Recall (Tabular Only): {recall_tabular}")

# ROC AUC from one-hot encoded labels and the predicted class probabilities.
y_binarized = label_binarize(y_treatment0, classes=[0, 1, 2])
y_pred_proba = xgb_tabular.predict_proba(X_tabular)
roc_auc_tabular = roc_auc_score(y_true=y_binarized, y_score=y_pred_proba, multi_class='ovr')
print(f"ROC AUC (Tabular Only): {roc_auc_tabular}")

Accuracy (Tabular Only): 0.7423052763819096

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.33      0.48      1529
           1       0.75      0.91      0.82      2982
           2       0.69      0.82      0.75      1857

    accuracy                           0.74      6368
   macro avg       0.78      0.69      0.68      6368
weighted avg       0.77      0.74      0.72      6368

Precision (Tabular Only): 0.7768603968633391
Recall (Tabular Only): 0.6850958173165193
ROC AUC (Tabular Only): 0.8987661565119422


## Text Data Development

In [None]:
# Text pipeline dependencies.
import torch
from tqdm import tqdm
from transformers import BertModel, BertTokenizer

# Pretrained uncased BERT-base tokenizer and encoder.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
bert_model.to(device)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [None]:
# Function to extract BERT embeddings
def get_bert_embeddings(texts, tokenizer, bert_model):
    """Return mean-pooled last-layer embeddings for ``texts``.

    Parameters
    ----------
    texts : str or list of str
        Text(s) to embed; anything longer than 512 tokens is truncated.
    tokenizer : callable
        Tokenizer called as ``tokenizer(texts, return_tensors='pt', ...)`` and
        returning a mapping that contains at least ``input_ids``.
    bert_model : torch.nn.Module
        Encoder whose output exposes ``last_hidden_state``.

    Returns
    -------
    numpy.ndarray
        The pooled embedding(s) with singleton dimensions squeezed out: a 1-D
        vector for a single text, a 2-D array (one row per text) for a batch.

    Notes
    -----
    * The forward pass runs under ``torch.no_grad()`` so no autograd graph is
      kept alive for what is pure inference.
    * The attention mask is passed to the model and used for pooling, so padding
      positions neither get attended to nor dilute the mean when several texts
      of different lengths are embedded together. For a single text the mask is
      all ones and the result equals a plain mean over tokens.
    """
    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512)

    # Send the inputs to whichever device the model's weights live on.
    model_device = next(bert_model.parameters()).device
    inputs = {key: val.to(model_device) for key, val in inputs.items()}

    bert_model.eval()
    with torch.no_grad():
        outputs = bert_model(**inputs)

    hidden = outputs.last_hidden_state  # (batch, tokens, hidden)

    # Fall back to "every position is real" if the tokenizer returned no mask.
    mask = inputs.get('attention_mask')
    if mask is None:
        mask = torch.ones(hidden.shape[:2], device=hidden.device)
    mask = mask.unsqueeze(-1).to(hidden.dtype)

    # Masked mean over the token axis; clamp guards against an all-padding row.
    pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
    return pooled.detach().cpu().numpy().squeeze()



In [None]:
def create_embeddings(df):
    """
    Embed every value of df["text"] with BERT, one row at a time.

    Uses the notebook-level `tokenizer` and `bert_model` through
    get_bert_embeddings.

    Parameters::
        df: DataFrame with a column named "text"

    Returns::
        emb_df: DataFrame indexed like df, with one column per embedding dimension
                named emb_0, emb_1, ... (768 columns for bert-base-uncased); each row
                contains the embeddings for the text in the corresponding row of df.
                An empty df yields an empty frame with the same column layout.
    """
    # One forward pass per row; tqdm shows progress because this is the slow part.
    embeddings = [
        get_bert_embeddings(texts=text, tokenizer=tokenizer, bert_model=bert_model)
        for text in tqdm(df['text'], total=df.shape[0])
    ]

    if embeddings:
        matrix = np.array(embeddings)
    else:
        # Nothing to embed: build a 0-row matrix of the right width instead of
        # letting the DataFrame constructor fail on an empty array.
        width = getattr(getattr(bert_model, 'config', None), 'hidden_size', 768)
        matrix = np.empty((0, width), dtype=np.float32)

    # Column count follows the actual embedding width rather than a fixed 768.
    emb_df = pd.DataFrame(
        matrix,
        index=df.index,
        columns=[f"emb_{i}" for i in range(matrix.shape[1])],
    )

    return emb_df

In [None]:
# BERT embeddings for the text of every treatment-0 row.
X_text_bert = create_embeddings(X)

# Text-only model: the same XGBoost configuration, fed the embeddings alone.
xgb_text = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=5,
    eta=0.1,
    num_class=3,
    objective='multi:softmax',
)
xgb_text.fit(X_text_bert, y_treatment0)

# Scored on the rows used for fitting, so this is an in-sample accuracy.
y_pred_text = xgb_text.predict(X_text_bert)
accuracy_text = accuracy_score(y_true=y_treatment0, y_pred=y_pred_text)
print(f"Accuracy (Text Only): {accuracy_text}")

100%|██████████| 6368/6368 [01:55<00:00, 55.11it/s]


Accuracy (Text Only): 0.9442525125628141


In [None]:
# Side-by-side feature matrix: tabular columns followed by the BERT embedding columns.
X_combined = pd.concat((X_tabular, X_text_bert), axis='columns')

# Combined model for treatment 0 (same hyperparameters as the single-source models).
xgb_combined = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=5,
    eta=0.1,
    num_class=3,
    objective='multi:softmax',
)
xgb_combined.fit(X_combined, y_treatment0)

# In-sample accuracy: predictions are made on the fitting rows.
y_pred_combined = xgb_combined.predict(X_combined)
accuracy_combined = accuracy_score(y_true=y_treatment0, y_pred=y_pred_combined)
print(f"Accuracy (Combined): {accuracy_combined}")

Accuracy (Combined): 0.9327889447236181


## Counterfactual Estimation for Treatments 1 and 2

In [None]:
# Treatment-1 rows, reduced to features only (treatment flag and observed outcome removed)
# so they can be scored by the treatment-0 model.
treatment1 = df.loc[df['treatment'] == 1].drop(columns=['treatment', 'rating_bucket'])

In [None]:
treatment1_bert = create_embeddings(treatment1)

100%|██████████| 6910/6910 [02:04<00:00, 55.38it/s]


In [None]:
# The raw text is represented by its embeddings, so drop the string column...
treatment1 = treatment1.drop('text', axis=1)
# ...and place the tabular columns next to the embedding columns.
X_combined_1 = pd.concat((treatment1, treatment1_bert), axis='columns')

In [None]:
treatment1_counterfactual0 = xgb_combined.predict(X_combined_1)

In [None]:
treatment1_counterfactual0

array([2, 1, 1, ..., 2, 1, 1], dtype=int32)

In [None]:
# Tally how often each rating bucket was predicted.
unique, counts = np.unique(treatment1_counterfactual0, return_counts=True)

# {predicted bucket: number of rows}
result = {label: count for label, count in zip(unique, counts)}
print(result)

{0: 209, 1: 4650, 2: 2051}


In [None]:
# Treatment-2 rows, reduced to features only (treatment flag and observed outcome removed)
# so they can be scored by the treatment-0 model.
treatment2 = df.loc[df['treatment'] == 2].drop(columns=['treatment', 'rating_bucket'])

In [None]:
#Get textual embeddings
treatment2_bert = create_embeddings(treatment2)

100%|██████████| 5553/5553 [01:41<00:00, 54.67it/s]


In [None]:
# The raw text is represented by its embeddings, so drop the string column...
treatment2 = treatment2.drop('text', axis=1)
# ...and place the tabular columns next to the embedding columns.
X_combined_2 = pd.concat((treatment2, treatment2_bert), axis='columns')

In [None]:
treatment2_counterfactual0 = xgb_combined.predict(X_combined_2)

In [None]:
treatment2_counterfactual0

array([1, 2, 2, ..., 1, 1, 1], dtype=int32)

In [None]:
# Tally how often each rating bucket was predicted.
unique, counts = np.unique(treatment2_counterfactual0, return_counts=True)

# {predicted bucket: number of rows}
result = {label: count for label, count in zip(unique, counts)}
print(result)

{0: 164, 1: 3899, 2: 1490}


## Part 2: Create Treatment 1 Model (Medium Price)

In [None]:
# Training rows for the treatment-1 model: everything that received treatment 1,
# taken fresh from df with all columns intact.
treatment1 = df.loc[df['treatment'].eq(1)]

In [None]:
treatment1["rating_bucket"].value_counts()

Unnamed: 0_level_0,count
rating_bucket,Unnamed: 1_level_1
1,3711
2,1965
0,1234


In [None]:
# 'treatment' is constant (1) within this subset, so it is removed from the frame.
treatment1 = treatment1.drop('treatment', axis=1)

In [None]:
from sklearn.model_selection import train_test_split

# Features: everything except the target column (the 'text' column is still included here).
X_treatment1 = treatment1.drop(columns=['rating_bucket'])

# Target: the rating bucket of each treatment-1 row.
y_treatment1 = treatment1.loc[:, 'rating_bucket']

In [None]:
# Tabular-only baseline: all feature columns except the free-text column.
X_treatment1_tabular = X_treatment1.drop('text', axis=1)

# Three-class XGBoost classifier.
xgb_treatment1_tabular = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=5,
    eta=0.1,
    num_class=3,
    objective='multi:softmax',
)
xgb_treatment1_tabular.fit(X_treatment1_tabular, y_treatment1)

# Predictions on the fitting rows: all metrics below are training metrics.
y_pred_tabular_treatment1 = xgb_treatment1_tabular.predict(X_treatment1_tabular)
accuracy_tabular = accuracy_score(y_true=y_treatment1, y_pred=y_pred_tabular_treatment1)
print(f"Training Accuracy (Tabular Only): {accuracy_tabular}")

# Per-class precision / recall / F1 table.
print("\nClassification Report:")
print(classification_report(y_true=y_treatment1, y_pred=y_pred_tabular_treatment1))

# Macro averages: each of the three classes counts equally.
precision_tabular = precision_score(y_true=y_treatment1, y_pred=y_pred_tabular_treatment1, average='macro')
recall_tabular = recall_score(y_true=y_treatment1, y_pred=y_pred_tabular_treatment1, average='macro')
print(f"Precision (Tabular Only): {precision_tabular}")
print(f"Recall (Tabular Only): {recall_tabular}")

# ROC AUC from one-hot encoded labels and the predicted class probabilities.
y_test_binarized = label_binarize(y_treatment1, classes=[0, 1, 2])
y_pred_proba = xgb_treatment1_tabular.predict_proba(X_treatment1_tabular)
roc_auc_tabular = roc_auc_score(y_true=y_test_binarized, y_score=y_pred_proba, multi_class='ovr')
print(f"ROC AUC (Tabular Only): {roc_auc_tabular}")

Training Accuracy (Tabular Only): 0.7536903039073806

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.20      0.33      1234
           1       0.77      0.92      0.84      3711
           2       0.70      0.78      0.74      1965

    accuracy                           0.75      6910
   macro avg       0.80      0.64      0.64      6910
weighted avg       0.78      0.75      0.72      6910

Precision (Tabular Only): 0.7979311290861187
Recall (Tabular Only): 0.6354061975187513
ROC AUC (Tabular Only): 0.9000979680020874


In [None]:
# BERT embeddings for the text of every treatment-1 row.
X_treatment1_text_bert = create_embeddings(X_treatment1)

# Text-only model: the same XGBoost configuration, fed the embeddings alone.
xgb_treatment1_text = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=5,
    eta=0.1,
    num_class=3,
    objective='multi:softmax',
)
xgb_treatment1_text.fit(X_treatment1_text_bert, y_treatment1)

# Scored on the rows used for fitting, so this is an in-sample accuracy.
y_pred_treatment1_text = xgb_treatment1_text.predict(X_treatment1_text_bert)
accuracy_text = accuracy_score(y_true=y_treatment1, y_pred=y_pred_treatment1_text)
print(f"Accuracy (Text Only): {accuracy_text}")

100%|██████████| 6910/6910 [02:06<00:00, 54.76it/s]


Accuracy (Text Only): 0.9030390738060782


In [None]:
# Side-by-side feature matrix: tabular columns followed by the BERT embedding columns.
X_treatment1_combined = pd.concat((X_treatment1_tabular, X_treatment1_text_bert), axis='columns')

# Combined model for treatment 1.
# NOTE: this rebinds xgb_combined, replacing the treatment-0 model fitted earlier.
xgb_combined = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=5,
    eta=0.1,
    num_class=3,
    objective='multi:softmax',
)
xgb_combined.fit(X_treatment1_combined, y_treatment1)

# In-sample accuracy: predictions are made on the fitting rows.
y_pred_combined = xgb_combined.predict(X_treatment1_combined)
accuracy_combined = accuracy_score(y_true=y_treatment1, y_pred=y_pred_combined)
print(f"Accuracy (Combined): {accuracy_combined}")

Accuracy (Combined): 0.9219971056439942


In [None]:
# Treatment-0 rows, reduced to features only (treatment flag and observed outcome removed)
# so they can be scored by the treatment-1 model.
treatment0 = df.loc[df['treatment'] == 0].drop(columns=['treatment', 'rating_bucket'])

In [None]:
#Extract text embeddings
treatment0_bert = create_embeddings(treatment0)

100%|██████████| 6368/6368 [01:53<00:00, 56.15it/s]


In [None]:
# Swap the raw text for its embeddings: drop the string column, then put the
# tabular columns next to the embedding columns.
treatment0 = treatment0.drop('text', axis=1)
X_combined_0 = pd.concat((treatment0, treatment0_bert), axis='columns')

In [None]:
treatment0_counterfactual1 = xgb_combined.predict(X_combined_0)

In [None]:
# Distribution of predicted rating buckets, as {bucket: number of rows}.
unique, counts = np.unique(treatment0_counterfactual1, return_counts=True)
result = {label: count for label, count in zip(unique, counts)}
print(result)

{0: 118, 1: 3844, 2: 2406}


In [None]:
# Treatment-2 rows, reduced to features only (treatment flag and observed outcome removed)
# so they can be scored by the treatment-1 model.
treatment2 = df.loc[df['treatment'] == 2].drop(columns=['treatment', 'rating_bucket'])

In [None]:
treatment2_bert = create_embeddings(treatment2)

100%|██████████| 5553/5553 [01:41<00:00, 54.45it/s]


In [None]:
# Swap the raw text for its embeddings: drop the string column, then put the
# tabular columns next to the embedding columns.
treatment2 = treatment2.drop('text', axis=1)
X_combined_2 = pd.concat((treatment2, treatment2_bert), axis='columns')

In [None]:
treatment2_counterfactual1 = xgb_combined.predict(X_combined_2)

In [None]:
# Distribution of predicted rating buckets, as {bucket: number of rows}.
unique, counts = np.unique(treatment2_counterfactual1, return_counts=True)
result = {label: count for label, count in zip(unique, counts)}
print(result)

{0: 73, 1: 3718, 2: 1762}


## Part 3: Create Treatment 2 Model (High Price)

In [None]:
# Training rows for the treatment-2 model: everything that received treatment 2,
# taken fresh from df with all columns intact.
treatment2 = df.loc[df['treatment'].eq(2)]

In [None]:
#Build treatment2 model
treatment2["rating_bucket"].value_counts()

Unnamed: 0_level_0,count
rating_bucket,Unnamed: 1_level_1
1,3023
2,1621
0,909


In [None]:
# 'treatment' is constant (2) within this subset, so it is removed from the frame.
treatment2 = treatment2.drop('treatment', axis=1)

In [None]:
# Features: everything except the target column (the 'text' column is still included here).
X_treatment2 = treatment2.drop(columns=['rating_bucket'])

# Target: the rating bucket of each treatment-2 row.
y_treatment2 = treatment2.loc[:, 'rating_bucket']

In [None]:
#Prepare data: tabular features only (free-text column removed)
X_treatment2_tabular = X_treatment2.drop(columns=['text'])

#Initialize the XGBoost classifier
xgb_treatment2_tabular = xgb.XGBClassifier(objective='multi:softmax', num_class=3, eta=0.1, max_depth=5, n_estimators=100)

#Fit the model
xgb_treatment2_tabular.fit(X_treatment2_tabular, y_treatment2)

#Make predictions on the fitting rows (all metrics below are training metrics)
y_pred_tabular_treatment2 = xgb_treatment2_tabular.predict(X_treatment2_tabular)
accuracy_tabular = accuracy_score(y_treatment2, y_pred_tabular_treatment2)
print(f"Training Accuracy (Tabular Only): {accuracy_tabular}")

#Precision, Recall, and Classification Report
print("\nClassification Report:")
print(classification_report(y_treatment2, y_pred_tabular_treatment2))

#Compute Precision and Recall (macro-averaged)
precision_tabular = precision_score(y_treatment2, y_pred_tabular_treatment2, average='macro')
recall_tabular = recall_score(y_treatment2, y_pred_tabular_treatment2,  average='macro')
print(f"Precision (Tabular Only): {precision_tabular}")
print(f"Recall (Tabular Only): {recall_tabular}")

#Compute ROC AUC (requires one-hot encoded labels and class probabilities)
y_test_binarized = label_binarize(y_treatment2, classes=[0, 1, 2])
# The probabilities must come from the treatment-2 model fitted in this cell; the
# previous code scored these rows with xgb_treatment1_tabular, so the reported
# ROC AUC belonged to a different model than the other metrics.
y_pred_proba = xgb_treatment2_tabular.predict_proba(X_treatment2_tabular)
roc_auc_tabular = roc_auc_score(y_test_binarized, y_pred_proba, multi_class='ovr')
print(f"ROC AUC (Tabular Only): {roc_auc_tabular}")

Training Accuracy (Tabular Only): 0.7842607599495768

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.29      0.45       909
           1       0.79      0.93      0.85      3023
           2       0.74      0.79      0.77      1621

    accuracy                           0.78      5553
   macro avg       0.83      0.67      0.69      5553
weighted avg       0.80      0.78      0.76      5553

Precision (Tabular Only): 0.8305881988586415
Recall (Tabular Only): 0.6711591420618116
ROC AUC (Tabular Only): 0.785181105972255


In [None]:
# BERT embeddings for the text of every treatment-2 row.
X_treatment2_text_bert = create_embeddings(X_treatment2)

# Text-only model: the same XGBoost configuration, fed the embeddings alone.
xgb_treatment2_text = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=5,
    eta=0.1,
    num_class=3,
    objective='multi:softmax',
)
xgb_treatment2_text.fit(X_treatment2_text_bert, y_treatment2)

# Scored on the rows used for fitting, so this is an in-sample accuracy.
y_pred_treatment2_text = xgb_treatment2_text.predict(X_treatment2_text_bert)
accuracy_text = accuracy_score(y_true=y_treatment2, y_pred=y_pred_treatment2_text)
print(f"Accuracy (Text Only): {accuracy_text}")

100%|██████████| 5553/5553 [01:43<00:00, 53.81it/s]


Accuracy (Text Only): 0.9481361426256077


In [None]:
# Side-by-side feature matrix: tabular columns followed by the BERT embedding columns.
X_treatment2_combined = pd.concat((X_treatment2_tabular, X_treatment2_text_bert), axis='columns')

# Combined model for treatment 2.
# NOTE: this rebinds xgb_combined, replacing the treatment-1 model fitted earlier.
xgb_combined = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=5,
    eta=0.1,
    num_class=3,
    objective='multi:softmax',
)
xgb_combined.fit(X_treatment2_combined, y_treatment2)

# In-sample accuracy: predictions are made on the fitting rows.
y_pred_combined = xgb_combined.predict(X_treatment2_combined)
accuracy_combined = accuracy_score(y_true=y_treatment2, y_pred=y_pred_combined)
print(f"Accuracy (Combined): {accuracy_combined}")

Accuracy (Combined): 0.9609220241311003


In [None]:
# Treatment-0 rows, reduced to features only (treatment flag and observed outcome removed)
# so they can be scored by the treatment-2 model.
treatment0 = df.loc[df['treatment'] == 0].drop(columns=['treatment', 'rating_bucket'])

In [None]:
#Extract text embeddings
treatment0_bert = create_embeddings(treatment0)

100%|██████████| 6368/6368 [01:53<00:00, 56.03it/s]


In [None]:
# Swap the raw text for its embeddings: drop the string column, then put the
# tabular columns next to the embedding columns.
treatment0 = treatment0.drop('text', axis=1)
X_combined_0 = pd.concat((treatment0, treatment0_bert), axis='columns')

In [None]:
treatment0_counterfactual2 = xgb_combined.predict(X_combined_0)

In [None]:
# Distribution of predicted rating buckets, as {bucket: number of rows}.
unique, counts = np.unique(treatment0_counterfactual2, return_counts=True)
result = {label: count for label, count in zip(unique, counts)}
print(result)

{0: 64, 1: 3751, 2: 2553}


In [None]:
# Treatment-1 rows, reduced to features only (treatment flag and observed outcome removed)
# so they can be scored by the treatment-2 model.
treatment1 = df.loc[df['treatment'] == 1].drop(columns=['treatment', 'rating_bucket'])

In [None]:
#Extract text embeddings
treatment1_bert = create_embeddings(treatment1)

100%|██████████| 6910/6910 [02:05<00:00, 55.00it/s]


In [None]:
# Swap the raw text for its embeddings: drop the string column, then put the
# tabular columns next to the embedding columns.
treatment1 = treatment1.drop('text', axis=1)
X_combined_1 = pd.concat((treatment1, treatment1_bert), axis='columns')

In [None]:
treatment1_counterfactual2 = xgb_combined.predict(X_combined_1)

# Create Matrix with all the counterfactual estimates

In [None]:
# Observed outcome for the treatment-0 rows, kept as a one-column DataFrame
# so that the counterfactual columns can be attached next to it.
treatment0_row_rewards = df.loc[df['treatment'] == 0, ['rating_bucket']]
treatment0_row_rewards

Unnamed: 0,rating_bucket
3,0
6,2
7,1
13,0
14,1
...,...
18817,2
18818,2
18825,1
18827,0


In [None]:
# Treatment-1 model predictions for the treatment-0 rows, as a plain ndarray so
# they are attached by position rather than aligned on the index.
treatment0_counterfactual1 = np.array(treatment0_counterfactual1)

# Work on an independent copy rather than on a slice of df.
treatment0_row_rewards = treatment0_row_rewards.copy()

# Append the estimates as the right-most column, named after the treatment they describe.
treatment0_row_rewards.insert(treatment0_row_rewards.shape[1], 'treatment1', treatment0_counterfactual1)

In [None]:
# Treatment-2 model predictions for the treatment-0 rows, as a plain ndarray so
# they are attached by position rather than aligned on the index.
treatment0_counterfactual2 = np.array(treatment0_counterfactual2)

# Work on an independent copy rather than on a slice of df.
treatment0_row_rewards = treatment0_row_rewards.copy()

# Append the estimates as the right-most column, named after the treatment they describe.
treatment0_row_rewards.insert(treatment0_row_rewards.shape[1], 'treatment2', treatment0_counterfactual2)

In [None]:
# For these rows rating_bucket is the outcome observed under treatment 0,
# so relabel it to sit alongside the estimated 'treatment1' / 'treatment2' columns.
treatment0_row_rewards.rename(columns = {'rating_bucket': 'treatment0'}, inplace = True)

In [None]:
treatment0_row_rewards.head()

Unnamed: 0,treatment0,treatment1,treatment2
3,0,1,1
6,2,2,2
7,1,1,1
13,0,2,2
14,1,1,1


In [None]:
# Observed outcome for the treatment-1 rows, kept as a one-column DataFrame
# so that the counterfactual columns can be attached next to it.
treatment1_row_rewards = df.loc[df['treatment'] == 1, ['rating_bucket']]
treatment1_row_rewards

Unnamed: 0,rating_bucket
0,2
2,1
4,2
5,2
8,2
...,...
18816,1
18819,2
18823,2
18826,2


In [None]:
# Treatment-0 model predictions for the treatment-1 rows, as a plain ndarray so
# they are attached by position rather than aligned on the index.
treatment1_counterfactual0 = np.array(treatment1_counterfactual0)

# Work on an independent copy rather than on a slice of df.
treatment1_row_rewards = treatment1_row_rewards.copy()

# Append the estimates as the right-most column, named after the treatment they describe.
treatment1_row_rewards.insert(treatment1_row_rewards.shape[1], 'treatment0', treatment1_counterfactual0)

In [None]:
# Treatment-2 model predictions for the treatment-1 rows, as a plain ndarray so
# they are attached by position rather than aligned on the index.
treatment1_counterfactual2 = np.array(treatment1_counterfactual2)

# Work on an independent copy rather than on a slice of df.
treatment1_row_rewards = treatment1_row_rewards.copy()

# Append the estimates as the right-most column, named after the treatment they describe.
treatment1_row_rewards.insert(treatment1_row_rewards.shape[1], 'treatment2', treatment1_counterfactual2)

In [None]:
# For these rows rating_bucket is the outcome observed under treatment 1,
# so relabel it to sit alongside the estimated 'treatment0' / 'treatment2' columns.
treatment1_row_rewards.rename(columns = {'rating_bucket': 'treatment1'}, inplace = True)

In [None]:
# Put 'treatment0' first and leave the remaining columns in their existing order,
# so the layout matches the other reward frames.
column_to_move = 'treatment0'
cols = [column_to_move, *(name for name in treatment1_row_rewards.columns if name != column_to_move)]
treatment1_row_rewards = treatment1_row_rewards.loc[:, cols]

In [None]:
# Observed outcome for the treatment-2 rows, kept as a one-column DataFrame
# so that the counterfactual columns can be attached next to it.
treatment2_row_rewards = df.loc[df['treatment'] == 2, ['rating_bucket']]
treatment2_row_rewards

Unnamed: 0,rating_bucket
1,1
11,2
32,2
34,2
37,1
...,...
18820,2
18821,2
18822,1
18824,2


In [None]:
# Treatment-0 model predictions for the treatment-2 rows, as a plain ndarray so
# they are attached by position rather than aligned on the index.
treatment2_counterfactual0 = np.array(treatment2_counterfactual0)

# Work on an independent copy rather than on a slice of df.
treatment2_row_rewards = treatment2_row_rewards.copy()

# Append the estimates as the right-most column, named after the treatment they describe.
treatment2_row_rewards.insert(treatment2_row_rewards.shape[1], 'treatment0', treatment2_counterfactual0)

In [None]:
# Treatment-1 model predictions for the treatment-2 rows, as a plain ndarray so
# they are attached by position rather than aligned on the index.
treatment2_counterfactual1 = np.array(treatment2_counterfactual1)

# Work on an independent copy rather than on a slice of df.
treatment2_row_rewards = treatment2_row_rewards.copy()

# Append the estimates as the right-most column, named after the treatment they describe.
treatment2_row_rewards.insert(treatment2_row_rewards.shape[1], 'treatment1', treatment2_counterfactual1)

In [None]:
# For these rows rating_bucket is the outcome observed under treatment 2,
# so relabel it to sit alongside the estimated 'treatment0' / 'treatment1' columns.
treatment2_row_rewards.rename(columns = {'rating_bucket': 'treatment2'}, inplace = True)

In [None]:
# Arrange the columns as treatment0, treatment1, treatment2 to match the other reward frames.
column_to_move = 'treatment0'
cols = [column_to_move, 'treatment1', 'treatment2']
treatment2_row_rewards = treatment2_row_rewards.loc[:, cols]

In [None]:
treatment2_row_rewards

Unnamed: 0,treatment0,treatment1,treatment2
1,1,0,1
11,2,2,2
32,2,2,2
34,2,2,2
37,1,1,1
...,...,...,...
18820,2,2,2
18821,2,2,2
18822,1,1,1
18824,1,2,2


In [None]:
# Stack the three per-treatment reward frames row-wise; the result keeps the original
# df index labels, ordered as treatment-0 rows, then treatment-1, then treatment-2.
rewards = pd.concat(
    (treatment0_row_rewards, treatment1_row_rewards, treatment2_row_rewards),
    axis='index',
)

In [None]:
rewards

Unnamed: 0,treatment0,treatment1,treatment2
3,0,1,1
6,2,2,2
7,1,1,1
13,0,2,2
14,1,1,1
...,...,...,...
18820,2,2,2
18821,2,2,2
18822,1,1,1
18824,1,2,2


In [None]:
from google.colab import files

# Write the rewards matrix to CSV without the index column
# (rows are then identified by position only).
file_name = "rewards_matrix.csv"
rewards.to_csv(path_or_buf=file_name, index=False)

# Hand the file to the browser as a download from the Colab runtime.
files.download(file_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Prepare Data Matrix for Optimal Policy Trees

In [None]:
# Independent copy of df for the policy-tree data preparation, so the steps below
# do not modify the frame used by the models above.
df_opt = df.copy()

In [None]:
# Put the rows of df_opt in the same order as the rewards matrix (which is grouped by
# treatment), so the two tables line up row for row.
df_opt_reordered = df_opt.reindex(index=rewards.index)

In [None]:
df_opt_reordered

Unnamed: 0,accommodates,bathrooms,cleaning_fee,host_has_profile_pic,host_identity_verified,host_response_rate,instant_bookable,number_of_reviews,bedrooms,days_between_reviews,...,property_type_Townhouse,room_type_Private room,room_type_Shared room,cancellation_policy_moderate,cancellation_policy_strict,neighbourhood_group_Low Frequency,neighbourhood_group_Medium Frequency,text,treatment,rating_bucket
3,-0.486712,0.949421,1,1,1,0.424240,0,0.994249,-0.223636,0.554378,...,0,1,0,0,1,0,1,THE LIBRARY LOUNGE Cozy room in my big private...,0,0
6,-0.486712,-0.328183,1,1,1,-2.367434,0,-0.622575,-0.223636,-0.844639,...,0,1,0,0,1,0,1,"Modern Room in a Cozy, Colorful Apartment The ...",0,2
7,-0.486712,-0.328183,1,1,1,-0.421722,1,-0.160625,-0.223636,-0.681780,...,0,1,0,1,0,1,0,Intimate Isolate Oasis Guest are provided face...,0,1
13,-0.486712,-0.328183,1,1,1,-0.011027,0,-0.564831,-0.223636,-0.711212,...,0,1,0,1,0,0,0,Private room 5 min from manhattan! Welcome! Th...,0,0
14,-1.031323,-0.328183,1,1,1,-0.421722,0,-0.535959,-0.223636,-0.197127,...,0,1,0,0,1,0,0,Artist's room in the heart of chic Bed-Sty Sta...,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18820,0.602511,2.227024,0,1,1,0.424240,1,-0.564831,-0.223636,0.501400,...,0,0,0,0,1,0,1,Luxury 2 BEDS/2 BATH Midtown/Central Park Cent...,2,2
18821,-0.486712,-0.328183,1,1,1,0.424240,0,-0.593703,-0.223636,-0.697477,...,0,0,0,0,1,0,0,Best Deal! Central Park! Clean and New! NEWLY ...,2,2
18822,-0.486712,-0.328183,1,1,0,0.011560,1,-0.362728,-0.223636,-0.269727,...,0,1,0,1,0,0,0,Cosy 1 bdr in Midtown West My place is close t...,2,1
18824,0.602511,-0.328183,0,1,1,0.424240,0,-0.535959,1.167497,-0.399229,...,0,0,0,0,1,0,0,One of a Kind 19th Century Williamsburg Wonder...,2,2


In [None]:
# Text pipeline dependencies.
# NOTE(review): this cell repeats the BERT setup performed earlier in the notebook.
import torch
from tqdm import tqdm
from transformers import BertModel, BertTokenizer

# Pretrained uncased BERT-base tokenizer and encoder.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
bert_model.to(device)


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [None]:
# Function to extract BERT embeddings
def get_bert_embeddings(texts, tokenizer, bert_model):
    """Return mean-pooled last-layer embeddings for ``texts``.

    Parameters
    ----------
    texts : str or list of str
        Text(s) to embed; anything longer than 512 tokens is truncated.
    tokenizer : callable
        Tokenizer called as ``tokenizer(texts, return_tensors='pt', ...)`` and
        returning a mapping that contains at least ``input_ids``.
    bert_model : torch.nn.Module
        Encoder whose output exposes ``last_hidden_state``.

    Returns
    -------
    numpy.ndarray
        The pooled embedding(s) with singleton dimensions squeezed out: a 1-D
        vector for a single text, a 2-D array (one row per text) for a batch.

    Notes
    -----
    * The forward pass runs under ``torch.no_grad()`` so no autograd graph is
      kept alive for what is pure inference.
    * The attention mask is passed to the model and used for pooling, so padding
      positions neither get attended to nor dilute the mean when several texts
      of different lengths are embedded together. For a single text the mask is
      all ones and the result equals a plain mean over tokens.
    """
    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512)

    # Send the inputs to whichever device the model's weights live on.
    model_device = next(bert_model.parameters()).device
    inputs = {key: val.to(model_device) for key, val in inputs.items()}

    bert_model.eval()
    with torch.no_grad():
        outputs = bert_model(**inputs)

    hidden = outputs.last_hidden_state  # (batch, tokens, hidden)

    # Fall back to "every position is real" if the tokenizer returned no mask.
    mask = inputs.get('attention_mask')
    if mask is None:
        mask = torch.ones(hidden.shape[:2], device=hidden.device)
    mask = mask.unsqueeze(-1).to(hidden.dtype)

    # Masked mean over the token axis; clamp guards against an all-padding row.
    pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
    return pooled.detach().cpu().numpy().squeeze()


In [None]:
def create_embeddings(df):
    """
    Embed every value of df["text"] with BERT, one row at a time.

    Uses the notebook-level `tokenizer` and `bert_model` through
    get_bert_embeddings.

    Parameters::
        df: DataFrame with a column named "text"

    Returns::
        emb_df: DataFrame indexed like df, with one column per embedding dimension
                named emb_0, emb_1, ... (768 columns for bert-base-uncased); each row
                contains the embeddings for the text in the corresponding row of df.
                An empty df yields an empty frame with the same column layout.
    """
    # One forward pass per row; tqdm shows progress because this is the slow part.
    embeddings = [
        get_bert_embeddings(texts=text, tokenizer=tokenizer, bert_model=bert_model)
        for text in tqdm(df['text'], total=df.shape[0])
    ]

    if embeddings:
        matrix = np.array(embeddings)
    else:
        # Nothing to embed: build a 0-row matrix of the right width instead of
        # letting the DataFrame constructor fail on an empty array.
        width = getattr(getattr(bert_model, 'config', None), 'hidden_size', 768)
        matrix = np.empty((0, width), dtype=np.float32)

    # Column count follows the actual embedding width rather than a fixed 768.
    emb_df = pd.DataFrame(
        matrix,
        index=df.index,
        columns=[f"emb_{i}" for i in range(matrix.shape[1])],
    )

    return emb_df

In [None]:
# BERT embeddings for the text of every row, in the rewards-aligned row order.
X_text_final = create_embeddings(df=df_opt_reordered)

100%|██████████| 18831/18831 [05:49<00:00, 53.95it/s]


In [None]:
# Full data matrix: the reordered table (features, text, treatment, outcome)
# followed by the embedding columns.
X_final = pd.concat((df_opt_reordered, X_text_final), axis='columns')

In [None]:
from google.colab import files

# Write the full data matrix to CSV without the index column
# (rows are then identified by position only).
file_name = "X_final.csv"
X_final.to_csv(path_or_buf=file_name, index=False)

# Hand the file to the browser as a download from the Colab runtime.
files.download(file_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Outcome column on one side, every other column of X_final on the other.
X = X_final.drop('rating_bucket', axis=1)
y = X_final.loc[:, 'rating_bucket']

# Reproducible 80/20 split (fixed random_state).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

# Re-attach the outcome as the last column of each partition.
train_set = pd.concat((X_train, y_train), axis='columns')
test_set = pd.concat((X_test, y_test), axis='columns')

In [None]:
# Index labels of the rows that landed in each partition.
train_indices = train_set.index
test_indices = test_set.index

# Select the matching reward rows, in the same order as the partitions.
rewards_train = rewards.loc[train_indices, :]
rewards_test = rewards.loc[test_indices, :]

In [None]:
from google.colab import files

# Write the training rewards to CSV without the index column
# (rows are then identified by position only).
file_name = "rewards_train.csv"
rewards_train.to_csv(path_or_buf=file_name, index=False)

# Hand the file to the browser as a download from the Colab runtime.
files.download(file_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from google.colab import files

# Write the test rewards to CSV without the index column
# (rows are then identified by position only).
file_name = "rewards_test.csv"
rewards_test.to_csv(path_or_buf=file_name, index=False)

# Hand the file to the browser as a download from the Colab runtime.
files.download(file_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from google.colab import files

# Write the training partition to CSV without the index column
# (rows are then identified by position only).
file_name = "train_set.csv"
train_set.to_csv(path_or_buf=file_name, index=False)

# Hand the file to the browser as a download from the Colab runtime.
files.download(file_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from google.colab import files

# Write the test partition to CSV without the index column
# (rows are then identified by position only).
file_name = "test_set.csv"
test_set.to_csv(path_or_buf=file_name, index=False)

# Hand the file to the browser as a download from the Colab runtime.
files.download(file_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>