# Outlier Masking - Airbnb Reviews

In [None]:
import ast
import random
import re

import numpy as np
import pandas as pd
from faker import Faker

# Remove the column-width limit so long review texts are displayed in full.
pd.options.display.max_colwidth = None

In [None]:
# Read the semicolon-separated CSV that holds the reviews together with
# their extracted named entities.
filename_ne = "reviews_outlier_predtions.csv"
outlier_df = pd.read_csv(filepath_or_buffer=filename_ne, sep=";")

In [None]:
# Preview the first rows of the columns used for masking.
outlier_df.loc[:, ["comments", "ents", "suod"]].head()

## Data Masking

In [None]:
# Faker instance (en_US locale) that supplies the replacement values.
fake = Faker("en_US")

# Pool of replacement strings for ORDINAL entities.
ordinal_list = ["1st", "2nd", "3rd", "4th", "5th", "6th", "7th", "8th", "9th", "10th", "first", "second", "third"]

# Maps an entity label to a zero-argument callable that produces a fresh
# replacement value each time it is called. Callers are expected to convert
# the result to ``str`` (some generators return ints).
mask_ents = {
    'CARDINAL': lambda: fake.random_int(min=1, max=10),
    'DATE': fake.date,
    'EMAIL': fake.free_email,
    'EVENT': fake.street_name,
    'FAC': fake.street_name,
    # Choose between a city and a country on every call. Calling
    # random.choice directly in the dict literal would pick one provider
    # once, at definition time, and reuse it for every GPE entity.
    'GPE': lambda: random.choice([fake.city, fake.country])(),
    'LANGUAGE': fake.language_name,
    'LAW': fake.street_name,
    'LOC': fake.street_address,
    'MONEY': fake.pricetag,
    'NORP': fake.country,
    'ORDINAL': lambda: random.choice(ordinal_list),
    'ORG': fake.company,
    'PERCENT': lambda: f"{fake.random_int(min=1, max=100)}%",
    'PERSON': fake.first_name,
    'PHONE': fake.phone_number,
    'PRODUCT': fake.street_name,
    'QUANTITY': lambda: fake.random_int(min=1, max=100),
    'TIME': fake.time,
    'WORK_OF_ART': fake.street_name,
}

In [None]:
def mask_data(review):
    """Return the review comment with its listed entities replaced by fake values.

    Args:
        review: Row-like mapping with the keys ``"comments"`` (the review
            text), ``"ents"`` (a dict, or the string form of a dict, mapping
            an entity label to a list of entity strings) and ``"suod"``
            (flag; rows where it is falsy are returned unmasked —
            presumably the outlier prediction, confirm against the
            upstream notebook).

    Returns:
        The comment with every listed entity replaced. Each distinct entity
        string gets one replacement that is reused for all its occurrences
        within the comment. The comment is returned unchanged when the flag
        is falsy, when the comment is not a string, or when no entity
        information is available.

    Raises:
        KeyError: If an entity label has no generator in ``mask_ents``.
        ValueError, SyntaxError: If ``"ents"`` is a string that
            ``ast.literal_eval`` cannot parse.
    """
    comment = review["comments"]
    if not bool(review["suod"]):
        return comment

    # Missing cells arrive as NaN (a float) when read from CSV; there is
    # nothing to mask in that case.
    if not isinstance(comment, str):
        return comment
    raw_ents = review["ents"]
    if isinstance(raw_ents, str):
        ent_dict = ast.literal_eval(raw_ents)
    elif isinstance(raw_ents, dict):
        ent_dict = raw_ents
    else:
        return comment

    replacements = {}
    for label, ents in ent_dict.items():
        for ent in ents:
            # An empty entity would match between every character.
            if not ent:
                continue
            if ent not in replacements:
                replacements[ent] = str(mask_ents[label]())
    if not replacements:
        return comment

    # Substitute everything in a single pass so a generated fake value is
    # never rewritten by a later entity, and try longer entities first so a
    # short entity cannot break up a longer one that contains it.
    longest_first = sorted(replacements, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(ent) for ent in longest_first))
    # A function replacement keeps backslashes in fake values literal.
    return pattern.sub(lambda match: replacements[match.group(0)], comment)

In [None]:
# Run mask_data on every row and store the result in a new column.
outlier_df["comments_anonymized"] = (
    outlier_df.loc[:, ["comments", "ents", "suod"]]
    .apply(mask_data, axis="columns")
)

In [None]:
# Preview the original comments next to their anonymized versions.
outlier_df.loc[:, ["comments", "ents", "suod", "comments_anonymized"]].head()

In [None]:
# Write the frame, including the anonymized column, to a
# semicolon-separated CSV without the index column.
filename = "reviews_anonymized.csv"
outlier_df.to_csv(path_or_buf=filename, sep=";", index=False)