# Data Labeling Process - Airbnb Reviews

In [None]:
import ast
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline
# Default plot appearance: 13pt font and an 8x2 figure size
plt.rcParams.update(
    {
        "font.size": 13,
        "figure.figsize": [8, 2],
    }
)
# Remove the column-width limit so long text cells are displayed in full
pd.set_option("display.max_colwidth", None)

In [None]:
# Number of rows to read from the dataset (20% of data)
samples = 50_782

In [None]:
# Read the named-entity dataset (semicolon-separated), keeping only the
# first `samples` rows
filename_ne = "reviews_named_entities.csv"
df = pd.read_csv(
    filename_ne,
    sep=";",
    nrows=samples,
)

In [None]:
# Named-entity labels; their descriptions are looked up with spaCy later in
# the notebook
columns = [
    "CARDINAL",
    "DATE",
    "EMAIL",
    "EVENT",
    "FAC",
    "GPE",
    "LANGUAGE",
    "LAW",
    "LOC",
    "MONEY",
    "NORP",
    "ORDINAL",
    "ORG",
    "PERCENT",
    "PERSON",
    "PHONE",
    "PRODUCT",
    "QUANTITY",
    "TIME",
    "WORK_OF_ART",
]

In [None]:
def label_data(review):
    """Return 1 if the review's entities mark it as an outlier, else 0.

    Args:
        review: Row-like object indexable by ``"ents"``. That value must be
            a string containing a Python literal (parsed with
            ``ast.literal_eval``) keyed by entity label.

    Returns:
        1 when the parsed entities contain ``EMAIL`` or ``PHONE``, or when
        they contain ``PERSON`` or ``ORG`` together with at least one other
        entity label; 0 otherwise.
    """
    entities = ast.literal_eval(review["ents"])

    # Contact details flag the review on their own.
    if "EMAIL" in entities or "PHONE" in entities:
        return 1

    # A person or organisation only counts when another entity label is
    # present alongside it.
    mentions_name = "PERSON" in entities or "ORG" in entities
    if mentions_name and len(entities.keys()) > 1:
        return 1

    return 0

In [None]:
# Run label_data on every row and store its 0/1 result in a "label" column
df["label"] = df.apply(func=label_data, axis="columns")

In [None]:
# (rows, columns) of the subset labelled 1
df.loc[df["label"] == 1].shape

In [None]:
# Show the first 10 rows of review text, entities and assigned label
df.loc[:, ["comments", "ents", "label"]].head(10)

In [None]:
# Write the labelled sample as a semicolon-separated CSV without the index
filename = "reviews_sample_labelled.csv"
df.to_csv(
    filename,
    sep=";",
    index=False,
)

In [None]:
# Number of rows per label value
df.loc[:, "label"].value_counts()

In [None]:
import spacy

# Print spaCy's description for every entity label in `columns`
for column in columns:
    description = spacy.explain(column)
    print(f"{column} => {description}")