# Semantic Similarity Evaluation - Airbnb Reviews

In [None]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import spacy

%matplotlib inline
# Use a larger default font for every figure drawn in this notebook.
plt.rcParams.update({"font.size": 14})

In [None]:
# Load the reviews table; the file is read as semicolon-delimited.
filename = "reviews_anonymized.csv"
df = pd.read_csv(filename, sep=";")
# Cell output: (number of rows, number of columns) of the loaded table.
df.shape

In [None]:
# Load the spaCy pipeline used below for named-entity extraction and for
# Doc.similarity. NOTE(review): the "lg" model is presumably chosen because
# similarity needs real word vectors -- confirm.
nlp = spacy.load("en_core_web_lg")

In [None]:
# Patterns are compiled once here instead of being spelled out (and looked up)
# twice per review inside the scoring function.
_EMAIL_RE = re.compile(r"[\w.+-]+@[\w-]+\.[\w.-]+")
_PHONE_RE = re.compile(
    r"((?:\+\d{2}[-\.\s]??|\d{4}[-\.\s]??)?"
    r"(?:\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}"
    r"|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}"
    r"|\d{3}[-\.\s]??\d{4}))"
)


def _extract_entities(text):
    """Return the e-mail addresses, phone numbers and named entities in *text*.

    The result is a flat list of strings in that order: regex e-mail matches,
    regex phone matches, then the text of every entity spaCy reports.
    """
    entities = _EMAIL_RE.findall(text)
    # The phone pattern has a single capturing group spanning the whole match,
    # so findall() yields the matched strings themselves.
    entities += _PHONE_RE.findall(text)
    entities += [ent.text for ent in nlp(text).ents]
    return entities


def get_similarity(review) -> float:
    """Score how similar the entities of a review are before and after anonymization.

    Args:
        review: A row (or any mapping) providing the keys ``"suod"``,
            ``"comments"`` and ``"comments_anonymized"``.

    Returns:
        ``1.0`` when ``review["suod"]`` is falsy (the row is not scored).
        Otherwise the spaCy similarity between the space-joined entities found
        in the original comment and those found in the anonymized comment.

    NOTE(review): when a text yields no entities the joined string is empty;
    confirm that the value spaCy returns for empty documents is the one wanted.
    """
    if not bool(review["suod"]):
        return 1.0

    # str() keeps non-string cell values (e.g. missing comments) from breaking
    # the regex and spaCy calls.
    original_entities = _extract_entities(str(review["comments"]))
    anonymized_entities = _extract_entities(str(review["comments_anonymized"]))

    original_doc = nlp(" ".join(original_entities))
    anonymized_doc = nlp(" ".join(anonymized_entities))
    return original_doc.similarity(anonymized_doc)

In [None]:
# Score every row of the table; get_similarity receives each row as `review`.
df["similarity"] = df.apply(get_similarity, axis=1, result_type="expand")

In [None]:
# Write the table back as a semicolon-delimited CSV without the index column.
# NOTE: this is the same path the data was loaded from, so the input file is
# overwritten.
filename = "reviews_anonymized.csv"
df.to_csv(filename, sep=";", index=False)

In [None]:
# Independent copy of the similarity scores for the rows flagged with suod == 1.
df_outlier = df.loc[df["suod"] == 1, "similarity"].copy()

In [None]:
# Quartiles of the similarity scores; each one is drawn as a vertical marker.
Q1 = df_outlier.quantile(0.25)
Q2 = df_outlier.quantile(0.5)
Q3 = df_outlier.quantile(0.75)
quartile_markers = [
    (Q1, 'c', f'Q1: {Q1:.3f}'),
    (Q2, 'm', f'Q2: {Q2:.3f}'),
    (Q3, 'y', f'Q3: {Q3:.3f}'),
]

# Histogram of the scores with the quartile markers on top.
plt.figure(figsize=(10, 5))
df_outlier.hist(bins=100)
for position, colour, legend_text in quartile_markers:
    plt.axvline(x=position, color=colour, linewidth=3, label=legend_text)

plt.xlabel('\nSimilarity Value')
plt.ylabel('Count')
plt.title('Semantic Similarity Distribution\n')
plt.legend()

# Save before show(). NOTE(review): assumes the figures/ directory already
# exists -- confirm.
plt.savefig('figures/text_similarity.png', bbox_inches="tight")
plt.show()

In [None]:
# Share of the non-missing scores that are zero or negative. The comparison is
# inclusive (<= 0) and the result is a fraction in [0, 1], so the message says
# exactly that instead of "percentage ... less than zero".
less_than_zero = (df_outlier <= 0).sum() / df_outlier.count()
print(f"Fraction of comments with similarity score <= 0: {less_than_zero:.3f}")

In [None]:
# Probe: similarity between two different name/city strings.
nlp("David Berlin").similarity(nlp("Victor Munich"))

In [None]:
# Probe: similarity of a name/city string against placeholder text.
# NOTE(review): "XXXXX XXXXX" presumably mimics anonymized output -- confirm.
nlp("David Berlin").similarity(nlp("XXXXX XXXXX"))

In [None]:
# Probe: the same two strings with the operands swapped, presumably to check
# that the score does not depend on argument order.
nlp("XXXXX XXXXX").similarity(nlp("David Berlin"))