# Feature Generation - Airbnb Reviews

In [None]:
import re
from collections import Counter, defaultdict
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import spacy
from spacy.language import Language
from spacy_langdetect import LanguageDetector

%matplotlib inline
plt.rcParams.update({"font.size": 13, "figure.figsize": [15,6]})
pd.set_option("display.max_colwidth", None)

## 1. Read Dataset

In [None]:
# Load the raw reviews file into a DataFrame using pandas' default CSV settings.
filename = "reviews.csv"
df = pd.read_csv(filename)

In [None]:
# Preview the first rows of the raw dataset
df.head()

## 2. Exploratory Data Analysis

In [None]:
# Print column dtypes, non-null counts and memory usage
df.info()

From dataset information, we can see that:

- The shape of the dataset is (410291, 6).
- Attribute `comments` has missing values.

In [None]:
# Check number of missing comments
# Build the missing-value mask once, print how many rows are / are not
# missing, then display a sample of the rows without a comment.
missing_mask = df["comments"].isna()
print(missing_mask.value_counts())
df[missing_mask].head()

In [None]:
# Check for noisy comments
# A comment consisting of a single character is treated as noise. Tally each
# distinct one-character comment, then add the tallies up to get the total
# number of noisy rows.
single_char_mask = df["comments"].str.len() == 1
garbage_comments_count = df.loc[single_char_mask, "comments"].value_counts()
sum(garbage_comments_count)

In [None]:
# Check review distribution over time
# Convert the `date` column with pd.to_datetime, count the reviews per date and
# order the counts chronologically so the line plot runs left-to-right in time.
# (The former stand-alone `df["date"].sort_values()` call was removed: its
# result was discarded, so it had no effect on the plot or on `df`.)
reviews_per_date = pd.to_datetime(df["date"]).value_counts().sort_index()
reviews_per_date.plot(title="Review count over years")
plt.draw()

Airbnb was founded in August 2008, so the number of reviews was very low during the first few years. It then increased gradually until the Covid pandemic began. This figure was created before removing the missing/garbage reviews, so it might change slightly once those reviews are removed, but the overall trend is expected to remain the same.

## 3. Data Preprocessing

In [None]:
# Remove missing comments
# Drops, in place, every row whose `comments` value is missing; the trailing
# expression displays the resulting (rows, columns) shape.
df.dropna(subset=["comments"], axis=0, inplace=True)
df.shape

In [None]:
# Remove noisy comments
# Collect the index labels of all one-character comments and drop those rows
# from the DataFrame in place.
single_char_mask = df["comments"].str.len() == 1
garbage_comments_idx = df.index[single_char_mask]
df.drop(index=garbage_comments_idx, inplace=True)

In [None]:
# Final dataframe shape
# (rows, columns) of `df` at this point in the notebook
df.shape

In [None]:
# Persist the cleaned reviews. A semicolon is used as the field separator and
# the DataFrame index is not written to the file.
filename_pre = "reviews_preprocessed.csv"
df.to_csv(filename_pre, sep=";", index=False)

## 4. Feature Generation

In [None]:
# Load the preprocessed dataset
# The separator must match the ";" used when the file was written.
pre_df = pd.read_csv(filename_pre, sep=";")

### 4.1 Named Entity Recognition

In [None]:
# https://spacy.io/models/en#en_core_web_lg
nlp = spacy.load("en_core_web_lg")


@Language.factory("language_detector")
def get_lang_detector(nlp, name):
    """Factory callback: build a fresh LanguageDetector component.

    The `nlp` and `name` arguments are part of the factory signature and are
    not used by this implementation.
    """
    return LanguageDetector()


# Append the detector as the last stage of the pipeline.
nlp.add_pipe("language_detector", last=True)

In [None]:
# Entity labels that become one count column each, in output column order.
new_columns = [
    "CARDINAL",
    "DATE",
    "EMAIL",
    "EVENT",
    "FAC",
    "GPE",
    "LANGUAGE",
    "LAW",
    "LOC",
    "MONEY",
    "NORP",
    "ORDINAL",
    "ORG",
    "PERCENT",
    "PERSON",
    "PHONE",
    "PRODUCT",
    "QUANTITY",
    "TIME",
    "WORK_OF_ART",
]

# Zero-count template keyed by label, in the same order as `new_columns`.
empty_dict = dict.fromkeys(new_columns, 0)

In [None]:
# Patterns are compiled once here instead of being looked up on every call.
EMAIL_PATTERN = re.compile(r"[\w.+-]+@[\w-]+\.[\w.-]+")
PHONE_PATTERN = re.compile(r"((?:\+\d{2}[-\.\s]??|\d{4}[-\.\s]??)?(?:\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4}))")


def get_named_entities(text):
    """Extract the language, the entities and per-label entity counts of a text.

    Emails and phone numbers are found with regular expressions and removed
    from the text; the remaining text is then run through the spaCy pipeline
    `nlp` to collect its named entities and its detected language.

    Args:
        text: The review text. A non-string value is treated as empty text.

    Returns:
        A list laid out as
        ``[language, entities, count_1, ..., count_k]`` where
        ``language`` is ``doc._.language["language"]``, ``entities`` maps each
        found label to the set of distinct matched strings, and the counts
        follow the key order of ``empty_dict`` (0 for labels that were not
        found). Every occurrence is counted, including repeated strings.
    """
    # Regexes and spaCy both need a string; a missing value (e.g. NaN) would
    # otherwise raise a TypeError in the regex search.
    if not isinstance(text, str):
        text = ""

    ents_dict = defaultdict(set)
    ents_count = Counter()

    # Emails are handled before phone numbers, and each match is stripped from
    # the text so that later steps do not see it again.
    for label, pattern in (("EMAIL", EMAIL_PATTERN), ("PHONE", PHONE_PATTERN)):
        matches = pattern.findall(text)
        if matches:
            ents_dict[label] = set(matches)
        for match in matches:
            ents_count[label] += 1
            text = text.replace(match, "")

    # Find named entities
    doc = nlp(text)
    for ent in doc.ents:
        ents_count[ent.label_] += 1
        ents_dict[ent.label_].add(ent.text)

    # Emit exactly one count per expected column, in column order. A label that
    # is not a key of `empty_dict` stays visible in `ents_dict` but can no
    # longer lengthen the row and misalign the count columns.
    counts_in_column_order = [ents_count[column] for column in empty_dict]

    return [doc._.language["language"], dict(ents_dict)] + counts_in_column_order

In [None]:
%%timeit -r 1
columns = ["lang", "ents"] + new_columns
pre_df[columns] = pre_df.apply(lambda record: get_named_entities(record["comments"]), axis=1, result_type="expand")

In [None]:
# Preview each review next to the entities extracted from it
pre_df[["comments", "ents"]].head()

In [None]:
# Filter English comments only
# Keep the rows whose detected language code is "en"; copy the selection so
# `pre_df` is an independent DataFrame rather than a slice of the old one.
is_english = pre_df["lang"].eq("en")
pre_df = pre_df.loc[is_english].copy()

In [None]:
# Save the reviews together with their language, entity and count columns,
# semicolon-separated and without the DataFrame index.
filename_ne = "reviews_named_entities.csv"
pre_df.to_csv(filename_ne, sep=";", index=False)