<div align="center">

# RIO Airbnb Project - Text Mining
**Latest Update:** *27th February 2023*

</div>

---

#### **Import required libraries & dataset**

In [1]:
import sys, os, re
sys.path.append(os.path.abspath("../"))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn import metrics



import airbnblib.cleaning as cln

In [84]:
# Gets the latest dataset CSV
newest_listing_csv = os.listdir("../dataset/listings/")[-1]
listings_df = pd.read_csv(f"../dataset/listings/{newest_listing_csv}")

try:
  listings_df = listings_df.drop(['Unnamed: 0'], axis=1)
except KeyError:
  pass

newest_listing_csv

'listings_cleaned_20230315.csv'

In [2]:
# Gets the latest zipped cleaned version of the dataset
newest_listing_zip = os.listdir("../dataset_zipped/listings/")[-1]
listings_df = pd.read_csv(f"../dataset_zipped/listings/{newest_listing_zip}")

try:
  listings_df = listings_df.drop(['Unnamed: 0'], axis=1)
except KeyError:
  pass

newest_listing_zip

'listings_cleaned_20230303.zip'

In [79]:
listings_df.head()

Unnamed: 0,id,name,description,neighborhood_overview,host_since,host_about,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,...,hair_dryer,hot_water,long_term_stays_allowed,smoke_alarm,refrigerator,kitchen,air_conditioning,tv,essentials,zip_code
0,183319.0,Panoramic Ocean View Venice Beach,Craftsmen style penthouse with ocean view from...,About as close to the beach as you can live in...,-15180,Serious yoga practice. Been studying for 25 y...,within a day,100%,100%,0.0,...,0,1,1,1,1,1,0,1,0,90291.0
1,51307.0,Spanish Bungalow Guest House LA CA. 30 plus ni...,"PRIVATE GUEST HOUSE The space Private, Guest h...","Local LA Community , shops and restaurants in...",-14867,"male , educated . Healthcare Professional\r\nC...",does not respond,,,0.0,...,0,1,1,1,1,1,1,1,1,90039.0
2,184314.0,Boho Chic Flat..Steps to Beach!,"Bright, airy, quiet 1 bdr located just steps f...",You are in one of the beach locations in Los A...,-15183,Health-Wellness Professional. Author. Passiona...,does not respond,,,0.0,...,1,0,1,1,0,1,0,1,1,90292.0
3,51498.0,Guest House With Its Own Entrance/Exit and Hot...,"Fully self-contained, separate structure, with...",We are close to Venice without the hassle of n...,-14868,Easy going hostess!! Enjoy your stay. \n\nI am...,within an hour,100%,88%,0.0,...,1,1,1,1,0,1,1,1,1,90066.0
4,109.0,Amazing bright elegant condo park front UPGRADED,"Unit upgraded with new bamboo flooring, brand...",,-14057,"Paolo Privitera, CEO Evensi\n\nPaolo, MIT MBA ...",does not respond,,,0.0,...,1,0,1,1,0,1,1,1,1,90230.0


#### **Revert text columns from string to list if necessary**

In [12]:
for col in ['name', 'description', 'neighborhood_overview', 'host_about']:
  listings_df[col] = listings_df[col].map(lambda x: x.rstrip("']").lstrip("['").split("', '"))

listings_df['name'][1]

In [None]:
listings_df = listings_df.drop(['Unnamed: 0.1', 'Unnamed: 0', 'amenities'], axis=1)

#### **Remove HTML tags & non UTF-8 characters**

In [85]:
from bs4 import BeautifulSoup
import spacy


def utf8_only(text) -> str:
  return \
    re.sub(" +", " ", 
      "".join(
        re.findall("[a-zA-Z' ]", 
        re.sub("[^a-zA-Z']", " ", 
          BeautifulSoup(text).get_text(separator=' ') if (type(text) == str) else '')))
      .lower()
    )

for col in ['name', 'description', 'neighborhood_overview', 'host_about']:
  listings_df[col] = listings_df[col].apply(utf8_only)

listings_df[['name', 'description', 'neighborhood_overview', 'host_about']].head()

  BeautifulSoup(text).get_text(separator=' ') if (type(text) == str) else '')))
  BeautifulSoup(text).get_text(separator=' ') if (type(text) == str) else '')))


Unnamed: 0,name,description,neighborhood_overview,host_about
0,panoramic ocean view venice beach,craftsmen style penthouse with ocean view from...,about as close to the beach as you can live in...,serious yoga practice been studying for years ...
1,spanish bungalow guest house la ca plus nights,private guest house the space private guest ho...,local la community shops and restaurants in wa...,male educated healthcare professional cycle ru...
2,boho chic flat steps to beach,bright airy quiet bdr located just steps from ...,you are in one of the beach locations in los a...,health wellness professional author passionate...
3,guest house with its own entrance exit and hot...,fully self contained separate structure with i...,we are close to venice without the hassle of n...,easy going hostess enjoy your stay i am an exp...
4,amazing bright elegant condo park front upgraded,unit upgraded with new bamboo flooring brand ...,,paolo privitera ceo evensi paolo mit mba phone...


In [89]:
ACRONYMS = {
  'bedroom': ('bdr', 'bdrm'),
  'beach': ['bch'],
  'penthouse': ['pnthous'],
  'hollywood': ('hollywd'),
  'apartment': (' apt')
}

def convert_acronyms(line):
  line = [] if (line == '') else line.split(' ')
  
  for word in ACRONYMS.keys():
      line = [i if (i not in ACRONYMS[word]) else (word) for i in line]
  return " ".join(line)


for col in ['name', 'description', 'neighborhood_overview', 'host_about']:
  listings_df[col] = listings_df[col].map(lambda x: convert_acronyms(x))

listings_df[['name', 'description', 'neighborhood_overview', 'host_about']].head()

Unnamed: 0,name,description,neighborhood_overview,host_about
0,panoramic ocean view venice beach,craftsmen style penthouse with ocean view from...,about as close to the beach as you can live in...,serious yoga practice been studying for years ...
1,spanish bungalow guest house la ca plus nights,private guest house the space private guest ho...,local la community shops and restaurants in wa...,male educated healthcare professional cycle ru...
2,boho chic flat steps to beach hollywood,bright airy quiet bedroom located just steps f...,you are in one of the beach locations in los a...,health wellness professional author passionate...
3,guest house with its own entrance exit and hot...,fully self contained separate structure with i...,we are close to venice without the hassle of n...,easy going hostess enjoy your stay i am an exp...
4,amazing bright elegant condo park front upgraded,hollywood unit upgraded with new bamboo floori...,,paolo privitera ceo evensi paolo mit mba phone...


In [82]:
listings_df['description'][2]

'bright airy quiet bdr located just steps from the most secluded clean beach in la super spacious high ceilings polished concrete floors sweet garden balcony bath w separate vanity ride a bike or walk to famed venice abbot kinney blvd block from the best beach in la the space perfect beach getaway steps from the best semi private beach in la spacious bedroom loft style with high ceilings polished concrete floors tons of natural sunlight and refreshing ocean breeze throughout a perfect getaway for the health conscious traveler feel at home with a fully equipped abode boho chic beach flat in a cozy garden courtyard bldg full garden balcony birds chirping and hummingbirds abound amenities in hdtv dvd appletv full cable with hbo washer dryer in bldg large kitchen full bath with separate vanity dressing room sonos stereo throughout f'

In [87]:
listings_df['neighborhood_overview'][4]

''

<h2 align='center'>Text preprocessing</h2>

#### Initialization

In [None]:
import spacy, nltk



In [90]:
text_df = listings_df[['name', 'description', 'neighborhood_overview', 'host_about']].copy()[:1000]

#### 1. Tokenizing & removing stopwords

In [49]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def remove_stopwords(entry):
  return " ".join([w for w in "".join(entry).split(" ") if not (w.lower() in stop_words or w == '')])

for col in text_df.columns:
  text_df[col] = text_df[col].apply(remove_stopwords)
text_df.head()

Unnamed: 0,name,description,neighborhood_overview,host_about
0,Panoramic Ocean View Venice Beach,Craftsmen style penthouse ocean view rooms Par...,close beach live Venice boardwalk,Serious yoga practice studying years Spend thr...
1,Spanish Bungalow Guest House LA CA plus nights,PRIVATE GUEST HOUSE space Private Guest house ...,Local LA Community shops restaurants walking d...,male educated Healthcare Professional Cycle ru...
2,Boho Chic Flat Steps Beach,Bright airy quiet bdr located steps secluded c...,one beach locations Los Angeles Peninsula Mari...,Health Wellness Professional Author Passionate...
3,Guest House Entrance Exit Hot Tub,Fully self contained separate structure entran...,close Venice without hassle parking spaces,Easy going hostess Enjoy stay experienced trav...
4,Amazing bright elegant condo park front UPGRADED,Unit upgraded new bamboo flooring brand new Ul...,,Paolo Privitera CEO Evensi Paolo MIT MBA Phone...


<h2 align="center"> spaCy </h2>

#### Lemmatizing: `spacy en_core_web_sm`

In [91]:
web_sm = spacy.load('en_core_web_sm')

stopwords = web_sm.Defaults.stop_words

def remove_stopwords(entry):
  return " ".join([w for w in "".join(entry).split(" ") if not (w.lower() in stopwords or w == '')])

def spacy_lemmatizer(text):
  return " ".join([token.lemma_ for token in web_sm(text)]).lower()

web_sm_df = text_df.copy()

for col in web_sm_df.columns:
  web_sm_df[col] = web_sm_df[col].apply(remove_stopwords).apply(spacy_lemmatizer)

web_sm_df.head()

Unnamed: 0,name,description,neighborhood_overview,host_about
0,panoramic ocean view venice beach,craftsman style penthouse ocean view room park...,close beach live venice boardwalk hollywood,yoga practice study year spend month thailand ...
1,spanish bungalow guest house la plus night,private guest house space private guest house ...,local la community shop restaurant walk distan...,male educate healthcare professional cycle run...
2,boho chic flat step beach hollywood,bright airy quiet bedroom locate step seclude ...,beach location los angeles peninsula marina de...,health wellness professional author passionate...
3,guest house entrance exit hot tub,fully self contain separate structure entrance...,close venice hassle parking space hollywood,easy go hostess enjoy stay experienced travele...
4,amazing bright elegant condo park upgrade,hollywood unit upgrade new bamboo floor brand ...,,paolo privitera ceo evensi paolo mit mba phone...


In [51]:
import spacy
web_md = spacy.load('en_core_web_md')

def spacy_lemmatizer(text):
  return " ".join([token.lemma_ for token in web_md(text)])

web_md_df = text_df.copy()

for col in text_df.columns:
  web_md_df[col] = text_df[col].apply(spacy_lemmatizer)

web_md_df.head()

Unnamed: 0,name,description,neighborhood_overview,host_about
0,Panoramic Ocean View Venice Beach,craftsman style penthouse ocean view room Park...,close beach live Venice boardwalk,serious yoga practice study year spend three m...
1,Spanish Bungalow Guest House LA CA plus night,PRIVATE GUEST HOUSE space private guest house ...,local LA Community shop restaurant walk distan...,male educate Healthcare Professional Cycle run...
2,Boho Chic Flat Steps Beach,bright airy quiet bdr locate step seclude clea...,one beach location Los Angeles Peninsula Marin...,Health Wellness Professional Author Passionate...
3,Guest House Entrance Exit Hot Tub,fully self contain separate structure entrance...,close Venice without hassle parking space,easy go hostess enjoy stay experienced travele...
4,amazing bright elegant condo park front UPGRADED,unit upgrade new bamboo flooring brand new Ult...,,Paolo Privitera CEO Evensi Paolo MIT MBA Phone...


#### Stemming: `PorterStemmer()`

In [34]:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
porter_df = text_df.copy()

def porter_stemming(word_list):
  return [stemmer.stem(word) for word in word_list]

for col in porter_df.columns:
  porter_df[col] = porter_df[col].apply(porter_stemming)
porter_df.head()

Unnamed: 0,name,description,neighborhood_overview,host_about
0,"[panoram, ocean, view, venic, beach]","[craftsmen, style, penthous, ocean, view, room...","[close, beach, live, venic, boardwalk]","[seriou, yoga, practic, studi, year, spend, th..."
1,"[spanish, bungalow, guest, hous, la, ca, plu, ...","[privat, guest, hous, space, privat, guest, ho...","[local, la, commun, shop, restaur, walk, dista...","[male, educ, healthcar, professionalcycl, run,..."
2,"[boho, chic, flatstep, beach]","[bright, airi, quiet, bdr, locat, step, seclud...","[one, beach, locat, lo, angel, peninsula, mari...","[healthwel, profession, author, passion, trave..."
3,"[guest, hous, entranceexit, hot, tub]","[fulli, selfcontain, separ, structur, entrance...","[close, venic, without, hassl, park, space]","[easi, go, hostess, enjoy, stay, experienc, tr..."
4,"[amaz, bright, eleg, condo, park, front, upgrad]","[unit, upgrad, new, bamboo, floor, brand, new,...",[],"[paolo, privitera, ceo, evensipaolo, mit, mba,..."


#### Lemmatization: `WordNetLemmatizer()`

In [39]:
import spacy
nlp = spacy.load('en_core_web_sm')

lem_df = text_df.copy()

def lemmatizer(word_list):
  return [word.lemma_ for word in nlp(" ".join(word_list))]



for col in lem_df.columns:
  lem_df[col] = lem_df[col].apply(lemmatizer)
lem_df.head()

Unnamed: 0,name,description,neighborhood_overview,host_about
0,"[Panoramic, Ocean, View, Venice, Beach]","[Craftsmen, style, penthouse, ocean, view, roo...","[close, beach, live, Venice, boardwalk]","[Serious, yoga, practice, studying, year, Spen..."
1,"[Spanish, Bungalow, Guest, House, LA, CA, plus...","[PRIVATE, GUEST, HOUSE, space, Private, Guest,...","[Local, LA, Community, shop, restaurant, walki...","[male, educated, Healthcare, ProfessionalCycle..."
2,"[Boho, Chic, FlatSteps, Beach]","[Bright, airy, quiet, bdr, located, step, secl...","[one, beach, location, Los, Angeles, Peninsula...","[HealthWellness, Professional, Author, Passion..."
3,"[Guest, House, EntranceExit, Hot, Tub]","[Fully, selfcontained, separate, structure, en...","[close, Venice, without, hassle, parking, space]","[Easy, going, hostess, Enjoy, stay, experience..."
4,"[Amazing, bright, elegant, condo, park, front,...","[Unit, upgraded, new, bamboo, flooring, brand,...",[],"[Paolo, Privitera, CEO, EvensiPaolo, MIT, MBA,..."


In [12]:
listings_df[['name', 'description', 'neighborhood_overview', 'host_about']].head()

Unnamed: 0,name,description,neighborhood_overview,host_about
0,"[Panoramic, Ocean, View, Venice, Beach]","[Craftsmen, style, penthouse, ocean, view, roo...","[close, beach, live, Venice, boardwalk]","[Serious, yoga, practice, studying, years, Spe..."
1,"[Spanish, Bungalow, Guest, House, LA, CA, plus...","[PRIVATE, GUEST, HOUSE, space, Private, Guest,...","[Local, LA, Community, shops, restaurants, wal...","[male, educated, Healthcare, ProfessionalCycle..."
2,"[Boho, Chic, FlatSteps, Beach]","[Bright, airy, quiet, bdr, located, steps, sec...","[one, beach, locations, Los, Angeles, Peninsul...","[HealthWellness, Professional, Author, Passion..."
3,"[Guest, House, EntranceExit, Hot, Tub]","[Fully, selfcontained, separate, structure, en...","[close, Venice, without, hassle, parking, spaces]","[Easy, going, hostess, Enjoy, stay, experience..."
4,"[Amazing, bright, elegant, condo, park, front,...","[Unit, upgraded, new, bamboo, flooring, brand,...",[],"[Paolo, Privitera, CEO, EvensiPaolo, MIT, MBA,..."


In [16]:
print(listings_df['neighborhood_overview'][4])

['']


#### Get common statistics for the text columns

In [23]:
import statistics

for col in ['name', 'description', 'neighborhood_overview', 'host_about']:
  print("Average for '{}': {} words".format(
    " ".join(col.split('_')), round(statistics.mean([len(name) for name in listings_df[col] if name != ['']]))
  ))

Average for 'name': 5 words
Average for 'description': 70 words
Average for 'neighborhood overview': 34 words
Average for 'host about': 35 words


In [13]:
from datetime import datetime

today = str(datetime.now()).split(" ")[0].replace('-', '')
listings_df.to_csv(f"../dataset/listings/listings_cleaned_{today}.csv", index=False)

In [None]:
sample_out_df = listings_df.head(1000)

In [92]:
web_sm_df.to_csv(f"../dataset/text/listings_text_cleaned.csv", index=False)