<div align="center">

# RIO Airbnb Project - Text Mining
**Latest Update:** *27th February 2023*

</div>

---

#### **Import required libraries & dataset**

In [1]:
import sys, os, re
sys.path.append(os.path.abspath("../"))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn import metrics



import airbnblib.cleaning as cln

In [10]:
# Load the most recent cleaned listings CSV.
# os.listdir() returns names in arbitrary order, so taking [-1] is not
# guaranteed to yield the newest file. Snapshot names end in a YYYYMMDD stamp
# (e.g. listings_cleaned_20230303.csv), so the lexicographic maximum is the
# latest one. Non-CSV entries are skipped so stray files cannot be selected.
newest_listing_csv = max(
  entry for entry in os.listdir("../dataset/listings/") if entry.endswith(".csv")
)
listings_df = pd.read_csv(f"../dataset/listings/{newest_listing_csv}")

newest_listing_csv

'listings_cleaned_20230303.csv'

In [2]:
# Load the most recent zipped, cleaned version of the dataset.
# os.listdir() returns names in arbitrary order, so taking [-1] is not
# guaranteed to yield the newest file. Archive names end in a YYYYMMDD stamp
# (e.g. listings_cleaned_20230303.zip), so the lexicographic maximum is the
# latest one. Non-zip entries are skipped so stray files cannot be selected.
newest_listing_zip = max(
  entry for entry in os.listdir("../dataset_zipped/listings/") if entry.endswith(".zip")
)
listings_df = pd.read_csv(f"../dataset_zipped/listings/{newest_listing_zip}")

newest_listing_zip

'listings_cleaned_20230303.zip'

#### **Revert text columns from string to list if necessary**

In [12]:
# Turn stringified lists such as "['a', 'b']" back into Python lists.
# Values that are not strings (NaN for missing text, or lists produced by an
# earlier run of this cell) are passed through unchanged; calling .rstrip()
# on them would raise AttributeError.
for col in ['name', 'description', 'neighborhood_overview', 'host_about']:
  listings_df[col] = listings_df[col].map(
    lambda x: x.rstrip("']").lstrip("['").split("', '") if isinstance(x, str) else x
  )

listings_df['name'][1]

In [None]:
# Drop the listed columns when they are present.
# errors="ignore" keeps the cell re-runnable: without it, drop() raises
# KeyError once the columns are gone (after a previous run, or when the loaded
# file was exported without its index column).
listings_df = listings_df.drop(
  columns=['Unnamed: 0.1', 'Unnamed: 0', 'amenities'],
  errors="ignore",
)

#### **Remove HTML tags & non-alphabetic characters**

In [3]:
from bs4 import BeautifulSoup

def utf8_only(text) -> str:
  """Strip HTML markup and keep only ASCII letters, apostrophes and spaces.

  Despite the name, the filter is stricter than "valid UTF-8": digits,
  punctuation, accented letters and newlines are all removed.

  Args:
    text: Raw cell value. Anything that is not a ``str`` (e.g. NaN for a
      missing value) is treated as empty text.

  Returns:
    The cleaned text, or ``''`` for non-string input.
  """
  if not isinstance(text, str):
    return ''
  # Name the parser explicitly: otherwise BeautifulSoup picks whichever parser
  # is installed, which varies between environments and emits a warning.
  plain_text = BeautifulSoup(text, "html.parser").get_text(separator=' ')
  # The former bytes(...).decode(...) round trip was dropped: it cannot remove
  # anything this whitelist would keep, and it raised on lone surrogates.
  return re.sub(r"[^a-zA-Z' ]", "", plain_text)
# Clean every free-text column of listings_df.
# Two stray expressions that used to sit inside this loop were removed: they
# referenced an undefined name `text` and raised NameError on the first pass.
for col in ['name', 'description', 'neighborhood_overview', 'host_about']:
  listings_df[col] = listings_df[col].apply(utf8_only)


In [20]:
# Show the first rows of the four text columns.
listings_df.loc[:, ['name', 'description', 'neighborhood_overview', 'host_about']].head()

Unnamed: 0,name,description,neighborhood_overview,host_about
0,Panoramic Ocean View Venice Beach,Craftsmen style penthouse with ocean view from...,About as close to the beach as you can live in...,Serious yoga practice Been studying for year...
1,Spanish Bungalow Guest House LA CA plus nights,PRIVATE GUEST HOUSE The space Private Guest ho...,Local LA Community shops and restaurants in ...,male educated Healthcare ProfessionalCycle r...
2,Boho Chic FlatSteps to Beach,Bright airy quiet bdr located just steps from...,You are in one of the beach locations in Los A...,HealthWellness Professional Author Passionate ...
3,Guest House With Its Own EntranceExit and Hot Tub,Fully selfcontained separate structure with it...,We are close to Venice without the hassle of n...,Easy going hostess Enjoy your stay I am an exp...
4,Amazing bright elegant condo park front UPGRADED,Unit upgraded with new bamboo flooring brand ...,,Paolo Privitera CEO EvensiPaolo MIT MBA Phone ...


<h2 align='center'>Text preprocessing with NLTK</h2>

#### Create a working copy of the text columns

In [9]:
# Independent copy of the four text columns; later steps modify text_df only.
text_df = listings_df.loc[:, ['name', 'description', 'neighborhood_overview', 'host_about']].copy()

#### 1. Tokenizing & removing stopwords

In [10]:
from nltk.corpus import stopwords

# English stopwords held in a set for fast membership checks.
stop_words = {word for word in stopwords.words('english')}

def remove_stopwords(entry, stopword_set=None):
  """Split ``entry`` on spaces and drop stopwords and empty tokens.

  Args:
    entry: A string, or a list/tuple of strings. Sequence items are joined
      with a single space before splitting.
    stopword_set: Optional collection of lowercase stopwords. Defaults to the
      notebook-level ``stop_words``.

  Returns:
    The remaining tokens with their original casing, in order. Any other
    input type (e.g. NaN for missing text) yields an empty list.
  """
  if stopword_set is None:
    stopword_set = stop_words
  if isinstance(entry, str):
    text = entry
  elif isinstance(entry, (list, tuple)):
    # Join with a space: "".join would glue the last word of one item to the
    # first word of the next.
    text = " ".join(entry)
  else:
    return []
  # Splitting on a single space leaves '' between repeated spaces; skip those.
  return [w for w in text.split(" ") if w and w.lower() not in stopword_set]

# Replace each column's text with its list of non-stopword tokens.
for column_name in text_df.columns:
  filtered_tokens = text_df[column_name].apply(remove_stopwords)
  text_df[column_name] = filtered_tokens
text_df.head()

Unnamed: 0,name,description,neighborhood_overview,host_about
0,"[Panoramic, Ocean, View, Venice, Beach]","[Craftsmen, style, penthouse, ocean, view, roo...","[close, beach, live, Venice, boardwalk]","[Serious, yoga, practice, studying, years, Spe..."
1,"[Spanish, Bungalow, Guest, House, LA, CA, plus...","[PRIVATE, GUEST, HOUSE, space, Private, Guest,...","[Local, LA, Community, shops, restaurants, wal...","[male, educated, Healthcare, ProfessionalCycle..."
2,"[Boho, Chic, FlatSteps, Beach]","[Bright, airy, quiet, bdr, located, steps, sec...","[one, beach, locations, Los, Angeles, Peninsul...","[HealthWellness, Professional, Author, Passion..."
3,"[Guest, House, EntranceExit, Hot, Tub]","[Fully, selfcontained, separate, structure, en...","[close, Venice, without, hassle, parking, spaces]","[Easy, going, hostess, Enjoy, stay, experience..."
4,"[Amazing, bright, elegant, condo, park, front,...","[Unit, upgraded, new, bamboo, flooring, brand,...",[],"[Paolo, Privitera, CEO, EvensiPaolo, MIT, MBA,..."


#### Stemming: `PorterStemmer()`

In [34]:
from nltk.stem.porter import PorterStemmer
# Stem into a separate frame; text_df is copied again for lemmatization.
porter_df = text_df.copy()
stemmer = PorterStemmer()

def porter_stemming(word_list):
  """Return the Porter stem of each word in ``word_list``, in order."""
  stems = []
  for token in word_list:
    stems.append(stemmer.stem(token))
  return stems

# Swap every token list for its stemmed counterpart, column by column.
for column_name in porter_df.columns:
  stemmed_tokens = porter_df[column_name].apply(porter_stemming)
  porter_df[column_name] = stemmed_tokens
porter_df.head()

Unnamed: 0,name,description,neighborhood_overview,host_about
0,"[panoram, ocean, view, venic, beach]","[craftsmen, style, penthous, ocean, view, room...","[close, beach, live, venic, boardwalk]","[seriou, yoga, practic, studi, year, spend, th..."
1,"[spanish, bungalow, guest, hous, la, ca, plu, ...","[privat, guest, hous, space, privat, guest, ho...","[local, la, commun, shop, restaur, walk, dista...","[male, educ, healthcar, professionalcycl, run,..."
2,"[boho, chic, flatstep, beach]","[bright, airi, quiet, bdr, locat, step, seclud...","[one, beach, locat, lo, angel, peninsula, mari...","[healthwel, profession, author, passion, trave..."
3,"[guest, hous, entranceexit, hot, tub]","[fulli, selfcontain, separ, structur, entrance...","[close, venic, without, hassl, park, space]","[easi, go, hostess, enjoy, stay, experienc, tr..."
4,"[amaz, bright, eleg, condo, park, front, upgrad]","[unit, upgrad, new, bamboo, floor, brand, new,...",[],"[paolo, privitera, ceo, evensipaolo, mit, mba,..."


#### Lemmatization: `WordNetLemmatizer()`

In [39]:
from nltk.stem import WordNetLemmatizer
# Lemmatize into a separate frame so text_df keeps the unlemmatized tokens.
lem_df = text_df.copy()
wnl = WordNetLemmatizer()

def wnl_lemmatizer(word_list):
  """Return the WordNet lemma of each word in ``word_list``, in order."""
  return list(map(wnl.lemmatize, word_list))

# Swap every token list for its lemmatized counterpart, column by column.
for column_name in lem_df.columns:
  lemmatized_tokens = lem_df[column_name].apply(wnl_lemmatizer)
  lem_df[column_name] = lemmatized_tokens
lem_df.head()

Unnamed: 0,name,description,neighborhood_overview,host_about
0,"[Panoramic, Ocean, View, Venice, Beach]","[Craftsmen, style, penthouse, ocean, view, roo...","[close, beach, live, Venice, boardwalk]","[Serious, yoga, practice, studying, year, Spen..."
1,"[Spanish, Bungalow, Guest, House, LA, CA, plus...","[PRIVATE, GUEST, HOUSE, space, Private, Guest,...","[Local, LA, Community, shop, restaurant, walki...","[male, educated, Healthcare, ProfessionalCycle..."
2,"[Boho, Chic, FlatSteps, Beach]","[Bright, airy, quiet, bdr, located, step, secl...","[one, beach, location, Los, Angeles, Peninsula...","[HealthWellness, Professional, Author, Passion..."
3,"[Guest, House, EntranceExit, Hot, Tub]","[Fully, selfcontained, separate, structure, en...","[close, Venice, without, hassle, parking, space]","[Easy, going, hostess, Enjoy, stay, experience..."
4,"[Amazing, bright, elegant, condo, park, front,...","[Unit, upgraded, new, bamboo, flooring, brand,...",[],"[Paolo, Privitera, CEO, EvensiPaolo, MIT, MBA,..."


In [12]:
# Show the first rows of the four text columns of listings_df.
listings_df.loc[:, ['name', 'description', 'neighborhood_overview', 'host_about']].head()

Unnamed: 0,name,description,neighborhood_overview,host_about
0,"[Panoramic, Ocean, View, Venice, Beach]","[Craftsmen, style, penthouse, ocean, view, roo...","[close, beach, live, Venice, boardwalk]","[Serious, yoga, practice, studying, years, Spe..."
1,"[Spanish, Bungalow, Guest, House, LA, CA, plus...","[PRIVATE, GUEST, HOUSE, space, Private, Guest,...","[Local, LA, Community, shops, restaurants, wal...","[male, educated, Healthcare, ProfessionalCycle..."
2,"[Boho, Chic, FlatSteps, Beach]","[Bright, airy, quiet, bdr, located, steps, sec...","[one, beach, locations, Los, Angeles, Peninsul...","[HealthWellness, Professional, Author, Passion..."
3,"[Guest, House, EntranceExit, Hot, Tub]","[Fully, selfcontained, separate, structure, en...","[close, Venice, without, hassle, parking, spaces]","[Easy, going, hostess, Enjoy, stay, experience..."
4,"[Amazing, bright, elegant, condo, park, front,...","[Unit, upgraded, new, bamboo, flooring, brand,...",[],"[Paolo, Privitera, CEO, EvensiPaolo, MIT, MBA,..."


In [16]:
# Inspect the neighborhood overview stored under index label 4.
overview_column = listings_df['neighborhood_overview']
print(overview_column[4])

['']


#### Get common statistics for the text columns

In [23]:
import statistics


def _word_count(entry) -> int:
  """Count whitespace-separated words in a string or a list of strings."""
  if isinstance(entry, str):
    return len(entry.split())
  if isinstance(entry, (list, tuple)):
    return sum(len(item.split()) for item in entry if isinstance(item, str))
  # Anything else (e.g. NaN for missing text) holds no words.
  return 0


for col in ['name', 'description', 'neighborhood_overview', 'host_about']:
  # Count words rather than len(entry): for a plain string len() gives the
  # number of characters, which the "words" label would misreport.
  # Entries without any words are left out of the average.
  word_counts = [count for count in map(_word_count, listings_df[col]) if count > 0]
  # statistics.mean raises StatisticsError on an empty sequence.
  average = round(statistics.mean(word_counts)) if word_counts else 0
  print("Average for '{}': {} words".format(col.replace('_', ' '), average))

Average for 'name': 5 words
Average for 'description': 70 words
Average for 'neighborhood overview': 34 words
Average for 'host about': 35 words


In [13]:
from datetime import datetime

# Stamp the export with today's date formatted as YYYYMMDD.
today = datetime.now().strftime("%Y%m%d")
output_path = f"../dataset/listings/listings_cleaned_{today}.csv"
listings_df.to_csv(output_path, index=False)