# 1.Import Dependencies

In [None]:
import pandas as pd
import numpy as np
import re
import plotly.express as px
import spacy
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import os
import zipfile
import json
import copy

In [None]:
# # uncomment to download package
# nltk.download('stopwords')

In [None]:
# # uncomment to download package
# !python3 -m spacy download en_core_web_md

In [None]:
# # uncomment to download package
# !python3 -m spacy download en_core_web_lg

# 2.Define Helper Functions

In [None]:
def spacy_vectorizer(amenity, spacyModel):
  """Run ``spacyModel`` on ``amenity`` and return whatever the model returns.

  Args:
    amenity: the amenity name to process.
    spacyModel: a callable pipeline (e.g. the object returned by ``spacy.load``).

  Returns:
    The result of ``spacyModel(amenity)``.
  """
  return spacyModel(amenity)

def findEntities(doc):
  """Return the entities of ``doc`` whose label is 'ORG'.

  Args:
    doc: an object exposing ``ents``, an iterable of entities that each
      carry a ``label_`` attribute.

  Returns:
    A list with the 'ORG' entities in their original order (empty when
    there are none).
  """
  return [ent for ent in doc.ents if ent.label_ == 'ORG']

def entityCorrections(entities, errors):
  """Drop the entities whose text was flagged as a false positive.

  Args:
    entities: entities carrying a ``text`` attribute; a falsy value
      (``None`` or an empty list) yields an empty result.
    errors: collection of entity texts to discard.

  Returns:
    A new list with the entities whose ``text`` is not in ``errors``.
  """
  return [entity for entity in (entities or []) if entity.text not in errors]


def fetch_pos(entities):
  """Flatten the character offsets of ``entities`` into a single list.

  Args:
    entities: iterable of objects with ``start_char`` and ``end_char``.

  Returns:
    ``[start_0, end_0, start_1, end_1, ...]`` in the order of ``entities``.
  """
  flat_pos = []
  for entity in entities:
    flat_pos.extend((entity.start_char, entity.end_char))

  return flat_pos

def fetch_slices(flat_pos, amenity):
  """List every character index of ``amenity``, fencing boundaries with '|'.

  Indices found in ``flat_pos`` alternate between opening a fenced region
  ('|' is emitted before the index) and closing it ('|' is emitted after
  the index). All other indices are emitted unchanged.

  Args:
    flat_pos: boundary indices, as produced by ``fetch_pos``.
    amenity: the string whose indices are enumerated.

  Returns:
    A list mixing ints (indices) and '|' markers.
  """
  marked = []
  inside = False

  for position in range(len(amenity)):
    if position not in flat_pos:
      marked.append(position)
    elif inside:
      # closing boundary: index first, then the fence
      marked.extend((position, '|'))
      inside = False
    else:
      # opening boundary: fence first, then the index
      marked.extend(('|', position))
      inside = True

  return marked

def split_pipe(slices):
  """Cut the marked index list into groups at every '|' marker.

  Ints are rendered as text, the whole list is joined with ',' and the
  result is split on '|' and then on ','. Because the fences sit between
  commas, the groups contain empty strings at their edges.

  Args:
    slices: list of ints and '|' markers, as produced by ``fetch_slices``.

  Returns:
    A list of lists of strings.
  """
  as_text = []
  for item in slices:
    as_text.append(item if type(item) is not int else str(item))

  joined = ','.join(as_text)

  return [chunk.split(',') for chunk in joined.split('|')]

def clean_splits(splits):
  """Discard the empty strings in each group and convert the rest to int.

  Args:
    splits: list of lists of strings, as produced by ``split_pipe``.

  Returns:
    A list of lists of ints; a group made only of empty strings becomes
    an empty list.
  """
  return [[int(text) for text in group if text != ''] for group in splits]

def fetch_keep_words(final_splits, flat_pos):
  """Keep the non-empty groups that do not start on an entity boundary.

  Args:
    final_splits: list of lists of ints, as produced by ``clean_splits``.
    flat_pos: boundary indices, as produced by ``fetch_pos``.

  Returns:
    The groups (same list objects) whose first index is not in ``flat_pos``.
  """
  return [group for group in final_splits if group and group[0] not in flat_pos]

def fetch_final_slices(keep_words):
  """Reduce every multi-index group to its ``[first, last]`` pair.

  Args:
    keep_words: list of lists of ints, as produced by ``fetch_keep_words``.

  Returns:
    A list where groups longer than one element are replaced by
    ``[first, last]`` and shorter groups are passed through unchanged.
  """
  return [
    [group[0], group[-1]] if len(group) > 1 else group
    for group in keep_words
  ]

def build_new_amenity(final_slices, amenity):
  """Concatenate the parts of ``amenity`` described by ``final_slices``.

  A one-element piece contributes the single character at that index.
  A two-element piece ``[first, last]`` contributes ``amenity[first:last]``
  (the character at ``last`` is left out), except for the final piece of
  the list, which contributes ``amenity[first:last + 1]``.

  Args:
    final_slices: list of ``[index]`` or ``[first, last]`` pieces, as
      produced by ``fetch_final_slices``.
    amenity: the original amenity string.

  Returns:
    The rebuilt string ('' when ``final_slices`` is empty).
  """
  last_position = len(final_slices) - 1  # 0 based index
  parts = []

  for position, piece in enumerate(final_slices):
    if len(piece) <= 1:
      parts.append(amenity[piece[0]])
    elif position >= last_position:
      parts.append(amenity[piece[0]:piece[1] + 1])
    else:
      parts.append(amenity[piece[0]:piece[1]])

  return ''.join(parts)

def removeEntities(amenity, entities):
  """Return ``amenity`` with the text of every given entity cut out.

  The characters in ``[start_char, end_char)`` of each entity are removed
  and replaced by a single space, so the words on either side of a removed
  entity are never glued together (e.g. 'Small Bose speaker' does not
  become 'Smallspeaker'). Characters outside the entity spans, including
  punctuation next to an entity, are left untouched.

  Args:
    amenity: the original amenity name.
    entities: entities to remove; each must expose ``start_char`` and
      ``end_char`` offsets into ``amenity``. May be empty.

  Returns:
    The stripped remainder of the name. If nothing but whitespace is left
    after the removal (or there is nothing to remove), the stripped
    original name is returned instead, so the result is never emptied by
    the removal itself.
  """
  kept_pieces = []
  cursor = 0

  # Walk the spans left to right and keep only the text between them.
  for entity in sorted(entities, key=lambda ent: ent.start_char):
    kept_pieces.append(amenity[cursor:entity.start_char])
    # max() keeps the cursor monotonic should two spans ever overlap
    cursor = max(cursor, entity.end_char)
  kept_pieces.append(amenity[cursor:])

  # A single space stands in for each removed span so neighbours never merge.
  fin_amenity = ' '.join(kept_pieces).strip()

  # Test *after* stripping: a whitespace-only remainder counts as empty.
  if not fin_amenity:
    fin_amenity = amenity.strip()

  return fin_amenity


def split_strucute(amenity):
  """Return the part of ``amenity`` that precedes its first separator.

  A separator is an en dash or a hyphen surrounded by one whitespace
  character on each side, or a colon. When no separator is present the
  whole string is returned.

  Args:
    amenity: the amenity name, e.g. 'Fast wifi – 79 Mbps'.

  Returns:
    The leading segment, e.g. 'Fast wifi'.
  """
  # Raw string: the escapes belong to the regex, not to the Python literal.
  # Only the first segment is needed, so one split is enough.
  return re.split(r'\s[–-]\s|:', amenity, maxsplit=1)[0]

def remove_number(amenity):
  """Delete every digit from ``amenity``.

  Note that the character class also contains a literal asterisk, so
  '*' characters are deleted as well.

  Args:
    amenity: the amenity name, e.g. '48" HDTV'.

  Returns:
    The name without digits (and asterisks), e.g. '" HDTV'. Surrounding
    characters and whitespace are left in place.
  """
  # Raw string: the escape belongs to the regex, not to the Python literal.
  return re.sub(r'[\d*]', '', amenity)

def alphanumeric(amenity):
  """Keep only the tokens made entirely of word characters.

  The name is split on every single whitespace character. Tokens that
  contain anything other than word characters (letters, digits,
  underscore), such as 'check-in' or a lone quote, are dropped, and so
  are the empty tokens produced by consecutive whitespace.

  Args:
    amenity: the amenity name.

  Returns:
    The surviving tokens joined by single spaces.
  """
  # Raw strings: the escapes belong to the regex, not to the Python literal.
  amenity_alnum = [
    token for token in re.split(r'\s', amenity) if re.fullmatch(r'\w+', token)
  ]

  return ' '.join(amenity_alnum)

def stem(amenity):
  """Stem every token of ``amenity`` with NLTK's ``PorterStemmer``.

  The name is split on every single whitespace character (consecutive
  whitespace therefore yields empty tokens, which are passed to the
  stemmer like any other token).

  Args:
    amenity: the amenity name.

  Returns:
    The stemmed tokens joined by single spaces.
  """
  ps = PorterStemmer()

  # Raw string: the escape belongs to the regex, not to the Python literal.
  return ' '.join(ps.stem(token) for token in re.split(r'\s', amenity))

def low_case(amenity):
  """Lower-case ``amenity`` and normalise its whitespace.

  Args:
    amenity: the amenity name.

  Returns:
    The whitespace-separated tokens, lower-cased and joined by single
    spaces (leading and trailing whitespace disappears).
  """
  return ' '.join(map(str.lower, amenity.split()))

def remove_stopwords(amenity):
  """Drop the tokens of ``amenity`` that are NLTK English stopwords.

  The comparison is an exact, case-sensitive match against
  ``stopwords.words('english')``.

  Args:
    amenity: the amenity name.

  Returns:
    The remaining whitespace-separated tokens joined by single spaces.
  """
  english_stopwords = set(stopwords.words('english'))
  kept = [word for word in amenity.split() if word not in english_stopwords]

  return ' '.join(kept)


In [None]:
# https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html

from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np

def find_silhouette(X, k_min, k_max, step=1, area_plot_only=False):
  """Fit KMeans for a range of cluster counts and draw a silhouette plot for each.

  For every k in ``range(k_min, k_max + 1, step)`` a new figure is created,
  ``KMeans(n_clusters=k, n_init="auto", random_state=42)`` is fitted on ``X``,
  the average silhouette score is printed, and the sorted per-sample
  silhouette values are drawn as one filled band per cluster. Unless
  ``area_plot_only`` is true, a second axes shows a scatter of the first two
  columns of ``X`` coloured by cluster label, with the cluster centers marked.

  After the loop the function calls ``plt.tight_layout()``, writes
  "silhouette.png" (a relative path) with ``plt.savefig`` and calls
  ``plt.show()``.

  Args:
    X: the data to cluster. It is passed to ``len()`` and, for the scatter
      plot, indexed as ``X[:, 0]`` / ``X[:, 1]`` -- presumably a 2-D numpy
      array of shape (n_samples, n_features); TODO confirm.
    k_min: smallest number of clusters to try.
    k_max: largest number of clusters to try (inclusive).
    step: increment between consecutive cluster counts.
    area_plot_only: when true, only the silhouette plot is drawn.

  Returns:
    None.
  """
  range_n_clusters = list(range(k_min, k_max+1, step))

  for n_clusters in range_n_clusters:
      # Create a subplot with 1 row and 2 columns (or a single axes when
      # only the silhouette plot is requested)
      if not area_plot_only:
        fig, (ax1, ax2) = plt.subplots(1, 2)
      else:
        fig, ax1 = plt.subplots(1, 1)

      fig.set_size_inches(15, 15)

      # The 1st subplot is the silhouette plot
      # The silhouette coefficient can range from -1, 1 but the x axis is
      # limited to [-0.1, 1] here
      ax1.set_xlim([-0.1, 1])
      # The (n_clusters+1)*10 is for inserting blank space between silhouette
      # plots of individual clusters, to demarcate them clearly.
      ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

      # Initialize the clusterer with n_clusters value and a random generator
      # seed of 42 for reproducibility.
      clusterer = KMeans(n_clusters=n_clusters, n_init="auto", random_state=42)
      cluster_labels = clusterer.fit_predict(X)

      # The silhouette_score gives the average value for all the samples.
      # This gives a perspective into the density and separation of the formed
      # clusters
      silhouette_avg = silhouette_score(X, cluster_labels)
      print(
          "For n_clusters =",
          n_clusters,
          "The average silhouette_score is :",
          silhouette_avg,
      )

      # Compute the silhouette scores for each sample
      sample_silhouette_values = silhouette_samples(X, cluster_labels)

      y_lower = 10
      for i in range(n_clusters):
          # Aggregate the silhouette scores for samples belonging to
          # cluster i, and sort them
          ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]

          ith_cluster_silhouette_values.sort()

          size_cluster_i = ith_cluster_silhouette_values.shape[0]
          y_upper = y_lower + size_cluster_i

          color = cm.nipy_spectral(float(i) / n_clusters)
          ax1.fill_betweenx(
              np.arange(y_lower, y_upper),
              0,
              ith_cluster_silhouette_values,
              facecolor=color,
              edgecolor=color,
              alpha=0.7,
          )

          # Label the silhouette plots with their cluster numbers at the middle
          ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i), fontsize=16)

          # Compute the new y_lower for next plot
          y_lower = y_upper + 10  # 10 for the 0 samples

      ax1.set_title("The silhouette plot for the various clusters.", fontsize=16)
      ax1.set_xlabel("The silhouette coefficient values", fontsize=20)
      ax1.set_ylabel("Cluster label", fontsize=20)

      # The vertical line for average silhouette score of all the values
      ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

      ax1.set_yticks([])  # Clear the yaxis labels / ticks
      ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
      ax1.set_xticklabels(ax1.get_xticks(), size=16)

      # 2nd Plot showing the actual clusters formed (first two columns of X)
      if not area_plot_only:
        colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
        ax2.scatter(
            X[:, 0], X[:, 1], marker=".", s=30, lw=0, alpha=0.7, c=colors, edgecolor="k"
        )

        # Labeling the clusters
        centers = clusterer.cluster_centers_
        # Draw white circles at cluster centers
        ax2.scatter(
            centers[:, 0],
            centers[:, 1],
            marker="o",
            c="white",
            alpha=1,
            s=200,
            edgecolor="k",
        )

        # Write each cluster's number inside its white circle
        for i, c in enumerate(centers):
            ax2.scatter(c[0], c[1], marker="$%d$" % i, alpha=1, s=50, edgecolor="k")

        ax2.set_title("The visualization of the clustered data.")
        ax2.set_xlabel("Feature space for the 1st feature")
        ax2.set_ylabel("Feature space for the 2nd feature")

      plt.suptitle(
          "Silhouette analysis for KMeans clustering on sample data with n_clusters = %d"
          % n_clusters,
          fontsize=18,
          fontweight="bold",
      )

  # NOTE(review): the calls below run once, after the loop. pyplot calls act
  # on the current figure, so with several cluster counts presumably only the
  # last figure is laid out and saved -- confirm this is intended.
  plt.tight_layout()


  plt.savefig("silhouette.png",
            bbox_inches ="tight",
            pad_inches = 0.1,
            transparent = True,
            facecolor ="w",
            edgecolor ='w',
            orientation ='landscape')


  plt.show()

# 3.Import Datasets

In [None]:
# with zipfile.ZipFile('./listings.zip', 'r') as zip_ref:
#   try:
#     zip_ref.extractall('./')
#     if 'listings.csv' not in os.listdir('./'):
#         raise Exception('Failed to unzip files')
#     else:
#       print('Unzipped files')
#   except:
#     print('Failed to unzip files')

Unzipped files


In [None]:
# personal directory
# det_list_df = pd.read_csv('./listings.csv', dtype={'id':str}) 

# GitHub directory
det_list_df = pd.read_csv('../data/raw/AirBnB/Samples/listings.csv', dtype={'id':str}) 

In [None]:
amenities = det_list_df['amenities']

In [None]:
amenities.sample().values

array(['["First aid kit", "Oven", "Free street parking", "Fire extinguisher", "Wifi", "Hair dryer", "Shampoo", "Cooking basics", "Toaster", "Private entrance", "Microwave", "Dishwasher", "Long term stays allowed", "Extra pillows and blankets", "Refrigerator", "Heating", "Carbon monoxide alarm", "Hangers", "Essentials", "Hot water", "Dryer", "Freezer", "Keypad", "Iron", "Coffee maker", "Self check-in", "Paid parking off premises", "Washer", "Dedicated workspace", "Bed linens", "TV", "Stove", "Smoke alarm", "Pets allowed", "City skyline view", "Elevator", "Kitchen", "Air conditioning", "Dishes and silverware", "Blender", "Ceiling fan"]'],
      dtype=object)

In [None]:
# decode each listing's 'amenities' JSON array string into a Python list
decoded_amenities = amenities.apply(lambda x: json.loads(x))

In [None]:
decoded_amenities.sample().values

array([list(['First aid kit', 'Free street parking', 'Free parking on premises', 'Fire extinguisher', 'Wifi', 'Hair dryer', 'Shampoo', 'Cooking basics', 'Private entrance', 'Breakfast', 'Microwave', 'Long term stays allowed', 'Refrigerator', 'Heating', 'Carbon monoxide alarm', 'Hangers', 'Essentials', 'Hot water', 'Freezer', 'Keypad', 'Iron', 'Self check-in', 'TV', 'Backyard', 'Smoke alarm', 'Kitchen', 'Air conditioning', 'Coffee maker: Keurig coffee machine', 'Dishes and silverware'])],
      dtype=object)

In [None]:
# Collect every distinct amenity name that appears in any listing.
amenities_unique = {
  amenity_name
  for amenities_listing in decoded_amenities
  for amenity_name in amenities_listing
}

In [None]:
print(len(amenities_unique))

3017


In [None]:
list(amenities_unique)[:50]

['Bluetooth speakers sound system with Bluetooth and aux',
 'Fast wifi – 806 Mbps',
 'Marshall Standmore II Bluetooth sound system',
 '32" HDTV with Apple TV',
 'Toaster Oven oven',
 'GE Cafe stainless steel gas stove',
 'Fast wifi – 718 Mbps',
 'Ginger Lily Farms shampoo',
 '60" HDTV with Apple TV, Roku, Fire TV, Amazon Prime Video, Hulu, Disney+, premium cable, Netflix',
 'Dryer –\xa0In unit',
 'Dedicated workspace',
 'HDTV with Chromecast, HBO Max, Hulu, Amazon Prime Video, Roku, Fire TV, Disney+, Netflix, Apple TV',
 '65" HDTV with Amazon Prime Video, HBO Max, Netflix, premium cable, Roku, Disney+, Hulu',
 'Plaine Products (zero-waste and vegan) shampoo',
 '43" HDTV with Amazon Prime Video, Apple TV, Netflix, standard cable',
 'Coffee maker: drip coffee maker, french press',
 '56" HDTV with Hulu, Netflix, DVD player, standard cable, Apple TV',
 'Wifi – 42 Mbps',
 'Fast wifi – 391 Mbps',
 'TV with standard cable, Apple TV',
 'TRADER JOES shampoo',
 'Fast wifi – 437 Mbps',
 'Game con

# 4.Generalize Amenities

## Resources
* **1. Amenity Categories**
  * [The amenities guests want by AirBnB](https://www.airbnb.com/resources/hosting-homes/a/the-amenities-guests-want-25)

* **2. Product Name Generalization Workflow**
  * [Product Clustering: A Text Clustering Approach](https://medium.com/moosend-engineering-data-science/product-clustering-a-text-clustering-approach-c392c2ef4310)
  * [How to Practice Word2Vec for NLP Using Python](https://builtin.com/machine-learning/nlp-word2vec-python)

* **3. Models from spaCy**
  * [Available trained pipelines for English](https://spacy.io/models/en)

* **4. Word Vectors and Semantic Similarity from spaCy**
  * [Word vectors and semantic similarity](https://spacy.io/usage/linguistic-features#vectors-similarity)

* **5. Named Entity Recognition**
  * [Named Entity Recognition from spaCy](https://spacy.io/usage/spacy-101#annotations-ner)
  * [Data Dictionary](https://www.kaggle.com/code/curiousprogrammer/entity-extraction-and-classification-using-spacy)

* **6. Text Preprocessing**
  * [Must Known Techniques for text preprocessing in NLP](https://www.analyticsvidhya.com/blog/2021/06/must-known-techniques-for-text-preprocessing-in-nlp/)
  * [NLP- Text Preprocessing Techniques](https://medium.com/swlh/nlp-text-preprocessing-techniques-ea34d3f84de4)

* **7. OHE Amenity Categories**
  * [OHE Amenities](https://stackoverflow.com/questions/45312377/how-to-one-hot-encode-from-a-pandas-column-containing-a-list)

## 1.Named Entity Recognition (N.E.R) Tokens versus Tokens in spaCy

In [None]:
nlp_md = spacy.load('en_core_web_md')

In [None]:
nlp_lg = spacy.load('en_core_web_lg')

In [None]:
doc = nlp_lg('55" HDTV with Amazon Prime Video, Apple TV, Netflix')

In [None]:
# practice fetching entities with the doc.ents attribute
for ent in doc.ents:
    print(
        """
        text:{}
        start_char:{}
        end_char:{}
        label:{}
        """.format(ent.text, ent.start_char, ent.end_char, ent.label_))


        text:55
        start_char:0
        end_char:2
        label:CARDINAL
        

        text:Amazon Prime Video
        start_char:14
        end_char:32
        label:ORG
        

        text:Apple TV
        start_char:34
        end_char:42
        label:ORG
        

        text:Netflix
        start_char:44
        end_char:51
        label:ORG
        


In [None]:
# practice inspecting per-token attributes (including ent_type_) by iterating over the doc's tokens
for token in doc:
  print(
      """
      text: {}
      lemma_: {},
      ent_type_: {},
      pos_: {}
      tag_: {}
      dep_: {}
      shape_: {}
      is_alpha: {}
      is_stop: {}
      """.format(token.text, token.lemma_, token.ent_type_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop))


      text: 55
      lemma_: 55,
      ent_type_: CARDINAL,
      pos_: NUM
      tag_: CD
      dep_: nummod
      shape_: dd
      is_alpha: False
      is_stop: False
      

      text: "
      lemma_: ",
      ent_type_: ,
      pos_: PUNCT
      tag_: ``
      dep_: punct
      shape_: "
      is_alpha: False
      is_stop: False
      

      text: HDTV
      lemma_: hdtv,
      ent_type_: ,
      pos_: NOUN
      tag_: NN
      dep_: ROOT
      shape_: XXXX
      is_alpha: True
      is_stop: False
      

      text: with
      lemma_: with,
      ent_type_: ,
      pos_: ADP
      tag_: IN
      dep_: prep
      shape_: xxxx
      is_alpha: True
      is_stop: True
      

      text: Amazon
      lemma_: Amazon,
      ent_type_: ORG,
      pos_: PROPN
      tag_: NNP
      dep_: compound
      shape_: Xxxxx
      is_alpha: True
      is_stop: False
      

      text: Prime
      lemma_: Prime,
      ent_type_: ORG,
      pos_: PROPN
      tag_: NNP
      dep_: compound
   

## 2.Create Catalog of Amenities

In [None]:
amenity_index = list(range(1, len(amenities_unique)+1))
amenity_index_str = ['a'+str(index) for index in amenity_index]

In [None]:
amenities_catalog = dict(zip(amenity_index_str, sorted(amenities_unique)))

In [None]:
for key in list(amenities_catalog.keys())[0:70]:
  print('{}: {}'.format(key, amenities_catalog[key]))

a1:  GE stove
a2:  Small bluetooth speaker Bluetooth sound system
a3: "Everyone" and a local non-profit bar of soap. body soap
a4: "Everyone" shampoo
a5: 1 burner portable induction glass stove top induction stove
a6: 1" HDTV with Disney+, Hulu
a7: 1" HDTV with Disney+, Hulu, Roku
a8: 1/2 size in kitchenette with separate freezer and mini fridge in one bedroom refrigerator
a9: 100" TV with Roku, Hulu, Amazon Prime Video, Netflix, Apple TV, HBO Max
a10: 108" HDTV with Amazon Prime Video, Netflix, Roku, standard cable
a11: 120" HDTV with Disney+, Apple TV, Netflix, premium cable, Amazon Prime Video
a12: 19" TV
a13: 2 burner cooktop oven
a14: 2 burner glass induction stove top induction stove
a15: 2 burner glass top stove electric stove
a16: 2 burner induction glass stove top induction stove
a17: 2 burners induction stove
a18: 2 in 1 shampoo and conditioner conditioner
a19: 20" HDTV
a20: 20" HDTV with Fire TV
a21: 20" compact smoothtop electric stove  oven
a22: 21" HDTV with Roku, Netflix

## 3.Vectorize Amenities

In [None]:
import copy

# Keep a copy of the flat {id: name} catalog, then rebuild the catalog so
# that every id maps to a record dict holding the name under 'string'.
amenities_catalog_copy = copy.deepcopy(amenities_catalog)

amenities_catalog = {
  key: {'string': amenity_name}
  for key, amenity_name in amenities_catalog_copy.items()
}

In [None]:
for key in amenities_catalog.keys():
  amenities_catalog[key]['spacyVector']=spacy_vectorizer(amenities_catalog[key]['string'], nlp_lg)

In [None]:
for key in list(amenities_catalog.keys())[:5]:
  print('{}: {}\n'.format(key, amenities_catalog[key]))

a1: {'string': ' GE stove', 'spacyVector':  GE stove}

a2: {'string': ' Small bluetooth speaker Bluetooth sound system', 'spacyVector':  Small bluetooth speaker Bluetooth sound system}

a3: {'string': '"Everyone" and a local non-profit bar of soap. body soap', 'spacyVector': "Everyone" and a local non-profit bar of soap. body soap}

a4: {'string': '"Everyone" shampoo', 'spacyVector': "Everyone" shampoo}

a5: {'string': '1 burner portable induction glass stove top induction stove', 'spacyVector': 1 burner portable induction glass stove top induction stove}



## 4.Get ORG Type Recognized Entities

In [None]:
for key in amenities_catalog.keys():
  amenities_catalog[key]['entities'] = findEntities( amenities_catalog[key]['spacyVector'])

In [None]:
for amenitiy in list(amenities_catalog.values())[25:50]:
  print('{}\n'.format(amenitiy))

{'string': '25" HDTV with Netflix', 'spacyVector': 25" HDTV with Netflix, 'entities': [Netflix]}

{'string': '27" HDTV with Hulu, Netflix', 'spacyVector': 27" HDTV with Hulu, Netflix, 'entities': [Hulu, Netflix]}

{'string': '3 in 1 arm and hammer shower gel.shampo and conditioner shampoo', 'spacyVector': 3 in 1 arm and hammer shower gel.shampo and conditioner shampoo, 'entities': []}

{'string': '3 in 1 conditioner', 'spacyVector': 3 in 1 conditioner, 'entities': []}

{'string': '3 in 1 organic  conditioner', 'spacyVector': 3 in 1 organic  conditioner, 'entities': []}

{'string': '3 in 1 organic  shampoo', 'spacyVector': 3 in 1 organic  shampoo, 'entities': []}

{'string': '3 in 1 shampoo', 'spacyVector': 3 in 1 shampoo, 'entities': []}

{'string': '3 in 1. shower gel. shampoo.conditioner body soap', 'spacyVector': 3 in 1. shower gel. shampoo.conditioner body soap, 'entities': [shampoo.conditioner]}

{'string': '30" HDTV with Amazon Prime Video, Netflix, Roku, standard cable', 'spacyV

## 5.Manually Correct Any Falsely Identified Entities

In [None]:
# gather the text of every entity recognised in any amenity so that
# false positives can be vetted by hand
fin_entities = [
  entity.text
  for record in amenities_catalog.values()
  for entity in (record['entities'] or ())
]

In [None]:
len(set(fin_entities))

251

In [None]:
sorted(list(set(fin_entities)))

['AEG',
 'AESOP',
 'Alba',
 'Alba Botanica',
 'Alexa',
 'Alexa Bluetooth',
 'Alexa Echos',
 'All Natural',
 'Amana',
 'Amazon',
 'Amazon Brand',
 'Amazon Echo',
 'Amazon Echo w/',
 'Amazon Essentials Silky Body Wash - Peach',
 'Amazon Music Unlimited',
 'Amazon Prime Video',
 'Anker',
 'Antique GE',
 'Apple Home Pod Bluetooth',
 'Apple HomePod',
 'Apple Music',
 'Apple TV',
 'Argan Oil',
 'Argon',
 'Arm & Hammer 3',
 'Avalon Organics',
 'Aveda',
 'Aveeno',
 'B&D',
 'BATH &',
 'Backyard',
 'Barbecue',
 'Bars &',
 'Bath &',
 'Bath & Body',
 'Bath & Body Works',
 'Bath & Bodyworks',
 'Bath and Body Works',
 'Beauty & Planet',
 'Beekman Fresh Air',
 'Bertazzoni stainless steel',
 'Bio Botanics',
 'Bluetooth',
 'Bluetooth Speakers',
 'BodyWash & Bar Variety',
 'Bosch',
 'Bosch Convection  oven',
 'Bosch Mini Speed Oven stainless steel',
 'Bose',
 'Bose Sound Bar',
 'Bose SoundLink Color II',
 'Brand',
 'Brookstone Bluetooth',
 'Cafe GE',
 'Champagne Collection',
 'Chromecast',
 'Citrus+Aloe

In [None]:
error_ents = [
    'Backyard', 
    'Backyard - Fully', 
    'Barbecue', 
    'Bath &', 
    'Bathtub', 
    'Blender', 
    'Board', 
    'Children', 
    'Dishwasher', 
    'EV', 
    'Keypad', 
    'Microphone Bluetooth',
    'Mini', 
    'Outlet', 
    'Rudy\'s &', 
    'STAINLESS STEEL',
    'Stainless', 
    'Stainless Steel', 
    'Stainless-Steel', 
    'glasstops stainless steel electric']

In [None]:
# Correct Errors
for key in amenities_catalog.keys():
  amenities_catalog[key]['ents_fin'] = entityCorrections(amenities_catalog[key]['entities'], error_ents)

In [None]:
# sanity check
amenities_catalog['a50']

{'string': '32" HDTV with Fire TV, Hulu, Netflix',
 'spacyVector': 32" HDTV with Fire TV, Hulu, Netflix,
 'entities': [Fire TV, Hulu, Netflix],
 'ents_fin': [Fire TV, Hulu, Netflix]}

## 6.Remove Recognized Entities

In [None]:
for key in amenities_catalog.keys():
    new_amenity = removeEntities(amenities_catalog[key]['string'], amenities_catalog[key]['ents_fin'])
    amenities_catalog[key]['noEnt_str'] = new_amenity

In [None]:
# sanity check
amenities_catalog['a230']

{'string': '48" HDTV with Chromecast, Netflix, premium cable',
 'spacyVector': 48" HDTV with Chromecast, Netflix, premium cable,
 'entities': [Chromecast, Netflix],
 'ents_fin': [Chromecast, Netflix],
 'noEnt_str': '48" HDTV with  premium cable'}

In [None]:
current_set = []
for key in amenities_catalog.keys():
  current_set.append(amenities_catalog[key]['noEnt_str'])

In [None]:
len(set(current_set))

2283

## 7.Take Advantage of Available Structure and Remove Numbers or Units of Measurements

* Up to now, the primary focus was to identify and remove brands from product names. Next, any semi-structured amenity names which contain separators like (<font color='red'>**':', '-'**</font>) will be split so that only the leading keywords (the text before the first separator) are kept. In addition, any numbers (for example sizes or speeds) will be stripped from the product names.

In [None]:
current_set = []
for key in amenities_catalog.keys():
  amenities_catalog[key]['noEnt_split_str'] = split_strucute(amenities_catalog[key]['noEnt_str'])
  current_set.append(amenities_catalog[key]['noEnt_split_str'])

In [None]:
len(set(current_set))

1663

In [None]:
current_set = []
for key in amenities_catalog.keys():
  amenities_catalog[key]['noEnt_split_noNum_str'] = remove_number(amenities_catalog[key]['noEnt_split_str'])
  current_set.append(amenities_catalog[key]['noEnt_split_noNum_str'])

In [None]:
len(set(current_set))

1382

## 8.Apply Typical Text Pre-Processing

In [None]:
# alphanumeric
for key in amenities_catalog.keys():
  amenities_catalog[key]['alnum'] = alphanumeric(amenities_catalog[key]['noEnt_split_noNum_str'])

In [None]:
# lowercase
for key in amenities_catalog.keys():
  amenities_catalog[key]['low'] = low_case(amenities_catalog[key]['alnum'])

In [None]:
# remove stopwords
for key in amenities_catalog.keys():
  amenities_catalog[key]['no_stopwords'] = remove_stopwords(amenities_catalog[key]['low'])

In [None]:
# stem
for key in amenities_catalog.keys():
  amenities_catalog[key]['stem'] = stem(amenities_catalog[key]['no_stopwords'])

In [None]:
# sanity check
amenities_catalog['a230']

{'string': '48" HDTV with Chromecast, Netflix, premium cable',
 'spacyVector': 48" HDTV with Chromecast, Netflix, premium cable,
 'entities': [Chromecast, Netflix],
 'ents_fin': [Chromecast, Netflix],
 'noEnt_str': '48" HDTV with  premium cable',
 'noEnt_split_str': '48" HDTV with  premium cable',
 'noEnt_split_noNum_str': '" HDTV with  premium cable',
 'alnum': 'HDTV with premium cable',
 'low': 'hdtv with premium cable',
 'no_stopwords': 'hdtv premium cable',
 'stem': 'hdtv premium cabl'}

In [None]:
# sanity check
amenities_catalog['a1500']

{'string': 'Fast wifi – 79 Mbps',
 'spacyVector': Fast wifi – 79 Mbps,
 'entities': [],
 'ents_fin': [],
 'noEnt_str': 'Fast wifi – 79 Mbps',
 'noEnt_split_str': 'Fast wifi',
 'noEnt_split_noNum_str': 'Fast wifi',
 'alnum': 'Fast wifi',
 'low': 'fast wifi',
 'no_stopwords': 'fast wifi',
 'stem': 'fast wifi'}

## 9.Try Semantic Similarity

### 9.1.Vectorize Cleaned Amenity Names

In [None]:
for key in amenities_catalog.keys():
  amenities_catalog[key]['word_vectors']=spacy_vectorizer(amenities_catalog[key]['stem'], nlp_lg).vector

### 9.2.Apply KMeans and Validate Clusters

In [None]:
# build matrix
# Stack all row vectors in a single call: growing the matrix with np.vstack
# inside the loop re-copies the whole matrix on every iteration. Stacking a
# list also yields a 2-D array even when the catalog holds a single entry.
word_vector_rows = [amenities_catalog[key]['word_vectors'] for key in amenities_catalog]

# An empty catalog leaves X_spacy as None.
X_spacy = np.vstack(word_vector_rows) if word_vector_rows else None

In [None]:
# sanity check
X_spacy.shape

(3017, 300)

#### Credits
* [Selecting the number of clusters with silhouette analysis on KMeans clustering](https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html)
* [Stop Using Elbow Method in K-means Clustering, Instead, Use this!](https://towardsdatascience.com/elbow-method-is-not-sufficient-to-find-best-k-in-k-means-clustering-fc820da0631d#:~:text=In%20real%2Dworld%20datasets%2C%20you,of%20clusters%20for%20your%20dataset.) 

#### Supplemental Sources
* [How to Determine the Optimal K for K-Means?](https://towardsdatascience.com/elbow-method-is-not-sufficient-to-find-best-k-in-k-means-clustering-fc820da0631)

#### Takeaway
* **All sizes of K for KMeans fail to meet the following requirements. Therefore, converting each amenity into a semantic word vector with spaCy and clustering fails.**
  1. For a particular K, all the clusters should have a Silhouette score more than the average score of the dataset (represented by a red dotted line).
  2. There should not be wide fluctuations in the size of the clusters.

In [None]:
# find_silhouette(X_spacy, 20, 40, 2) -> None

## 10.Alternative: Try String Similarity with Bag of Words (BoW) and TF-IDF

### 10.1.Vectorize Cleaned Amenity Names

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def bow_vectorizer(X, min_n, max_n):
  """Vectorize ``X`` with a ``CountVectorizer`` over n-grams.

  Args:
    X: iterable of strings to vectorize.
    min_n: lower bound of the n-gram range.
    max_n: upper bound of the n-gram range.

  Returns:
    The result of ``CountVectorizer(ngram_range=(min_n, max_n)).fit_transform(X)``.
  """
  vectorizer = CountVectorizer(ngram_range=(min_n, max_n))

  return vectorizer.fit_transform(X)

def tfidf_vectorizer(X, min_n, max_n):
  """Vectorize ``X`` with a ``TfidfVectorizer`` over n-grams.

  Args:
    X: iterable of strings to vectorize.
    min_n: lower bound of the n-gram range.
    max_n: upper bound of the n-gram range.

  Returns:
    The result of ``fit_transform(X)`` on a ``TfidfVectorizer`` created
    with ``use_idf=True`` and ``ngram_range=(min_n, max_n)``.
  """
  vectorizer = TfidfVectorizer(use_idf=True, ngram_range=(min_n, max_n))

  return vectorizer.fit_transform(X)

# strings: the stemmed name of every catalog entry, in catalog order
X_str = [amenities_catalog[key]['stem'] for key in amenities_catalog]


In [None]:
X_bow = bow_vectorizer(X_str, 1, 3)

In [None]:
X_tf = tfidf_vectorizer(X_str, 1, 3)

#### Takeaway
* **All sizes of K for KMeans fail to meet the two requirements listed above. However, the TF-IDF vectorization technique appears to yield better results than the Bag of Words vectorization technique.**

#### Bag of Words (Unigram, Bigram, and Trigram)

In [None]:
# find_silhouette(X_bow.toarray(), 20, 40, 2) # best K -> None

#### TF-IDF

In [None]:
# find_silhouette(X_tf.toarray(), 20, 40, 2) # best K -> 15

## 11.Alternative: Try Cosine Similarity

### 11.1.Find Cosine Similarity Among Vectorized Amenities

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

X_spacy_cos = cosine_similarity(X_spacy)
X_bow_cos = cosine_similarity(X_bow)
X_tf_cos = cosine_similarity(X_tf)

#### Takeaway
* **The Bag of Words Vectorization technique at K=16 appears to outperform the other vectorization techniques.**

In [None]:
# find_silhouette(X_spacy_cos, 20, 40, 2) # best K -> 18

In [None]:
# find_silhouette(X_bow_cos, 20, 40, 2) # best K -> 16

In [None]:
# find_silhouette(X_tf_cos, 20, 40, 2) # best K -> 12

## 12.Alternative: PCA on Winning Vectorization Technique

In [None]:
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD

def dim_red_PCA(X, n_components=0.85):
  """Reduce ``X`` with PCA.

  Args:
    X: the data to transform.
    n_components: forwarded unchanged to ``PCA(n_components=...)``.

  Returns:
    The result of ``fit_transform(X)`` on a ``PCA`` created with
    ``random_state=42``.
  """
  reducer = PCA(n_components=n_components, random_state=42)

  return reducer.fit_transform(X)

def dim_red_truncatedSVD(X, n_components=30):
  """Reduce ``X`` with truncated SVD.

  Args:
    X: the data to transform.
    n_components: forwarded unchanged to ``TruncatedSVD(n_components=...)``.

  Returns:
    The result of ``fit_transform(X)`` on a ``TruncatedSVD`` created with
    ``random_state=42``.
  """
  reducer = TruncatedSVD(n_components=n_components, random_state=42)

  return reducer.fit_transform(X)


In [None]:
X_pca = dim_red_PCA(X_bow_cos)

In [None]:
X_svd = dim_red_truncatedSVD(X_bow_cos, 30)

In [None]:
# find_silhouette(X_pca, 15, 40, 1) # best K -> 33 at 95% variance, 25 at 70% variance, 32 at 85% variance

In [None]:
# find_silhouette(X_svd, 15, 40, 1) # best K -> 29 # WINNING VECTORIZATION TECHNIQUE!!!
find_silhouette(X_svd, 29, 29, 1, True) # best K -> 29 # WINNING VECTORIZATION TECHNIQUE!!!

## 13.Export Winning Results

In [None]:
kmeans = KMeans(n_clusters=29, n_init="auto", random_state=42).fit(X_svd)

# The feature matrix was built by iterating the catalog in key order
# ('a1', 'a2', ...), so the label at position i belongs to key 'a<i+1>'.
for position, label in enumerate(kmeans.labels_, start=1):
  amenities_catalog['a'+str(position)]['cluster'] = label

# one row per catalog entry: id, original name, stemmed name, cluster label
amenities = [
  [key, record['string'], record['stem'], record['cluster']]
  for key, record in amenities_catalog.items()
]

df = pd.DataFrame(data=amenities, columns=['a_id', 'string', 'stem', 'cluster'])
sorted_df = df.sort_values(by=['cluster', 'string'], ascending=[True, True])

In [None]:
sorted_df.head(25)

Unnamed: 0,a_id,string,stem,cluster
28,a29,3 in 1 conditioner,condition,0
755,a756,Alba Botanica conditioner,condition,0
776,a777,Amazon Brand conditioner,condition,0
809,a810,Avalon Organics conditioner,condition,0
812,a813,Aveda conditioner,condition,0
840,a841,Bath & Body Works conditioner,condition,0
843,a844,Bath & Body Works conditioner,condition,0
850,a851,Bath and Body Works conditioner,condition,0
853,a854,Bath and Body Works conditioner,condition,0
860,a861,Beauty & Planet (sulfite free) conditioner,condition,0


In [None]:
# # uncomment to export information if needed
# sorted_df.to_csv(path_or_buf='../data/interim/nlp_bow_trigram_31_svd_30.csv', index=False)

# 14.Get Main Topic per Cluster

In [None]:
# #uncomment to delete key if present
# for key in amenities_catalog.keys():
#   del amenities_catalog[key]['word_vectors']

In [None]:
# Build the a_id -> cluster lookup once instead of running a full boolean
# filter over sorted_df for every catalog key (quadratic in the catalog size).
cluster_by_id = {}
for a_id, cluster_label in zip(sorted_df['a_id'].values, sorted_df['cluster'].values):
  # setdefault keeps the first row for an id, like the original `.values[0]`
  cluster_by_id.setdefault(a_id, cluster_label)

for key in amenities_catalog:
  amenities_catalog[key]['cluster'] = cluster_by_id[key]

In [None]:
amenities_catalog['a841']

{'string': 'Bath & Body Works  conditioner',
 'spacyVector': Bath & Body Works  conditioner,
 'entities': [Bath & Body Works],
 'ents_fin': [Bath & Body Works],
 'noEnt_str': 'conditioner',
 'noEnt_split_str': 'conditioner',
 'noEnt_split_noNum_str': 'conditioner',
 'alnum': 'conditioner',
 'low': 'conditioner',
 'no_stopwords': 'conditioner',
 'stem': 'condition',
 'cluster': 0}

In [None]:
from collections import Counter

# For every cluster, print its stemmed amenity strings from most to least frequent.
for cluster in sorted_df['cluster'].unique():
  member_ids = sorted_df.loc[sorted_df['cluster'] == cluster, 'a_id'].values
  counter = Counter(amenities_catalog[a_id]['stem'] for a_id in member_ids)

  print(f'Cluster: {cluster}')
  for item in counter.most_common():
    print(item)
  print()

Cluster: 0
('condition', 53)
('h condition', 1)
('condition condition', 1)

Cluster: 1
('hdtv', 310)

Cluster: 2
('fast wifi', 289)

Cluster: 3
('refriger', 24)
('kitchen aid refriger', 3)
('standard refriger', 3)
('mini fridg refriger', 2)
('frigidair refriger', 2)
('kenmor refriger', 2)
('whirlpool refriger', 2)
('apart refriger', 1)
('apart size refriger', 1)
('bertazzoni refriger', 1)
('boko refriger', 1)
('bottom mount refriger', 1)
('new refriger refriger', 1)
('dedic guest refriger', 1)
('e refriger', 1)
('filter water avail refriger', 1)
('fridgidair refriger', 1)
('frigidari refriger', 1)
('full size refriger', 1)
('applianc refriger', 1)
('addit drawer refriger space refriger', 1)
('ice water door refriger', 1)
('side side refriger', 1)
('gallanz refriger', 1)
('ge refriger', 1)
('idk refriger', 1)
('jen air refriger', 1)
('ken refriger', 1)
('kenmor elit refriger', 1)
('kitchenaid refriger', 1)
('french door refriger', 1)
('stainless refriger', 1)
('magic chef refriger', 1)


# 15.Build Cluster Data Dictionary

In [None]:
# Label for each first-round cluster id (0-28). Several ids deliberately share
# a label (e.g. 'TV'); ids with the same label are grouped together downstream.
# NOTE(review): labels look hand-assigned from the per-cluster stem counts
# printed in section 14 — confirm, and re-derive whenever the clustering is
# re-run (the commented-out "previous clusters" mapping uses different ids).
clusters = {
    0: 'Conditioner',
    1: 'TV',
    2: 'WiFi',
    3: 'Refrigerator',
    4: 'TV',
    
    5: 'TV',
    6: 'TV',
    7: 'Body Soap',
    8: 'Shampoo',
    9: 'TV',

    10: 'Sound System',
    11: 'Stove',
    12: 'Pool',
    13: 'Miscellaneous',  # catch-all cluster
    14: 'Body Soap',

    15: 'TV',
    16: 'TV',
    17: 'Coffee Maker',
    18: 'Clothing Storage',
    19: 'Conditioner',

    20: 'TV',
    21: 'Oven',
    22: 'WiFi',
    23: 'Shampoo',
    24: 'TV',
    25: 'TV',

    26:'Stove',
    27:'TV',
    28:'Exercise Equipment',
}


# previous clusters
# clusters = {
#     0: 'Conditioner',
#     1: 'TV',
#     2: 'Wi-Fi',
#     3: 'Backyard or Swimming Pool',
#     4: 'TV',
    
#     5: 'TV',
#     6: 'Shampoo',
#     7: 'Body Soap',
#     8: 'TV',
#     9: 'Swimming Pool',

#     10: 'TV',
#     11: 'Sound System',
#     12: 'Stove',
#     13: 'TV',
#     14: 'Clothing Storage',

#     15: 'Shampoo',
#     16: 'TV',
#     17: 'Conditioner',
#     18: 'Wi-Fi',
#     19: 'TV',

#     20: 'TV',
#     21: 'Refrigerator',
#     22: 'Oven',
#     23: 'Coffee Maker',
#     24: 'TV',
#     25: 'Stove',

#     26: 'TV',
#     27: 'Body Soap',
#     28: 'Free Parking',
#     29: 'Miscellaneous',
#     30: 'Exercise Equipment',

# }

In [None]:
# Invert `clusters`: label -> list of cluster ids carrying that label,
# in the order the ids appear in `clusters`.
grouped_clusters = {}

for cluster_id, label in clusters.items():
  grouped_clusters.setdefault(label, []).append(cluster_id)

In [None]:
grouped_clusters

{'Conditioner': [0, 19],
 'TV': [1, 4, 5, 6, 9, 15, 16, 20, 24, 25, 27],
 'WiFi': [2, 22],
 'Refrigerator': [3],
 'Body Soap': [7, 14],
 'Shampoo': [8, 23],
 'Sound System': [10],
 'Stove': [11, 26],
 'Pool': [12],
 'Miscellaneous': [13],
 'Coffee Maker': [17],
 'Clothing Storage': [18],
 'Oven': [21],
 'Exercise Equipment': [28]}

In [None]:
# label -> array of a_ids whose first-round cluster id belongs to that label
amenity_cluster_mapper = {
  label: sorted_df.loc[sorted_df['cluster'].isin(cluster_ids), 'a_id'].values
  for label, cluster_ids in grouped_clusters.items()
}

In [None]:
# a_id -> label: flattens amenity_cluster_mapper for direct per-amenity lookup
amenity_cluster_mapper_reverse = {
  a_id: label
  for label, a_ids in amenity_cluster_mapper.items()
  for a_id in a_ids
}

In [None]:
# sanity check
amenity_cluster_mapper['Exercise Equipment']

array(['a1203', 'a1204', 'a1205', 'a1206', 'a1207', 'a1208', 'a1209',
       'a1210', 'a1211', 'a1212', 'a1213', 'a1214', 'a1215', 'a1216',
       'a1217', 'a1218', 'a1219', 'a1220', 'a1221', 'a1222', 'a1223',
       'a1224', 'a1225', 'a1226', 'a1227', 'a1228', 'a1229', 'a1230'],
      dtype=object)

In [None]:
# sanity check
amenity_cluster_mapper_reverse['a1203']

'Exercise Equipment'

In [None]:
# sanity check
amenities_catalog['a1203']

{'string': 'Exercise equipment',
 'spacyVector': Exercise equipment,
 'entities': [],
 'ents_fin': [],
 'noEnt_str': 'Exercise equipment',
 'noEnt_split_str': 'Exercise equipment',
 'noEnt_split_noNum_str': 'Exercise equipment',
 'alnum': 'Exercise equipment',
 'low': 'exercise equipment',
 'no_stopwords': 'exercise equipment',
 'stem': 'exercis equip',
 'cluster': 28}

# 16.Map Generalized Clusters to AirBnB Listings Dataset

In [None]:
# Select the two needed columns first, then copy. This avoids deep-copying every
# listing column only to discard most of them, and the result is an independent
# frame, so adding columns to it later cannot trigger SettingWithCopyWarning.
det_list_copy_df = det_list_df[['id', 'amenities']].copy(deep=True)

In [None]:
det_list_copy_df.head()

Unnamed: 0,id,amenities
0,6422,"[""Coffee maker: pour-over coffee"", ""First aid ..."
1,59576,"[""Books and reading material"", ""First aid kit""..."
2,72906,"[""First aid kit"", ""Free street parking"", ""Fire..."
3,319705,"[""Oven"", ""Free street parking"", ""Free parking ..."
4,289242,"[""Oven"", ""Free street parking"", ""Free parking ..."


In [None]:
det_list_copy_df.shape

(8127, 2)

In [None]:
# decode: parse each JSON-array string in 'amenities' into a Python list
det_list_copy_df['amenities_decoded'] = det_list_copy_df['amenities'].apply(json.loads)

In [None]:
# explode: one row per (listing, amenity). The original row index is repeated
# for every amenity of a listing, and an empty amenity list becomes one NaN row.
det_list_copy_df = det_list_copy_df.explode('amenities_decoded')

In [None]:
det_list_copy_df.head()

Unnamed: 0,id,amenities,amenities_decoded
0,6422,"[""Coffee maker: pour-over coffee"", ""First aid ...",Coffee maker: pour-over coffee
0,6422,"[""Coffee maker: pour-over coffee"", ""First aid ...",First aid kit
0,6422,"[""Coffee maker: pour-over coffee"", ""First aid ...",Free street parking
0,6422,"[""Coffee maker: pour-over coffee"", ""First aid ...",Fire extinguisher
0,6422,"[""Coffee maker: pour-over coffee"", ""First aid ...",Wifi


In [None]:
det_list_copy_df.shape

(308094, 3)

## 16.1 Map Amenity Name in the AirBnB Listings Dataset to Amenity ID


In [None]:
# Inverse of the catalog: raw amenity string -> a_id.
# If two catalog entries share the same string, the later a_id wins.
amenities_catalog_reverse = {
  entry['string']: a_id for a_id, entry in amenities_catalog.items()
}

In [None]:
amenities_catalog_reverse['EV charger']

'a1166'

In [None]:
len(amenities_catalog_reverse)

3017

In [None]:
# map
def a_id_mapper(row):
  """Return the catalog a_id for the row's decoded amenity string.

  Args:
    row: mapping-like row (e.g. a DataFrame row) with an 'amenities_decoded' entry.

  Returns:
    The matching a_id from amenities_catalog_reverse, or np.nan when the value
    is not a catalog string (this includes the NaN left by an empty amenity list).
  """
  try:
    return amenities_catalog_reverse[row['amenities_decoded']]
  except (KeyError, TypeError):
    # KeyError: value not in the catalog; TypeError: unhashable value.
    # Anything else is a real error and is allowed to propagate.
    return np.nan

det_list_copy_df['a_id'] = det_list_copy_df.apply(a_id_mapper, axis=1)

In [None]:
det_list_copy_df.head()

Unnamed: 0,id,amenities,amenities_decoded,a_id
0,6422,"[""Coffee maker: pour-over coffee"", ""First aid ...",Coffee maker: pour-over coffee,a1080
0,6422,"[""Coffee maker: pour-over coffee"", ""First aid ...",First aid kit,a1527
0,6422,"[""Coffee maker: pour-over coffee"", ""First aid ...",Free street parking,a1575
0,6422,"[""Coffee maker: pour-over coffee"", ""First aid ...",Fire extinguisher,a1524
0,6422,"[""Coffee maker: pour-over coffee"", ""First aid ...",Wifi,a2960


In [None]:
det_list_copy_df[det_list_copy_df.isnull().any(axis=1)]

Unnamed: 0,id,amenities,amenities_decoded,a_id
1520,23145089,[],,


## 16.2 Map Amenity ID to Generalized Cluster

In [None]:
def a_id_cluster_mappers(row):
  """Return the generalized cluster label for the row's a_id.

  Args:
    row: mapping-like row (e.g. a DataFrame row) with an 'a_id' entry.

  Returns:
    The label from amenity_cluster_mapper_reverse, or np.nan when the a_id has
    no label (this includes rows whose a_id is itself NaN).
  """
  try:
    return amenity_cluster_mapper_reverse[row['a_id']]
  except (KeyError, TypeError):
    # KeyError: a_id without a label; TypeError: unhashable value.
    # Anything else is a real error and is allowed to propagate.
    return np.nan

det_list_copy_df['amenity'] = det_list_copy_df.apply(a_id_cluster_mappers, axis=1)

In [None]:
det_list_copy_df.head()

Unnamed: 0,id,amenities,amenities_decoded,a_id,amenity
0,6422,"[""Coffee maker: pour-over coffee"", ""First aid ...",Coffee maker: pour-over coffee,a1080,Coffee Maker
0,6422,"[""Coffee maker: pour-over coffee"", ""First aid ...",First aid kit,a1527,Miscellaneous
0,6422,"[""Coffee maker: pour-over coffee"", ""First aid ...",Free street parking,a1575,Miscellaneous
0,6422,"[""Coffee maker: pour-over coffee"", ""First aid ...",Fire extinguisher,a1524,Miscellaneous
0,6422,"[""Coffee maker: pour-over coffee"", ""First aid ...",Wifi,a2960,WiFi


## 16.3 Break Out Miscellaneous (Catch-all Cluster)

### 16.3.1 Apply Affinity Propagation for Further Clustering and Get Counts of Miscellaneous Amenity Types

In [None]:
# Select by the cluster ids labelled 'Miscellaneous' rather than a hard-coded id,
# so the filter stays correct if the `clusters` mapping is renumbered.
misc_df = sorted_df[sorted_df['cluster'].isin(grouped_clusters['Miscellaneous'])]

In [None]:
misc_df.head()

Unnamed: 0,a_id,string,stem,cluster
7,a8,1/2 size in kitchenette with separate freezer ...,size kitchenett separ freezer mini fridg one b...,13
12,a13,2 burner cooktop oven,burner cooktop oven,13
20,a21,"20"" compact smoothtop electric stove oven",compactstov oven,13
172,a173,"42"" TV with Roku, DVD player",tv dvd player,13
745,a746,A Variety conditioner,acondition,13


In [None]:
## Train new vectorizer focused on miscellaneous cluster
# Same three-step pipeline, applied to the miscellaneous stems only (the helpers
# are not shown in this section):
#   1. bag-of-words features — the (1, 3) arguments are presumably the n-gram range; TODO confirm
#   2. pairwise cosine similarity between the vectorized amenities
#   3. dimensionality reduction — 30 is presumably the number of SVD components; TODO confirm
X_bow = bow_vectorizer(misc_df['stem'], 1, 3)
X_bow_cos = cosine_similarity(X_bow)
X_svd = dim_red_truncatedSVD(X_bow_cos, 30)

In [None]:
from sklearn.cluster import AffinityPropagation

# No cluster count is passed: affinity propagation determines it from the data.
# random_state is fixed so reruns give the same labels.
# NOTE(review): the rationale for damping=.65 is not recorded here — confirm.
clustering = AffinityPropagation(random_state=42, damping=.65)
clustering.fit(X_svd)

In [None]:
# Second-round labels as a one-column frame with a default 0..n-1 index.
# Note: this rebinds `df`, which earlier held the amenities table (sorted_df is unaffected).
df = pd.DataFrame(clustering.labels_, columns=['new_label'])

In [None]:
# reset_index() gives misc_df a 0..n-1 index (the old index is kept as the 'index'
# column) so the labels in `df` line up row-for-row in the join; sorting then
# places members of the same new cluster next to each other.
misc_ap_clustered_df = misc_df.reset_index().join(df).sort_values('new_label')

In [None]:
misc_ap_clustered_df.head()

Unnamed: 0,index,a_id,string,stem,cluster,new_label
17,823,a824,BBQ grill: wood-burning,bbq grill,13,0
16,822,a823,BBQ grill: gas,bbq grill,13,0
15,821,a822,BBQ grill: electric,bbq grill,13,0
14,820,a821,"BBQ grill: charcoal, wood-burning",bbq grill,13,0
13,819,a820,"BBQ grill: charcoal, gas",bbq grill,13,0


In [None]:
# Number of occurrences of every decoded amenity string across all listings.
# rename_axis + reset_index(name=...) name both columns explicitly, so this does
# not depend on the default column names of value_counts().reset_index(), which
# changed in pandas 2.0 (the old code looked up an 'index' column that no longer exists).
counts_df = (
  det_list_copy_df['amenities_decoded']
  .value_counts(dropna=False)
  .rename_axis('string')
  .reset_index(name='count')
)
# Keep only the amenities that belong to the miscellaneous cluster.
counts_filtered_df = counts_df[counts_df['string'].isin(misc_ap_clustered_df['string'].values)]

In [None]:
# Attach the listing-level occurrence count to every miscellaneous amenity.
# A left merge keeps amenities that have no row in counts_filtered_df (count is then NaN).
misc_counts_ap_df = misc_ap_clustered_df.merge(counts_filtered_df, how='left', on='string')

In [None]:
misc_counts_ap_df.head()

Unnamed: 0,index,a_id,string,stem,cluster,new_label,count
0,823,a824,BBQ grill: wood-burning,bbq grill,13,0,4
1,822,a823,BBQ grill: gas,bbq grill,13,0,236
2,821,a822,BBQ grill: electric,bbq grill,13,0,10
3,820,a821,"BBQ grill: charcoal, wood-burning",bbq grill,13,0,1
4,819,a820,"BBQ grill: charcoal, gas",bbq grill,13,0,11


In [None]:
# export and manually label newly formed clusters

# personal directory
# misc_counts_ap_df.to_csv(path_or_buf='./misc_counts_ap_df.csv', index=False)

# GitHub directory
misc_counts_ap_df.to_csv(path_or_buf='../data/interim/misc_counts_ap_df.csv', index=False)

### 16.3.2. Manually Discard Remaining Entities and Apply Labels to Clusters

In [None]:
# import manually labeled and newly formed clusters

# personal directory
# misc_clusters_labeled_df = pd.read_csv('./misc_counts_ap_labeled_df.csv')

# GitHub directory
misc_clusters_labeled_df = pd.read_csv('../data/interim/misc_counts_ap_labeled_df.csv')

In [None]:
# Keep only the rows flagged keep == 1 in the manually labeled CSV; every other row is discarded.
misc_clusters_labeled_filtered_df = misc_clusters_labeled_df[misc_clusters_labeled_df['keep']==1]

In [None]:
misc_clusters_labeled_filtered_df.head()

Unnamed: 0,index,a_id,string,stem,cluster,new_label,count,keep,new_cluster
0,823,a824,BBQ grill: wood-burning,bbq grill,13,0,4,1.0,BBQ
1,822,a823,BBQ grill: gas,bbq grill,13,0,236,1.0,BBQ
2,821,a822,BBQ grill: electric,bbq grill,13,0,10,1.0,BBQ
3,820,a821,"BBQ grill: charcoal, wood-burning",bbq grill,13,0,1,1.0,BBQ
4,819,a820,"BBQ grill: charcoal, gas",bbq grill,13,0,11,1.0,BBQ


In [None]:
# Two-column frame: each new_cluster label with the list of a_ids assigned to it.
grouped_misc = misc_clusters_labeled_filtered_df.groupby('new_cluster')['a_id'].apply(list).reset_index()

In [None]:
# new_cluster label -> list of a_ids. Pairing the two columns directly avoids
# building a Series per row with iterrows().
misc_dict = dict(zip(grouped_misc['new_cluster'], grouped_misc['a_id']))

In [None]:
# a_id -> new_cluster label: flattens misc_dict for direct per-amenity lookup
misc_dict_reverse = {
  a_id: label
  for label, a_ids in misc_dict.items()
  for a_id in a_ids
}

In [None]:
# sanity check
misc_dict['BBQ']

['a824', 'a823', 'a822', 'a821', 'a820', 'a819', 'a818']

In [None]:
# sanity check
misc_dict_reverse['a820']

'BBQ'

In [None]:
# replace values in main df
def misc_mapper(row, dict_rev):
  """Return the refined label for the row's a_id, else its current label.

  Args:
    row: mapping-like row (e.g. a DataFrame row) with 'a_id' and 'amenity' entries.
    dict_rev: dict mapping a_id -> refined cluster label.

  Returns:
    dict_rev[row['a_id']] when the a_id is present; otherwise row['amenity']
    unchanged.
  """
  try:
    return dict_rev[row['a_id']]
  except (KeyError, TypeError):
    # KeyError: a_id has no refined label; TypeError: unhashable a_id.
    # Anything else is a real error and is allowed to propagate.
    return row['amenity'] # Miscellaneous will be a placeholder to discard if remaining later

# Overwrite each row's label with its refined miscellaneous label where one exists.
det_list_copy_df['amenity'] = det_list_copy_df.apply(misc_mapper, axis=1, args=(misc_dict_reverse,))

In [None]:
det_list_copy_df.head()

Unnamed: 0,id,amenities,amenities_decoded,a_id,amenity
0,6422,"[""Coffee maker: pour-over coffee"", ""First aid ...",Coffee maker: pour-over coffee,a1080,Coffee Maker
0,6422,"[""Coffee maker: pour-over coffee"", ""First aid ...",First aid kit,a1527,First Aid Kit
0,6422,"[""Coffee maker: pour-over coffee"", ""First aid ...",Free street parking,a1575,Free Parking
0,6422,"[""Coffee maker: pour-over coffee"", ""First aid ...",Fire extinguisher,a1524,Fire Extinguisher
0,6422,"[""Coffee maker: pour-over coffee"", ""First aid ...",Wifi,a2960,WiFi


In [None]:
det_list_copy_df.shape

(308094, 5)

In [None]:
det_list_copy_df[det_list_copy_df['amenity']=='BBQ'].sample()

Unnamed: 0,id,amenities,amenities_decoded,a_id,amenity
4301,50728773,"[""First aid kit"", ""Oven"", ""Shared outdoor pool...",BBQ grill,a818,BBQ


In [None]:
# drop miscellaneous rows
# NOTE: rows whose 'amenity' is NaN are kept, because NaN != 'Miscellaneous' evaluates to True.
det_list_final_df = det_list_copy_df[det_list_copy_df['amenity']!='Miscellaneous']

In [None]:
det_list_final_df.head()

Unnamed: 0,id,amenities,amenities_decoded,a_id,amenity
0,6422,"[""Coffee maker: pour-over coffee"", ""First aid ...",Coffee maker: pour-over coffee,a1080,Coffee Maker
0,6422,"[""Coffee maker: pour-over coffee"", ""First aid ...",First aid kit,a1527,First Aid Kit
0,6422,"[""Coffee maker: pour-over coffee"", ""First aid ...",Free street parking,a1575,Free Parking
0,6422,"[""Coffee maker: pour-over coffee"", ""First aid ...",Fire extinguisher,a1524,Fire Extinguisher
0,6422,"[""Coffee maker: pour-over coffee"", ""First aid ...",Wifi,a2960,WiFi


In [None]:
det_list_final_df.shape

(281823, 5)

# 17.OHE Amenities

In [None]:
final_df_copy = det_list_final_df.copy(deep=True)

In [None]:
# Amenity labels ordered from most to least frequent; these become the one-hot columns.
# value_counts() drops NaN by default, so NaN never becomes a column.
columns = det_list_final_df['amenity'].value_counts().index.values
columns

array(['Free Parking', 'Coffee Maker', 'Patio or Balcony', 'WiFi',
       'Smoke Alarm', 'Air Conditioning', 'TV', 'Heating', 'Essentials',
       'Kitchen', 'Hair Dryer', 'Long Term Stays Allowed', 'Iron',
       'Hangers', 'Shampoo', 'Washer', 'Refrigerator', 'Diningware',
       'Hot Water', 'Microwave', 'Fire Extinguisher', 'Dryer',
       'Carbon Monoxide Alarm', 'Bed Linens', 'Self Check-in', 'Oven',
       'Cooking Basics', 'Stove', 'Dishwasher', 'Private Entrance',
       'First Aid Kit', 'Extra Pillows and Blankets',
       'Dedicated Workspace', 'Surveillance', 'Freezer', 'Body Soap',
       'Backyard', 'Clothing Storage', 'Wine Glasses', 'Conditioner',
       'Toaster', 'Cleaning Products', 'Dining Table', 'Keypad', 'BBQ',
       'Shades', 'Luggage Dropoff Allowed', 'Smart Lock', 'Pets Allowed',
       'Baby Crib', 'Pool', 'Gym', 'Blender', 'Fire Pit', 'Elevator',
       'Lockbox', 'Fireplace', 'Paid Parking', 'Laundromat Nearby',
       'Exercise Equipment', 'City Skyline V

In [None]:
# One row per listing id with the list of its amenity labels (repeated labels are kept).
grouped_final_df_copy = final_df_copy.groupby('id')['amenity'].apply(list).reset_index()

In [None]:
# One 0/1 row per listing, in the row order of grouped_final_df_copy, with one
# flag per amenity label in `columns`.
OHE_listings = []

# Iterate the per-listing amenity lists directly: re-filtering the frame by id
# for every listing made this loop quadratic in the number of listings.
for amenities in grouped_final_df_copy['amenity']:
  present = set(amenities)  # constant-time membership; repeated labels collapse
  OHE_listings.append([1 if column in present else 0 for column in columns])


In [None]:
# Both frames get a default 0..n-1 index and follow the row order of
# grouped_final_df_copy, so row i of one describes the same listing as row i of the other.
listing_ids_df = pd.DataFrame(grouped_final_df_copy['id'].values, columns=['id'])
OHE_listings_df = pd.DataFrame(OHE_listings, columns=columns)

In [None]:
# Index-aligned join: puts the listing id in front of its one-hot amenity flags.
final_df = listing_ids_df.join(OHE_listings_df)

In [None]:
final_df.shape

(8127, 82)

In [None]:
sample = final_df.sample()
sample_index = sample.index

In [None]:
sample

Unnamed: 0,id,Free Parking,Coffee Maker,Patio or Balcony,WiFi,Smoke Alarm,Air Conditioning,TV,Heating,Essentials,...,Park View,Resort View,Mountain View,Valley View,Lake View,Golf Course View,Marina View,Canal View,Bay View,Vineyard View
3868,49566982,1,1,0,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [None]:
grouped_final_df_copy.iloc[sample_index]['amenity'].values[0]

['Oven',
 'Free Parking',
 'Free Parking',
 'Fire Extinguisher',
 'WiFi',
 'Conditioner',
 'Hair Dryer',
 'Body Soap',
 'Air Conditioning',
 'Shampoo',
 'Cooking Basics',
 'Toaster',
 'Private Entrance',
 'Washer',
 'Microwave',
 'Dishwasher',
 'Long Term Stays Allowed',
 'Extra Pillows and Blankets',
 'Heating',
 'Carbon Monoxide Alarm',
 'Hangers',
 'Shades',
 'Clothing Storage',
 'Fireplace',
 'Essentials',
 'Wine Glasses',
 'Dryer',
 'Freezer',
 'Diningware',
 'Iron',
 'Self Check-in',
 'Dedicated Workspace',
 'Coffee Maker',
 'Bed Linens',
 'TV',
 'Stove',
 'Smoke Alarm',
 'City Skyline View',
 'Kitchen',
 'Smart Lock']

In [None]:
# personal directory
# final_df.to_csv(path_or_buf='./OHE_amenities_nashville.csv', index=False)

# GitHub directory
final_df.to_csv(path_or_buf='../data/processed/OHE_amenities_nashville.csv', index=False)

In [None]:
# Final results -> 28 homogeneous clusters after round 1 of clustering, 53 homogeneous clusters after round 2 of clustering, 81 total clusters