# Use flashtext to parse rich labels for a list of features per listing.

In [1]:
import pandas as pd
from flashtext.keyword import KeywordProcessor
from nltk.corpus import stopwords

df = pd.read_csv('../data/rich_labels_file.csv')

In [2]:
df.head()

Unnamed: 0,id,Bedroom Info,Bathroom Info,Rooms Info,Interior Info,Exterior Info,Property Info,Laundry Info,Fireplace Info,Flooring Info,Kitchen+Dining Info,Living Room Info
0,0,# of Bedrooms (Above Grade): 3 # of Bedrooms ...,# of Baths (Full): 3 # of Baths (1/2): 1 # o...,Main Level Laundry # of Rooms (Above-Ground): ...,# of Fireplaces: 1 Living Has Fireplace Lower...,"Deck(s) Exterior Material: Brick, Vinyl Roofi...","Additional Items: Ceiling Fans, Humidifier, W...",,,,,
1,2,# of Beds (Main Level): 3 Has Master on Main ...,,# of Dining Rooms: 1 # of Living Rooms: 1 Di...,"Floors: Tile - Hard, Wood Vaulted Ceiling, In...","Patio-Covered, Private BackYard, Sidewalk Fenced",Extraterritorial Jurisdiction: Has Extraterri...,Utility/Laundry Room,# of Fireplaces: 1 Fireplace Features: In Fa...,,,
2,4,,# of Full Bathrooms: 2,"Room Count: 7 Additional Rooms: Attic, Flori...","Appliances: Dishwasher, Disposal, Dryer, Elec...","Exterior Features: French Doors, Rain Gutters...","Living Area: 1,820 Living Area Units: Square...",,,"Flooring: Engineered Hardwood, Tile",,
3,5,# of Bedrooms On 2nd Upper Level: 2 # of Bedr...,# of Bathrooms (Full) On 2nd Upper Level: 2 #...,"Living Room, Kitchen, Den, Bedroom 1, Study La...","Not Furnished Interior Features: Elevator, Wo...",,Not Federal Flood Zone Total Below Grade Sq. F...,,Has Fireplace # of Fireplaces: 1 Gas/Propane,,,
4,7,# of Bedrooms On 1st Upper Level: 4 # of Bedr...,# of Bathrooms (Full) On 1st Upper Level: 2 #...,"Dining Room, Master Bedroom, Bedroom 2, Bedroo...","Appliances: Intercom, Built-In Range, Dishwas...",Hot Tub Fencing: Invisible,Not Federal Flood Zone Total Below Grade Sq. F...,,# of Fireplaces: 1 Wood,,,


In [3]:
df.shape

(16419, 12)

In [4]:
relevant_features = ['fireplace', 
                    'pool',
                    'granite',
                    'marble',
                    'quartz',
                    'laminate',
                    'carpet',
                    'wood',
                    'washer',
                    'dryer',
                    'vaulted',
                    'high ceiling',
                    'patio',
                     'deck',
                     'island',
                     'office',
                     'bar',
                     'walk',
                     'pantry',
                     'ceiling fan',
                     'sky',
                     'hot tub',
                     'jacuzzi',
                     'jets',
                     'spa',
                     'sauna'
                    ]

In [5]:
list_of_packages = relevant_features
keyword_processor = KeywordProcessor()

keyword_processor.add_keywords_from_list(list_of_packages)

In [6]:
df.columns

Index(['id', 'Bedroom Info', 'Bathroom Info', 'Rooms Info', 'Interior Info',
       'Exterior Info', 'Property Info', 'Laundry Info', 'Fireplace Info',
       'Flooring Info', 'Kitchen+Dining Info', 'Living Room Info'],
      dtype='object')

In [7]:
cols = df.columns.tolist()[1:]
df = df.fillna('')
rows = {}
for x in df.iterrows():
    new = {}
    for col in cols:
        keywords_found = keyword_processor.extract_keywords(x[1][col])
        new[col] = keywords_found
    rows[x[1]['id']] = new
    
new_df = pd.DataFrame(rows)

In [8]:
new_df = new_df.T

In [9]:
new_df.columns = cols

In [10]:
# new_df.to_pickle('../data/labels_room_type.pkl')
new_df.to_csv('../data/labels_room_type.csv')

In [11]:
new_df.head()

Unnamed: 0,Bedroom Info,Bathroom Info,Rooms Info,Interior Info,Exterior Info,Property Info,Laundry Info,Fireplace Info,Flooring Info,Kitchen+Dining Info,Living Room Info
0,[],[],[deck],[],[],"[fireplace, dryer, washer]",[],[],[],[bar],"[pantry, bar]"
2,[],"[walk, walk]",[patio],[fireplace],[],"[wood, vaulted, walk]",[],[],[],[],"[granite, marble, walk, pantry]"
4,[],[],"[patio, patio]",[],[],"[dryer, washer, wood, walk]",[],[],[],[],[]
5,[],[],[],[fireplace],[],[wood],[],[],[],[],[]
7,[],[],[hot tub],[wood],[],"[island, spa]",[],[],[],[],[]


In [12]:
new_df.shape

(16419, 11)

### Download CSV
https://drive.google.com/open?id=1sRc3h-UlNau-rI8XC8gt9U-p-Ff4_gI3