In [106]:
import pandas as pd 
import os
import re

from nltk.stem import PorterStemmer
from matplotlib import pyplot as plt
from collections import defaultdict, Counter

In [2]:
BASE_PATH = os.path.join(os.getcwd(), os.pardir)
DATA_PATH = os.path.join(BASE_PATH, 'data')

In [3]:
train_df = pd.read_json(os.path.join(DATA_PATH, 'train.json'))

# Extract text/numerical/categorical features

Let's extract out hte number of bathrooms and bedrooms, the building ID, the latitude, the longitude, the manager ID and the price since these are trivial to extract

In [4]:
bathrooms = train_df['bathrooms']
bedrooms = train_df['bedrooms']
building_ids = train_df['building_id']
latitudes = train_df['latitude']
longitudes = train_df['longitude']
manager_ids = train_df['manager_id']
prices = train_df['price']

Now we can extract the month, day, hour and weekday from the creation timestamp

In [5]:
datetime = pd.to_datetime(train_df['created'])

In [6]:
months = datetime.dt.month
days = datetime.dt.day
hours = datetime.dt.hour

In [7]:
# Where Monday = 0, and Sunday = 6
weekdays = datetime.dt.dayofweek

Now we can extract the number of pictures for each listing

In [8]:
num_photos = train_df['photos'].str.len()

Now let's extract the 'features' column, which is a harder task

Methodology of: https://www.kaggle.com/jxnlco/deduplicating-features

In [9]:
features = train_df['features'].apply(lambda x: [i.lower() for i in x])

We eliminate all features that appear less than $n$ times

In [10]:
n = 5

feature_counts = Counter()
for feature in features.tolist():
    feature_counts.update(feature)
feature = sorted([k for (k,v) in feature_counts.items() if v > n])
feature[:10]

['24/7 concierge',
 '24/7 doorman',
 '24hr doorman',
 'a full service luxury highrise',
 'actual apt. photos',
 'air conditioning',
 'all utilities included',
 'assigned-parking-space',
 'attended lobby',
 'backyard']

Since we have a lot of duplicates, we can clean things up to eliminate them. We use the first 4 characters of a feature in order to group together different features.

In [11]:
def clean(s):
    x = s.replace("-", "")
    x = x.replace(" ", "")
    x = x.replace("twenty four hour", "24")
    x = x.replace("24/7", "24")
    x = x.replace("24hr", "24")
    x = x.replace("24-hour", "24")
    x = x.replace("24hour", "24")
    x = x.replace("24 hour", "24")
    x = x.replace("common", "cm")
    x = x.replace("concierge", "doorman")
    x = x.replace("bicycle", "bike")
    x = x.replace("private", "pv")
    x = x.replace("deco", "dc")
    x = x.replace("decorative", "dc")
    x = x.replace("onsite", "os")
    x = x.replace("outdoor", "od")
    x = x.replace("ss appliances", "stainless")
    return x

def feature_hash(x):
    cleaned = clean(x, uniq)
    key = cleaned[:4].strip()
    return key

In [12]:
key2original = defaultdict(list)
k = 4
for f in feature:
    cleaned = clean(f)
    key = cleaned[:k].strip()
    key2original[key].append(f)

In [18]:
key2original

defaultdict(list,
            {'24do': ['24/7 concierge', '24/7 doorman', '24hr doorman'],
             'aful': ['a full service luxury highrise'],
             'actu': ['actual apt. photos'],
             'airc': ['air conditioning'],
             'allu': ['all utilities included'],
             'assi': ['assigned-parking-space'],
             'atte': ['attended lobby'],
             'back': ['backyard'],
             'balc': ['balcony'],
             'base': ['basement storage'],
             'bike': ['bike room', 'bike storage'],
             'bill': ['billiards room'],
             'brea': ['breakfast bar'],
             'brow': ['brownstone'],
             'buil': ['building-common-outdoor-space'],
             'busi': ['business center'],
             'cabl': ['cable/satellite tv'],
             'cats': ['cats allowed'],
             'cent': ['central a/c', 'central ac', 'central air'],
             'chef': ['chefs kitchen'],
             'chil': ["children's playroom", 'children

In [56]:
columns = list(key2original.keys())

In [23]:
original2key = {}
for col in columns:
    for original in key2original[col]:
        original2key[original] = col

In [24]:
original2key

{'24/7 concierge': '24do',
 '24/7 doorman': '24do',
 '24hr doorman': '24do',
 'a full service luxury highrise': 'aful',
 'actual apt. photos': 'actu',
 'air conditioning': 'airc',
 'all utilities included': 'allu',
 'assigned-parking-space': 'assi',
 'attended lobby': 'atte',
 'backyard': 'back',
 'balcony': 'balc',
 'basement storage': 'base',
 'bike room': 'bike',
 'bike storage': 'bike',
 'billiards room': 'bill',
 'breakfast bar': 'brea',
 'brownstone': 'brow',
 'building-common-outdoor-space': 'buil',
 'business center': 'busi',
 'cable/satellite tv': 'cabl',
 'cats allowed': 'cats',
 'central a/c': 'cent',
 'central ac': 'cent',
 'central air': 'cent',
 'chefs kitchen': 'chef',
 "children's playroom": 'chil',
 'childrens playroom': 'chil',
 'cinema room': 'cine',
 'city view': 'city',
 'close to subway': 'clos',
 'closets galore!': 'clos',
 'common backyard': 'cmba',
 'common garden': 'cmga',
 'common outdoor space': 'cmod',
 'common parking/garage': 'cmpa',
 'common roof deck': 

In [100]:
all_listing_features = {}

for index,row in train_df.iterrows():
    listing_features = {}
    features_found = []
    for feature in row['features']:
        feature = feature.lower()
        if feature in original2key:
            features_found.append(original2key[feature])
    for feature in columns:
            if feature not in features_found:
                listing_features[feature] = 0
            else:
                listing_features[feature] = 1
    all_listing_features[row['listing_id']] = listing_features
    
one_hot_features = pd.DataFrame.from_dict(all_listing_features, orient='index')

In [105]:
one_hot_features

Unnamed: 0,24do,aful,actu,airc,allu,assi,atte,back,balc,base,...,tons,vale,vide,view,virt,walk,wash,whee,wifi,wood
7170325,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7092344,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7158677,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7211212,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7225292,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6824800,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6813268,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6927093,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
6892816,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Now we can try extracting features from the "Description" column

Methodology of: https://www.kaggle.com/ug2409/using-description-as-the-only-feature-72-cv

In [133]:
descriptions = train_df[['description']]

In [134]:
# Removes symbols, numbers and stem the words to reduce dimentional space
stemmer = PorterStemmer()

def clean(x):
    regex = re.compile('[^a-zA-Z ]')
    # For user clarity, broken it into three steps
    i = regex.sub(' ', x).lower()
    i = i.split(" ") 
    i= [stemmer.stem(l) for l in i]
    i= " ".join([l.strip() for l in i if (len(l)>2) ]) # Keeping words that have length greater than 2
    return i

In [135]:
descriptions['description_new'] = descriptions.description.apply(lambda x: clean(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [136]:
descriptions.head()

Unnamed: 0,description,description_new
4,Spacious 1 Bedroom 1 Bathroom in Williamsburg!...,spaciou bedroom bathroom williamsburg apart fe...
6,BRAND NEW GUT RENOVATED TRUE 2 BEDROOMFind you...,brand new gut renov true bedroomfind yourself ...
9,**FLEX 2 BEDROOM WITH FULL PRESSURIZED WALL**L...,flex bedroom with full pressur wall look for t...
10,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,brand new bedroom bath apartmentenjoy these fo...
15,Over-sized Studio w abundant closets. Availabl...,over size studio abund closet avail immedi ren...


In [137]:
from sklearn.feature_extraction.text import CountVectorizer #Can use tfidffvectorizer as well

cvect_desc = CountVectorizer(stop_words='english', max_features=200)
full_sparse = cvect_desc.fit_transform(descriptions.description_new)

# Renaming words to avoid collisions with other feature names in the model
col_desc = ['desc_'+ i for i in cvect_desc.get_feature_names()] 
count_vect_df = pd.DataFrame(full_sparse.todense(), columns=col_desc)
descriptions = pd.concat([descriptions.reset_index(), count_vect_df],axis=1)

In [138]:
descriptions = descriptions.drop(labels=['description', 'index', 'description_new'], axis=1)

Unnamed: 0,desc_access,desc_allow,desc_amaz,desc_amen,desc_amp,desc_ani,desc_apart,desc_applianc,desc_appoint,desc_area,...,desc_walk,desc_wall,desc_washer,desc_water,desc_websit,desc_west,desc_white,desc_window,desc_wood,desc_york
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,1,1,0,0,0,0,0
1,0,0,0,0,0,0,2,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,5,1,0,0,...,0,1,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,2,1,0,1,...,0,0,1,0,1,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [139]:
descriptions.index = train_df['listing_id']
descriptions.head()

Unnamed: 0_level_0,desc_access,desc_allow,desc_amaz,desc_amen,desc_amp,desc_ani,desc_apart,desc_applianc,desc_appoint,desc_area,...,desc_walk,desc_wall,desc_washer,desc_water,desc_websit,desc_west,desc_white,desc_window,desc_wood,desc_york
listing_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7170325,0,0,0,0,0,0,1,0,0,0,...,0,0,0,1,1,0,0,0,0,0
7092344,0,0,0,0,0,0,2,0,0,0,...,0,0,0,0,1,0,0,0,0,0
7158677,0,0,0,0,0,0,5,1,0,0,...,0,1,0,0,1,0,0,0,0,0
7211212,0,0,0,0,0,0,2,1,0,1,...,0,0,1,0,1,0,0,0,0,0
7225292,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
