In [8]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()

%matplotlib inline

pd.options.mode.chained_assignment = None  # default='warn'



In [63]:
train_df = pd.read_json("./input/train.json")
test_df = pd.read_json("./input/test.json")
train_df.reset_index(inplace = True)
test_df.reset_index(inplace = True)

## This section generates price features

In [50]:
#create a log transformed version of price
train_df['price_log'] = np.log(train_df['price'])

# create a version of log price variable with very high and very low values clipped
high_limit = np.percentile(train_df.price_log.values, 99) #get 99th percentile
low_limit = np.percentile(train_df.price_log.values, 1) #get 1th percentile
train_df['price_log_cut'] = train_df['price_log']
train_df['price_log_cut'].ix[train_df['price_log']>high_limit] = high_limit
train_df['price_log_cut'].ix[train_df['price_log']<low_limit] = low_limit

#also create a version with normal values - but clip them.
high_limit = np.percentile(train_df.price.values, 99) #get 99th percentile
low_limit = np.percentile(train_df.price.values, 1) #get 1th percentile
train_df['price_cut'] = train_df['price']
train_df['price_cut'].ix[train_df['price']>high_limit] = high_limit
train_df['price_cut'].ix[train_df['price']<low_limit] = low_limit

In [51]:
# create a version of variable that has offset value relative to median
train_df['price_cut_offset'] = train_df['price_cut'] - np.median(train_df['price_cut'])
train_df['price_log_cut_offset'] = train_df['price_log_cut'] - np.median(train_df['price_log_cut'])

In [52]:
features = ["price_cut_offset","price_log_cut_offset"]
with open("./features/simple_price_features", "wb") as f:
    pickle.dump(train_df[features], f, pickle.HIGHEST_PROTOCOL)

# This section looks at the text description

At the moment the feature is only build on test data set - it should be build on the total corpus
There are further things to do here - just conceptual implimentation

It might be actually useful to see the number of punctuatios (like!!! or ! ) in the text description

In [18]:
import numpy as np
import pandas as pd
#from utils import getTFV, getBOW, dump_feat_name
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pickle
from copy import copy
from gensim.matutils import corpus2csc
from gensim import corpora, models, similarities
from gensim.models import Phrases
from nltk import bigrams as bgm
import codecs
import os.path
import glob
import os
import ast
import re
from bs4 import BeautifulSoup 

In [19]:
import nltk
#nltk.download()  
from nltk.corpus import stopwords # Import the stop word list

In [20]:
def description_to_words( raw_description ):
    # Function to convert a raw description to string of words
    # The input is a single string (a raw description), and 
    # the output is a single string (a preprocessed description)
    #
    
    # 1. give tags and markup an extra space - otherwise gets squashed together later on
    
    raw_description_1 = re.sub("<",           # The pattern to search for
                          "  <",                   # The pattern to replace it with
                          str(raw_description) )  # The text to search

    raw_description_2 = re.sub(">",           # The pattern to search for
                          "> ",                   # The pattern to replace it with
                          str(raw_description_1) ) 
    
    
    # 2. Remove HTML
    review_desc = BeautifulSoup(raw_description_2,"html.parser").get_text() 
    #

    # 3. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", review_desc) 
    #
    
    # 4. Convert to lower case, split into individual words
    words = letters_only.lower().split()  
    
    # 5. In Python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    stops = set(stopwords.words("english"))                  
    # 
    # 6. Remove stop words
    meaningful_words = [w for w in words if not w in stops]   
    #
    # 7. Join the words back into one string separated by space, 
    # and return the result.
    return( " ".join( meaningful_words )) 

In [24]:
#look at how the text description changed before and after cleansing
test_11 = description_to_words(train_df["description"][2])
print (train_df["description"][2])
print()
print (test_11)

Top Top West Village location, beautiful Pre-war building with laundry in the basement and live in super!<br/><br/>Apartment features a large bedroom with closet. Separate living room, kitchen features granite tops, dishwasher and microwave included, marble bathroom and hardwood flooring. Building is very well maintained and conveniently located near A,C,E,L,1,2,3 trains. Surrounded by many local cafe?s, restaurants, available for November 1st move in!<br/><br/>To view this apartment or any other please contact me via email or call at the number listed.<br/><br/><br/><br/><br/><br/>Bond New York is a real estate broker that supports equal housing opportunity.<p><a  website_redacted 

top top west village location beautiful pre war building laundry basement live super apartment features large bedroom closet separate living room kitchen features granite tops dishwasher microwave included marble bathroom hardwood flooring building well maintained conveniently located near c e l trains sur

In [27]:
print ("Cleaning and parsing the training set movie reviews...\n")
num_descriptions = train_df["description"].size

clean_train_description = []
for i in range(0, num_descriptions):
    # If the index is evenly divisible by 5000, print a message
    if( (i+1)%5000 == 0 ):
        print( "Description %d of %d\n" % ( i+1, num_descriptions ) )                                                                   
    clean_train_description.append( description_to_words( train_df["description"][i] ))

Cleaning and parsing the training set movie reviews...

Review 5000 of 49352

Review 10000 of 49352

Review 15000 of 49352

Review 20000 of 49352

Review 25000 of 49352

Review 30000 of 49352

Review 35000 of 49352



  'Beautiful Soup.' % markup)


Review 40000 of 49352

Review 45000 of 49352



In [45]:
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 50) 

train_data_features = vectorizer.fit_transform(clean_train_description)
train_data_features = train_data_features.toarray()
train_text_bow_features = pd.DataFrame(train_data_features)
print (train_text_bow_features.shape)

(49352, 50)


In [54]:
vocab = vectorizer.get_feature_names()
vocab = ["dscrp_ft_" + s for s in vocab]
print (vocab)

['dscrp_ft_apartment', 'dscrp_ft_appliances', 'dscrp_ft_area', 'dscrp_ft_bathroom', 'dscrp_ft_beautiful', 'dscrp_ft_bedroom', 'dscrp_ft_building', 'dscrp_ft_call', 'dscrp_ft_ceilings', 'dscrp_ft_city', 'dscrp_ft_closet', 'dscrp_ft_com', 'dscrp_ft_contact', 'dscrp_ft_dishwasher', 'dscrp_ft_doorman', 'dscrp_ft_east', 'dscrp_ft_email', 'dscrp_ft_features', 'dscrp_ft_fee', 'dscrp_ft_floor', 'dscrp_ft_floors', 'dscrp_ft_full', 'dscrp_ft_granite', 'dscrp_ft_great', 'dscrp_ft_hardwood', 'dscrp_ft_high', 'dscrp_ft_kagglemanager', 'dscrp_ft_kitchen', 'dscrp_ft_large', 'dscrp_ft_laundry', 'dscrp_ft_living', 'dscrp_ft_located', 'dscrp_ft_location', 'dscrp_ft_marble', 'dscrp_ft_new', 'dscrp_ft_one', 'dscrp_ft_park', 'dscrp_ft_private', 'dscrp_ft_renovated', 'dscrp_ft_renthop', 'dscrp_ft_restaurants', 'dscrp_ft_room', 'dscrp_ft_space', 'dscrp_ft_spacious', 'dscrp_ft_stainless', 'dscrp_ft_steel', 'dscrp_ft_text', 'dscrp_ft_unit', 'dscrp_ft_windows', 'dscrp_ft_york']


In [55]:
train_text_bow_features.colums = vocab

In [61]:
with open("./master/features/AH/bow", "wb") as f:
    pickle.dump(train_text_bow_features, f, pickle.HIGHEST_PROTOCOL)

## date features 

In [76]:
# Let us extract some features like year, month, day, hour from date columns #
train_df['created'] = pd.to_datetime(train_df['created'])
test_df['created'] = pd.to_datetime(test_df['created'])

train_df["created_year"] = train_df["created"].dt.year
test_df["created_year"] = test_df["created"].dt.year
train_df["created_month"] = train_df["created"].dt.month
test_df["created_month"] = test_df["created"].dt.month
train_df["created_day"] = train_df["created"].dt.day
test_df["created_day"] = test_df["created"].dt.day
train_df["created_hour"] = train_df["created"].dt.hour
test_df["created_hour"] = test_df["created"].dt.hour

In [77]:
date_features = ["created_year","created_month","created_day","created_hour"]
# write train features
with open("./master/features/AH/train_date_feat", "wb") as f:
    pickle.dump(train_df[date_features], f, pickle.HIGHEST_PROTOCOL)
    
with open("./master/features/AH/test_day_feat", "wb") as f:
    pickle.dump(test_df[date_features], f, pickle.HIGHEST_PROTOCOL)

## Categorical features

this is the same as here - seems to be used by everyone in some way or another!
https://www.kaggle.com/sudalairajkumar/two-sigma-connect-rental-listing-inquiries/xgb-starter-in-python

We have 4 categorical features in our data

-display_address

-manager_id

-building_id

-listing_id

So let us label encode these features.

In [80]:
from sklearn import preprocessing

categorical = ["display_address", "manager_id", "building_id", "street_address"]

for f in categorical:
        if train_df[f].dtype=='object':
            #print(f)
            lbl = preprocessing.LabelEncoder()
            lbl.fit(list(train_df[f].values) + list(test_df[f].values))
            train_df[f] = lbl.transform(list(train_df[f].values))
            test_df[f] = lbl.transform(list(test_df[f].values))

We have features column which is a list of string values. So we can first combine all the strings together to get a single string and then apply count vectorizer on top of it.

In [81]:
train_df['features'] = train_df["features"].apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))
test_df['features'] = test_df["features"].apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))
print(train_df["features"].head())
tfidf = CountVectorizer(stop_words='english', max_features=200)
vocab_2 = tfidf.get_feature_names()
tr_sparse = tfidf.fit_transform(train_df["features"])
te_sparse = tfidf.transform(test_df["features"])

0                                                     
1    Doorman Elevator Fitness_Center Cats_Allowed D...
2    Laundry_In_Building Dishwasher Hardwood_Floors...
3                               Hardwood_Floors No_Fee
4                                              Pre-War
Name: features, dtype: object


Now let us stack both the dense and sparse features into a single dataset and also get the target variable.

In [83]:
from scipy import sparse

train_df_cat_feat = sparse.hstack([train_df[categorical], tr_sparse]).tocsr()
test_df_cat_feat = sparse.hstack([test_df[categorical], te_sparse]).tocsr()

print(train_df_cat_feat.shape, test_df_cat_feat.shape)

(49352, 204) (74659, 204)


In [115]:
# this creates pandas DataFrame out of the scipy sparse matrix
# probably not the most efficient way of doing things
train_df_cat_feat_pd = pd.DataFrame(train_df_cat_feat.toarray())
test_df_cat_feat_pd = pd.DataFrame(test_df_cat_feat.toarray())

In [125]:
# lost column names along the way - need to add them back
train_df_cat_feat_pd.columns = categorical + vocab_2
test_df_cat_feat_pd.columns = categorical + vocab_2

### Categorical features - manager:

from here: 

https://www.kaggle.com/den3b81/two-sigma-connect-rental-listing-inquiries/improve-perfomances-using-manager-features


This features involvs using target variable in its creation
In the original kernel it ends up being created for each CV - this is the only way to build it as otherwise it will overfit in any cross validation. The actual feature to be used on the test df should be calculated on the train df and them assigned to the same managers.

Probably requires more careful thinking on how to use.

In [None]:
# Let's split the data
X = df[features_to_use]
y = df["interest_level"]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33)

In [100]:
# compute fractions and count for each manager
target_num_map={"high":0, "medium":1, "low":2}
train_df["interest_level_num"]= train_df["interest_level"].apply(lambda x: target_num_map[x]).values

temp = pd.concat([train_df.manager_id, pd.get_dummies(train_df['interest_level_num'])], axis = 1).groupby('manager_id').mean()

temp.columns = ['high_frac','low_frac', 'medium_frac']
temp['count'] = train_df.groupby('manager_id').count().iloc[:,1]

# remember the manager_ids look different because we encoded them in the previous step 
print(temp.tail(10))

            high_frac  low_frac  medium_frac  count
manager_id                                         
4386         0.000000  0.333333     0.666667      3
4387         0.625000  0.125000     0.250000      8
4388         0.000000  0.000000     1.000000      1
4389         0.000000  0.000000     1.000000      1
4390         0.142857  0.142857     0.714286      7
4391         0.000000  0.000000     1.000000      1
4392         0.000000  0.000000     1.000000      7
4394         0.000000  0.000000     1.000000      2
4396         0.142857  0.142857     0.714286      7
4397         0.034783  0.295652     0.669565    115


In [101]:
# compute skill
temp['manager_skill'] = temp['high_frac']*2 + temp['medium_frac']

# get ixes for unranked managers...
unranked_managers_ixes = temp['count']<20
# ... and ranked ones
ranked_managers_ixes = ~unranked_managers_ixes

# compute mean values from ranked managers and assign them to unranked ones
mean_values = temp.loc[ranked_managers_ixes, ['high_frac','low_frac', 'medium_frac','manager_skill']].mean()
print(mean_values)
temp.loc[unranked_managers_ixes,['high_frac','low_frac', 'medium_frac','manager_skill']] = mean_values.values

high_frac        0.081314
low_frac         0.245492
medium_frac      0.673194
manager_skill    0.835822
dtype: float64


In [127]:
# inner join to assign manager features to the managers in the training dataframe
train_df_cat_feat_pd = train_df_cat_feat_pd.merge(temp.reset_index(),how='left', left_on='manager_id', right_on='manager_id')
train_df_cat_feat_pd.head()

Unnamed: 0,display_address,manager_id,building_id,street_address,24,_balconies,_dishwasher_,_dryer,_eat,_elev,...,washer_in_unit,wheelchair_access,wheelchair_ramp,wifi_access,work,high_frac,low_frac,medium_frac,count,manager_skill
0,12282,1568,3797,23484,0,0,0,0,0,0,...,0,0,0,0,0,0.0,0.255556,0.744444,90,0.744444
1,9080,1988,8986,23680,0,0,0,0,0,0,...,0,0,0,0,0,0.0,0.011628,0.988372,86,0.988372
2,13719,3733,8889,9827,0,0,0,0,0,0,...,0,0,0,0,0,0.059701,0.365672,0.574627,134,0.69403
3,10866,282,1848,14237,0,0,0,0,0,0,...,0,0,0,0,0,0.068063,0.125654,0.806283,191,0.942408
4,15072,2618,0,19227,0,0,0,0,0,0,...,0,0,0,0,0,0.081314,0.245492,0.673194,15,0.835822


In [128]:
# add the features computed on the training dataset to the validation dataset
test_df_cat_feat_pd = test_df_cat_feat_pd.merge(temp.reset_index(),how='left', left_on='manager_id', right_on='manager_id')
new_manager_ixes = test_df_cat_feat_pd['high_frac'].isnull()
test_df_cat_feat_pd.loc[new_manager_ixes,['high_frac','low_frac', 'medium_frac','manager_skill']] = mean_values.values
test_df_cat_feat_pd.head()

Unnamed: 0,display_address,manager_id,building_id,street_address,24,_balconies,_dishwasher_,_dryer,_eat,_elev,...,washer_in_unit,wheelchair_access,wheelchair_ramp,wifi_access,work,high_frac,low_frac,medium_frac,count,manager_skill
0,13274,3076,5535,24898,0,0,0,0,0,0,...,0,0,0,0,0,0.208333,0.333333,0.458333,24.0,0.875
1,13391,3593,0,5492,0,0,0,0,0,0,...,0,0,0,0,0,0.081314,0.245492,0.673194,9.0,0.835822
2,990,2677,2813,541,0,0,0,0,0,0,...,0,0,0,0,0,0.081314,0.245492,0.673194,1.0,0.835822
3,481,201,5477,10531,0,0,0,0,0,0,...,0,1,0,0,0,0.360656,0.393443,0.245902,61.0,0.967213
4,12317,3157,4428,10907,0,0,0,0,0,0,...,0,0,0,0,0,0.083333,0.319444,0.597222,72.0,0.763889


In [129]:
# write train features
with open("./master/features/AH/train_categ_feat", "wb") as f:
    pickle.dump(train_df_cat_feat_pd, f, pickle.HIGHEST_PROTOCOL)
    
with open("./master/features/AH/test_categ_feat", "wb") as f:
    pickle.dump(test_df_cat_feat_pd, f, pickle.HIGHEST_PROTOCOL)

#### TO DO:

Convert building_ids and manager_ids with only 1 observation into a separate group

### simple features:
price per bed

price per bath

baths per bed

bedrooms/(bedrooms+bathrooms)

### text features
sentiment analysis
