In [8]:
import os
import pickle

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
# Seaborn's current colour palette, kept in a notebook-level name.
color = sns.color_palette()

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

# Turn off pandas' chained-assignment warning for the whole session.
# NOTE: this also hides genuine chained-assignment mistakes.
pd.options.mode.chained_assignment = None  # default='warn'



In [17]:
# Read the training listings and turn the original index into a regular
# column, leaving a fresh 0..n-1 positional index on the frame.
train_df = pd.read_json("./input/train.json").reset_index()
# The test split (./input/test.json) is not loaded in this section.

## This section generates price features

In [50]:
# Log-transformed version of price.
train_df['price_log'] = np.log(train_df['price'])

# Log price with extreme values clipped to the 1st / 99th percentiles.
# Series.clip replaces the former `.ix` chained assignment: `.ix` was removed
# in pandas 1.0 and assigning through `df[col].ix[...]` is not guaranteed to
# write back to the frame.
low_limit, high_limit = np.percentile(train_df['price_log'].values, [1, 99])
train_df['price_log_cut'] = train_df['price_log'].clip(lower=low_limit, upper=high_limit)

# Same clipping applied to the untransformed price.
low_limit, high_limit = np.percentile(train_df['price'].values, [1, 99])
train_df['price_cut'] = train_df['price'].clip(lower=low_limit, upper=high_limit)

In [51]:
# For each clipped price column, add a "<column>_offset" companion holding
# the value's distance from that column's median.
for clipped_col in ("price_cut", "price_log_cut"):
    column_median = np.median(train_df[clipped_col])
    train_df[clipped_col + "_offset"] = train_df[clipped_col] - column_median

In [52]:
# Persist the two median-centred price features as a pickled DataFrame.
# `os` and `pickle` come from the import cell at the top of the notebook, so
# this cell no longer depends on a later cell having been run first.
features = ["price_cut_offset", "price_log_cut_offset"]
price_features_path = "./features/simple_price_features"
# Create the output folder on first run so open() does not fail when it is missing.
os.makedirs(os.path.dirname(price_features_path), exist_ok=True)
with open(price_features_path, "wb") as f:
    pickle.dump(train_df[features], f, pickle.HIGHEST_PROTOCOL)

# This section looks at the text description

It might actually be useful to count the punctuation marks (e.g. "!!!" or "!") in the text description.

In [18]:
import numpy as np
import pandas as pd
#from utils import getTFV, getBOW, dump_feat_name
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pickle
from copy import copy
from gensim.matutils import corpus2csc
from gensim import corpora, models, similarities
from gensim.models import Phrases
from nltk import bigrams as bgm
import codecs
import os.path
import glob
import os
import ast
import re
from bs4 import BeautifulSoup 

In [19]:
import nltk
#nltk.download()  
from nltk.corpus import stopwords # Import the stop word list

In [20]:
_ENGLISH_STOPWORDS = None  # filled on first use by _english_stopwords()


def _english_stopwords():
    """Return the NLTK English stop words as a set, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # Membership tests on a set are much faster than on the list NLTK returns.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def description_to_words(raw_description):
    """Convert a raw description into a string of lower-case, non-stop words.

    Parameters
    ----------
    raw_description : object
        The raw description, possibly containing HTML markup. It is passed
        through ``str()`` first, so non-string values are accepted.

    Returns
    -------
    str
        The remaining words joined by single spaces (``""`` if none remain).
    """
    # 1. Pad tags with spaces - otherwise the words on either side of a tag
    #    get squashed together once the markup is stripped.
    padded = str(raw_description).replace("<", "  <").replace(">", "> ")

    # 2. Remove HTML.
    text_only = BeautifulSoup(padded, "html.parser").get_text()

    # 3. Replace every non-letter character with a space.
    letters_only = re.sub("[^a-zA-Z]", " ", text_only)

    # 4. Convert to lower case and split into individual words.
    words = letters_only.lower().split()

    # 5. Drop stop words; the stop-word set is cached rather than rebuilt on
    #    every call, since this function is applied to each description.
    stops = _english_stopwords()

    # 6. Join the surviving words back into one space-separated string.
    return " ".join(w for w in words if w not in stops)

In [24]:
# Spot-check the cleaner on one listing: show the raw description, a blank
# line, then the cleaned text.
sample_raw = train_df["description"][2]
test_11 = description_to_words(sample_raw)
print(sample_raw, "", test_11, sep="\n")

Top Top West Village location, beautiful Pre-war building with laundry in the basement and live in super!<br/><br/>Apartment features a large bedroom with closet. Separate living room, kitchen features granite tops, dishwasher and microwave included, marble bathroom and hardwood flooring. Building is very well maintained and conveniently located near A,C,E,L,1,2,3 trains. Surrounded by many local cafe?s, restaurants, available for November 1st move in!<br/><br/>To view this apartment or any other please contact me via email or call at the number listed.<br/><br/><br/><br/><br/><br/>Bond New York is a real estate broker that supports equal housing opportunity.<p><a  website_redacted 

top top west village location beautiful pre war building laundry basement live super apartment features large bedroom closet separate living room kitchen features granite tops dishwasher microwave included marble bathroom hardwood flooring building well maintained conveniently located near c e l trains sur

In [27]:
# Clean every training description with description_to_words.
# The status message previously said "movie reviews", but the data being
# cleaned here are the descriptions in train_df.
print("Cleaning and parsing the training set descriptions...\n")
raw_descriptions = train_df["description"]
num_descriptions = raw_descriptions.size

clean_train_description = []
# Iterate over the values in positional order instead of looking each one up
# by label, so the loop does not depend on the index being exactly 0..n-1.
for position, raw_text in enumerate(raw_descriptions, start=1):
    # Report progress every 5000 descriptions.
    if position % 5000 == 0:
        print("Description %d of %d\n" % (position, num_descriptions))
    clean_train_description.append(description_to_words(raw_text))

Cleaning and parsing the training set movie reviews...

Review 5000 of 49352

Review 10000 of 49352

Review 15000 of 49352

Review 20000 of 49352

Review 25000 of 49352

Review 30000 of 49352

Review 35000 of 49352



  'Beautiful Soup.' % markup)


Review 40000 of 49352

Review 45000 of 49352



In [45]:
# Bag-of-words model restricted to 50 features (max_features); tokenisation,
# preprocessing and stop-word handling are left at None because the
# descriptions were already cleaned.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=50,
)

# Fit on the cleaned descriptions and densify the resulting sparse matrix.
train_data_features = vectorizer.fit_transform(clean_train_description).toarray()
train_text_bow_features = pd.DataFrame(train_data_features)
print(train_text_bow_features.shape)

(49352, 50)


In [46]:
# List the vocabulary terms behind the bag-of-words columns.
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back for older installations. .tolist() keeps `vocab` a plain list
# of Python strings, as before.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = vectorizer.get_feature_names_out().tolist()
else:
    vocab = vectorizer.get_feature_names()
print(vocab)

['apartment', 'appliances', 'area', 'bathroom', 'beautiful', 'bedroom', 'building', 'call', 'ceilings', 'city', 'closet', 'com', 'contact', 'dishwasher', 'doorman', 'east', 'email', 'features', 'fee', 'floor', 'floors', 'full', 'granite', 'great', 'hardwood', 'high', 'kagglemanager', 'kitchen', 'large', 'laundry', 'living', 'located', 'location', 'marble', 'new', 'one', 'park', 'private', 'renovated', 'renthop', 'restaurants', 'room', 'space', 'spacious', 'stainless', 'steel', 'text', 'unit', 'windows', 'york']


In [47]:
# Persist the bag-of-words feature frame as a pickled DataFrame.
bow_features_path = "./features/bow"
# Create the output folder on first run so open() does not fail when it is missing.
os.makedirs(os.path.dirname(bow_features_path), exist_ok=True)
with open(bow_features_path, "wb") as f:
    pickle.dump(train_text_bow_features, f, pickle.HIGHEST_PROTOCOL)