Notebook to Explore Text Features

The data used was created in the regressions notebook under the section dated 4/10/18 i.e. it is three dataframes of listings with average price for each

In [1]:
#starting with listing descriptions

In [2]:
import pandas as pd

In [3]:
listings = pd.read_csv('Datasources/inside_airbnb/listings.csv')

In [4]:
listings.columns

Index([u'id', u'listing_url', u'scrape_id', u'last_scraped', u'name',
       u'summary', u'space', u'description', u'experiences_offered',
       u'neighborhood_overview', u'notes', u'transit', u'access',
       u'interaction', u'house_rules', u'thumbnail_url', u'medium_url',
       u'picture_url', u'xl_picture_url', u'host_id', u'host_url',
       u'host_name', u'host_since', u'host_location', u'host_about',
       u'host_response_time', u'host_response_rate', u'host_acceptance_rate',
       u'host_is_superhost', u'host_thumbnail_url', u'host_picture_url',
       u'host_neighbourhood', u'host_listings_count',
       u'host_total_listings_count', u'host_verifications',
       u'host_has_profile_pic', u'host_identity_verified', u'street',
       u'neighbourhood', u'neighbourhood_cleansed',
       u'neighbourhood_group_cleansed', u'city', u'state', u'zipcode',
       u'market', u'smart_location', u'country_code', u'country', u'latitude',
       u'longitude', u'is_location_exact', u'prope

#Text columns:
space', u'description', u'experiences_offered',
       u'neighborhood_overview', u'notes', u'transit', u'access',
       u'interaction', u'house_rules'

In [5]:
#some basic data exploration to see what can be done

In [6]:
text_features = ['space', 'description', 'experiences_offered', 'neighborhood_overview', 'notes', 'transit', 'access', 'interaction', 'house_rules']

In [7]:
len(listings)

6608

In [8]:
#How many rows are null
for i in text_features:
    print("Column Name: %s, Nulls Count: %i," %(i, listings[i].isnull().sum()))

Column Name: space, Nulls Count: 1635,
Column Name: description, Nulls Count: 2,
Column Name: experiences_offered, Nulls Count: 0,
Column Name: neighborhood_overview, Nulls Count: 2471,
Column Name: notes, Nulls Count: 3375,
Column Name: transit, Nulls Count: 2636,
Column Name: access, Nulls Count: 2273,
Column Name: interaction, Nulls Count: 2530,
Column Name: house_rules, Nulls Count: 1642,


In [9]:
listings['description'].head()

0    Aquatica Waterpark, Sleep train Amphitheater, ...
1    Your spacious room awaiting is with a Queen Si...
2    This is an immaculate 3 bedroom, 2 1/2 bath co...
3    This 2 Story TownHome  is close to Otay Ranch ...
4    Hello; we are offering a private secluded bedr...
Name: description, dtype: object

In [10]:
import nltk
import string
from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [11]:
def tokenize(text):
    stem = nltk.stem.SnowballStemmer('english')
    text = text.lower()

    for token in nltk.word_tokenize(text):
        if token in string.punctuation: continue
        yield stem.stem(token)

In [12]:
def vectorize(doc):
    features = defaultdict(int)

    for token in tokenize(doc):
        features[token] += 1

    return features

In [13]:
vectors = map(vectorize, listings['description'].iloc[:1])

In [14]:
vectorizer = CountVectorizer()
vectors = vectorizer.fit_transform(listings['description'].iloc[:1])

In [15]:
vectors

<1x42 sparse matrix of type '<type 'numpy.int64'>'
	with 42 stored elements in Compressed Sparse Row format>

In [16]:
tfidf  = TfidfVectorizer()
corpus = tfidf.fit_transform(listings['description'].iloc[:1])

In [17]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

In [18]:
"""import unicodedata
from sklearn.base import BaseEstimator, TransformerMixin


class TextNormalizer(BaseEstimator, TransformerMixin):

    def __init__(self, language='english'):
        self.stopwords  = set(nltk.corpus.stopwords.words(language))
        self.lemmatizer = WordNetLemmatizer()

    def is_punct(self, token):
        return all(
            unicodedata.category(char).startswith('P') for char in token
        )

    def is_stopword(self, token):
        return token.lower() in self.stopwords"""

"import unicodedata\nfrom sklearn.base import BaseEstimator, TransformerMixin\n\n\nclass TextNormalizer(BaseEstimator, TransformerMixin):\n\n    def __init__(self, language='english'):\n        self.stopwords  = set(nltk.corpus.stopwords.words(language))\n        self.lemmatizer = WordNetLemmatizer()\n\n    def is_punct(self, token):\n        return all(\n            unicodedata.category(char).startswith('P') for char in token\n        )\n\n    def is_stopword(self, token):\n        return token.lower() in self.stopwords"

In [19]:
model = Pipeline([
            #('norm', TextNormalizer()),
            ('tfidf', CountVectorizer(
                                      preprocessor=None, lowercase=False)),
            ('model', LatentDirichletAllocation(n_components=3)),
        ])

In [20]:
model.fit_transform(listings['description'].iloc[:1])



array([[ 0.98735993,  0.00632963,  0.00631044]])

In [21]:
model.steps[-1][1].components_

array([[ 1.26902133,  2.05562914,  2.06017888,  2.83470762,  2.03250305,
         1.23443982,  1.24304619,  1.29663275,  1.23884115,  1.26656166,
         1.26127882,  1.23075161,  1.25366604,  1.23870447,  1.28216689,
         2.04659531,  1.26188375,  1.23180368,  1.2533177 ,  1.2524989 ,
         2.01542106,  1.24265827,  1.25794635,  2.01295537,  1.23331509,
         1.26873006,  1.23358847,  2.03035774,  1.22327871,  1.25909251,
         1.25130389,  2.01172405,  1.23321328,  1.23142629,  2.05194809,
         1.2819665 ,  1.25196099,  1.28441775,  1.26944471,  1.25952438,
         1.27542894,  2.06231565,  1.24849447],
       [ 0.47102121,  0.50494043,  0.44909994,  0.50102125,  0.49621477,
         0.46451952,  0.46609582,  0.4728808 ,  0.46213228,  0.47303494,
         0.45750403,  0.46142482,  0.46763878,  0.48492776,  0.50647068,
         0.4890672 ,  0.47499255,  0.47723054,  0.46807074,  0.49891561,
         0.51459397,  0.49085577,  0.53188074,  0.51154629,  0.4116325 ,
   

In [22]:
lst = []
for i in model.steps[-1][1].components_:
    f=i.argsort()[:-(2 - 1): -1]
    lst.append(f)
    print f
    

[]
[]
[]


In [23]:
lst

[array([], dtype=int64), array([], dtype=int64), array([], dtype=int64)]

In [24]:
##Recreating what I had done before losing everything

In [25]:
listings['description'].dtypes

dtype('O')

In [26]:
listings['description'].isnull().sum()

2

In [27]:
corpus = listings['description'].fillna('none')

In [28]:
vectorizer = CountVectorizer(analyzer='word',       
                             min_df=10,                        # minimum reqd occurences of a word 
                             stop_words='english',             # remove stop words
                             lowercase=True,                   # convert all words to lowercase
                             token_pattern='[a-zA-Z0-9]{3,}',  # num chars > 3
                             # max_features=50000,             # max number of uniq words
                            )

In [29]:
data_vectorized = vectorizer.fit_transform(corpus)

In [30]:
from sklearn.decomposition import LatentDirichletAllocation

In [32]:
lda_model = LatentDirichletAllocation(n_topics=20,               # Number of topics
                                      max_iter=10,               # Max learning iterations
                                      learning_method='online',   
                                      random_state=100,          # Random state
                                      batch_size=128,            # n docs in each learning iter
                                      evaluate_every = -1,       # compute perplexity every n iters, default: Don't
                                      n_jobs = -1,               # Use all available CPUs
                                     )
lda_output = lda_model.fit_transform(data_vectorized)

print(lda_model)  # Model attributes



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=10, n_jobs=-1, n_topics=20, perp_tol=0.1,
             random_state=100, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)


In [33]:
# Log Likelyhood: Higher the better
print("Log Likelihood: ", lda_model.score(data_vectorized))

# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_model.perplexity(data_vectorized))

# See model parameters
print(lda_model.get_params())

('Log Likelihood: ', -3210132.0502824672)
('Perplexity: ', 851.78520451912868)
{'learning_offset': 10.0, 'n_jobs': -1, 'topic_word_prior': None, 'perp_tol': 0.1, 'evaluate_every': -1, 'max_iter': 10, 'mean_change_tol': 0.001, 'batch_size': 128, 'max_doc_update_iter': 100, 'learning_decay': 0.7, 'n_components': 10, 'random_state': 100, 'doc_topic_prior': None, 'n_topics': 20, 'total_samples': 1000000.0, 'learning_method': 'online', 'verbose': 0}


In [34]:
"""# Define Search Param
search_params = {'n_components': [10, 15, 20, 25, 30], 'learning_decay': [.5, .7, .9]}

# Init the Model
lda = LatentDirichletAllocation()

# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)

# Do the Grid Search
model.fit(data_vectorized)"""

"# Define Search Param\nsearch_params = {'n_components': [10, 15, 20, 25, 30], 'learning_decay': [.5, .7, .9]}\n\n# Init the Model\nlda = LatentDirichletAllocation()\n\n# Init Grid Search Class\nmodel = GridSearchCV(lda, param_grid=search_params)\n\n# Do the Grid Search\nmodel.fit(data_vectorized)"

In [35]:
"""# Best Model - using original lda_mdoel for now
best_lda_model = model.best_estimator_
best_lda_model = lda_model
# Model Parameters
print("Best Model's Params: ", model.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)

# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))"""

'# Best Model - using original lda_mdoel for now\nbest_lda_model = model.best_estimator_\nbest_lda_model = lda_model\n# Model Parameters\nprint("Best Model\'s Params: ", model.best_params_)\n\n# Log Likelihood Score\nprint("Best Log Likelihood Score: ", model.best_score_)\n\n# Perplexity\nprint("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))'

In [36]:
import numpy as np

In [37]:
# Create Document - Topic Matrix
lda_output = lda_model.transform(data_vectorized)

# column names
topicnames = ["Topic" + str(i) for i in range(lda_model.n_topics)]

# index names
docnames = ["Doc" + str(i) for i in range(len(corpus))]

# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

In [38]:
df_document_topic.head()

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,...,Topic11,Topic12,Topic13,Topic14,Topic15,Topic16,Topic17,Topic18,Topic19,dominant_topic
Doc0,0.0,0.0,0.0,0.86,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,3
Doc1,0.07,0.0,0.0,0.32,0.0,0.17,0.0,0.0,0.08,0.16,...,0.0,0.0,0.0,0.0,0.0,0.15,0.0,0.0,0.03,3
Doc2,0.0,0.0,0.31,0.25,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10
Doc3,0.0,0.0,0.04,0.8,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.09,0.0,0.0,0.0,0.0,0.0,3
Doc4,0.0,0.0,0.01,0.6,0.01,0.16,0.02,0.02,0.0,0.07,...,0.0,0.03,0.0,0.0,0.0,0.0,0.0,0.01,0.0,3


In [39]:
df_document_topic.index = [i for i in range(len(df_document_topic))]

In [40]:
out = df_document_topic.merge(listings, left_index=True, right_index=True)

In [None]:
out.head()

Now adding some more basic features created on text:

In [53]:
import nltk
from nltk.corpus import stopwords
import re

def create_txt_features(pdseries):

    textLength = []
    textWordsPerc = []
    textPuncPerc = []
    textDigitsPerc = []

    for i in pdseries:
        tokens = re.findall(r"[\w']+|[.,!?;]", i)
        textLength.append(len(tokens))

        if len(tokens)==0:
            textWordsPerc.append(0)
            textPuncPerc.append(0)
            textDigitsPerc.append(0)

        else:
            textWordsPerc.append(len(i.split())/float(len(tokens)))
            textPuncPerc.append(len(''.join(c for c in i if c in string.punctuation))/float(len(tokens)))
            textDigitsPerc.append(len(''.join(c for c in i if c in string.digits))/float(len(tokens)))

    return textLength, textWordsPerc, textPuncPerc, textDigitsPerc

In [69]:
def lexical_diversity(text):
    return len(text)/len(set(text))

In [70]:
"""def content_fraction(text):
    stopwords = nltk.corpus.stopwords.words('english')
    content = [w for w in text if w.lower() not in stopwords]
    return len(content)/len(text)"""

In [58]:
for i in text_features:
    out[i]=out[i].fillna('None')

In [60]:
text_features

['space',
 'description',
 'experiences_offered',
 'neighborhood_overview',
 'notes',
 'transit',
 'access',
 'interaction',
 'house_rules']

In [71]:
textLength, textWordsPerc, textPuncPerc, textDigitsPerc = create_txt_features(out['space'])
out['space_textLength'] = textLength
out['space_textWordsPerc'] = textWordsPerc
out['space_textPuncPerc'] = textPuncPerc
out['space_textDigitsPerc'] = textDigitsPerc
out['space_diversity'] = pd.Series([lexical_diversity(i) for i in out['space']])

In [72]:
textLength, textWordsPerc, textPuncPerc, textDigitsPerc = create_txt_features(out['description'])
out['description_textLength'] = textLength
out['description_textWordsPerc'] = textWordsPerc
out['description_textPuncPerc'] = textPuncPerc
out['description_textDigitsPerc'] = textDigitsPerc
out['description_diversity'] = pd.Series([lexical_diversity(i) for i in out['description']])

In [73]:
textLength, textWordsPerc, textPuncPerc, textDigitsPerc = create_txt_features(out['experiences_offered'])
out['experiences_offered_textLength'] = textLength
out['experiences_offered_textWordsPerc'] = textWordsPerc
out['experiences_offered_textPuncPerc'] = textPuncPerc
out['experiences_offered_textDigitsPerc'] = textDigitsPerc
out['experiences_offered_diversity'] = pd.Series([lexical_diversity(i) for i in out['experiences_offered']])

In [74]:
textLength, textWordsPerc, textPuncPerc, textDigitsPerc = create_txt_features(out['neighborhood_overview'])
out['neighborhood_overview_textLength'] = textLength
out['neighborhood_overview_textWordsPerc'] = textWordsPerc
out['neighborhood_overview_textPuncPerc'] = textPuncPerc
out['neighborhood_overview_textDigitsPerc'] = textDigitsPerc
out['neighborhood_overview_diversity'] = pd.Series([lexical_diversity(i) for i in out['neighborhood_overview']])

In [75]:
textLength, textWordsPerc, textPuncPerc, textDigitsPerc = create_txt_features(out['notes'])
out['notes_textLength'] = textLength
out['notes_textWordsPerc'] = textWordsPerc
out['notes_textPuncPerc'] = textPuncPerc
out['notes_textDigitsPerc'] = textDigitsPerc
out['notes_diversity'] = pd.Series([lexical_diversity(i) for i in out['notes']])

In [76]:
textLength, textWordsPerc, textPuncPerc, textDigitsPerc = create_txt_features(out['transit'])
out['transit_textLength'] = textLength
out['transit_textWordsPerc'] = textWordsPerc
out['transit_textPuncPerc'] = textPuncPerc
out['transit_textDigitsPerc'] = textDigitsPerc
out['transit_diversity'] = pd.Series([lexical_diversity(i) for i in out['transit']])

In [77]:
textLength, textWordsPerc, textPuncPerc, textDigitsPerc = create_txt_features(out['access'])
out['access_textLength'] = textLength
out['access_textWordsPerc'] = textWordsPerc
out['access_textPuncPerc'] = textPuncPerc
out['access_textDigitsPerc'] = textDigitsPerc
out['access_diversity'] = pd.Series([lexical_diversity(i) for i in out['access']])

In [78]:
textLength, textWordsPerc, textPuncPerc, textDigitsPerc = create_txt_features(out['interaction'])
out['interaction_textLength'] = textLength
out['interaction_textWordsPerc'] = textWordsPerc
out['interaction_textPuncPerc'] = textPuncPerc
out['interaction_textDigitsPerc'] = textDigitsPerc
out['interaction_diversity'] = pd.Series([lexical_diversity(i) for i in out['interaction']])

In [79]:
textLength, textWordsPerc, textPuncPerc, textDigitsPerc = create_txt_features(out['house_rules'])
out['house_rules_textLength'] = textLength
out['house_rules_textWordsPerc'] = textWordsPerc
out['house_rules_textPuncPerc'] = textPuncPerc
out['house_rules_textDigitsPerc'] = textDigitsPerc
out['house_rules_diversity'] = pd.Series([lexical_diversity(i) for i in out['house_rules']])

In [80]:
out.to_csv('listings_withtopics.csv')