In [35]:
import re
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer

In [67]:

url = 'https://api.graph.cool/simple/v1/cj9inyrks2ssl0130yiv567ps'

def run_query(query):
    response = requests.post(url, json={'query': query})
    if response.status_code == 200:
        return response.json()
    else:
        raise Exception('query failed to returning code of {}. {}'.format(requests.status_code, query))
        
query = """

{
    allListings {
        id
        latitude
        longitude
        description
        airConditioning
        furnished
        generator
        garden
        boysQuarter
        bedrooms
        bathrooms
        masterBedroom
        price
        parkingSpace
        outdoorToilet
        outdoorKitchen
        waterTank
        kitchen
        guestToilet
        swimmingPool
                    locality {
      name
    }
    }
}

"""

result = run_query(query)
df = result['data']['allListings']

In [71]:
locality = [item['locality'] for item in df]

In [74]:
df = pd.DataFrame(df)
df.head(3)

Unnamed: 0,airConditioning,bathrooms,bedrooms,boysQuarter,description,furnished,garden,generator,guestToilet,id,...,latitude,locality,longitude,masterBedroom,outdoorKitchen,outdoorToilet,parkingSpace,price,swimmingPool,waterTank
0,False,2,3,False,3 Bedroom family apartment,False,False,False,False,cj9r9f7tj43kb0146v1y0hpu8,...,13.400521,{u'name': u'Brusubi 1st Phase'},-16.731424,,False,,True,130000.0,False,False
1,False,3,3,True,Fully furnished apartment with 2 boys quarters...,True,True,False,True,cj9sol5i50xup016035eqkr82,...,13.398662,{u'name': u'Brusubi 1st Phase'},-16.732842,,True,,True,400000.0,False,False
2,False,5,4,False,4 Bedroom Family Home at Kerr Serign,False,False,False,False,cj9swdc7b62eo0160cs4xluez,...,,{u'name': u'Senegambia'},,,False,,True,4500000.0,False,False


In [75]:
df.shape

(53, 21)

In [91]:
loc_df = pd.DataFrame(locality, index=np.arange(df.shape[0]))
loc_df.head()

Unnamed: 0,name
0,Brusubi 1st Phase
1,Brusubi 1st Phase
2,Senegambia
3,Brufut
4,Kerr Serign


In [78]:
merged_df = pd.concat([df, loc_df], axis=1, join_axes=[df.index])
merged_df.head(3)

Unnamed: 0,airConditioning,bathrooms,bedrooms,boysQuarter,description,furnished,garden,generator,guestToilet,id,...,locality,longitude,masterBedroom,outdoorKitchen,outdoorToilet,parkingSpace,price,swimmingPool,waterTank,name
0,False,2,3,False,3 Bedroom family apartment,False,False,False,False,cj9r9f7tj43kb0146v1y0hpu8,...,{u'name': u'Brusubi 1st Phase'},-16.731424,,False,,True,130000.0,False,False,Brusubi 1st Phase
1,False,3,3,True,Fully furnished apartment with 2 boys quarters...,True,True,False,True,cj9sol5i50xup016035eqkr82,...,{u'name': u'Brusubi 1st Phase'},-16.732842,,True,,True,400000.0,False,False,Brusubi 1st Phase
2,False,5,4,False,4 Bedroom Family Home at Kerr Serign,False,False,False,False,cj9swdc7b62eo0160cs4xluez,...,{u'name': u'Senegambia'},,,False,,True,4500000.0,False,False,Senegambia


In [47]:
desc_data = df[['id','description']]
desc_data.head(3)

Unnamed: 0,id,description
0,cj9r9f7tj43kb0146v1y0hpu8,3 Bedroom family apartment
1,cj9sol5i50xup016035eqkr82,Fully furnished apartment with 2 boys quarters...
2,cj9swdc7b62eo0160cs4xluez,4 Bedroom Family Home at Kerr Serign


In [48]:
# get some listing descriptions
def print_description(index):
    example = desc_data[desc_data['id'] == index][['description']].values
    if len(example) > 0:
        print(example[0])

In [49]:
print_description('cjfmvdr6183si0193lmrat0wn')

[u'Fully furnished holiday apartments, The property is 15 minutes walk to Senegambia beach. 24 hour watchman security and CCTV recording.  It provides accommodation with free WiFi, air conditioner ,individual living room and bedrooms ,safety deposit box, a flat-screen TV and a kitchenette. Bestway Supermarket in the same Building offering free grocery delivery.']


In [50]:
desc_data['word_count'] = desc_data['description'].apply(lambda x: len(str(x).split()))
desc_lengths = list(desc_data['word_count'])

print('Number of description: ', len(desc_lengths),
     "\nAverage word count", np.average(desc_lengths),
     "\nMinimum word count", min(desc_lengths),
     "\nMaximum word count", max(desc_lengths))

('Number of description: ', 52, '\nAverage word count', 13.692307692307692, '\nMinimum word count', 2, '\nMaximum word count', 52)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [51]:
desc_data.head(3)

Unnamed: 0,id,description,word_count
0,cj9r9f7tj43kb0146v1y0hpu8,3 Bedroom family apartment,4
1,cj9sol5i50xup016035eqkr82,Fully furnished apartment with 2 boys quarters...,29
2,cj9swdc7b62eo0160cs4xluez,4 Bedroom Family Home at Kerr Serign,7


In [52]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]|@,;:.]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub('', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)
    return text

desc_data['desc_clean'] = desc_data['description'].apply(clean_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [64]:
clean_desc_len = desc_data['desc_clean'].apply(lambda x: len(str(x).split()))
desc_lengths = list(clean_desc_len)

print('Number of description after removing stopwords: ', len(desc_lengths),
     "\nAverage word count", np.average(desc_lengths),
     "\nMinimum word count", min(desc_lengths),
     "\nMaximum word count", max(desc_lengths))

('Number of description after removing stopwords: ', 52, '\nAverage word count', 10.115384615384615, '\nMinimum word count', 2, '\nMaximum word count', 39)


In [56]:
desc_data.head()

Unnamed: 0,id,description,word_count,desc_clean
0,cj9r9f7tj43kb0146v1y0hpu8,3 Bedroom family apartment,4,3 bedroom family apartment
1,cj9sol5i50xup016035eqkr82,Fully furnished apartment with 2 boys quarters...,29,fully furnished apartment 2 boys quarters outd...
2,cj9swdc7b62eo0160cs4xluez,4 Bedroom Family Home at Kerr Serign,7,4 bedroom family home kerr serign
3,cj9swsow968mg0131kaw103ld,Fully Furnished Story Building at Brufut,6,fully furnished story building brufut
4,cj9twpvt2zjxr0131obez74wf,Fully furnished family house with a boys quarter,8,fully furnished family house boys quarter


In [57]:
desc_data.set_index('id', inplace=True)

In [58]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

tf = TfidfVectorizer(analyzer='word', ngram_range=(1,3), min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(desc_data['desc_clean'])
nearest_neighbor = linear_kernel(tfidf_matrix, tfidf_matrix)

In [59]:
nearest_neighbor[0]

array([1.        , 0.02834846, 0.20095491, 0.        , 0.0438758 ,
       0.05130807, 0.34138789, 0.04519449, 0.0340132 , 0.0340132 ,
       0.04118579, 0.03599456, 0.02195866, 0.07004802, 0.07450596,
       0.        , 0.06220179, 0.02283558, 0.02049754, 0.02927362,
       0.04746582, 0.        , 0.03612593, 0.02776683, 0.02776683,
       0.        , 0.        , 0.11801371, 0.03962135, 0.02412847,
       0.03962135, 0.11931871, 0.03418674, 0.0569805 , 0.        ,
       0.02294021, 0.0465788 , 0.0215954 , 0.04827048, 0.02112997,
       0.06701631, 0.02015945, 0.03049313, 0.04643726, 0.02352901,
       0.05811798, 0.02782429, 0.03890286, 0.02212763, 0.        ,
       0.04295338, 0.03701115])

In [60]:
indices = pd.Series(desc_data.index)

In [80]:
def recommend(id_, nearest_neighbor = nearest_neighbor):
    
    similar_listing_id = []
    
    idx = indices[indices == id_].index[0]
    
    score_series = pd.Series(nearest_neighbor[idx]).sort_values(ascending=False)
    
    top_10_listings = list(score_series.iloc[1:21].index)
    
    for i in top_10_listings:
        similar_listing_id.append(list(desc_data['desc_clean'])[i])
        
    return similar_listing_id


In [81]:
recommend('cj9r9f7tj43kb0146v1y0hpu8')

[u'fully furnished family apartment',
 u'4 bedroom family home kerr serign',
 u'3 bedroom family complex master bedroom boys quarter well borehole beautiful garden solar panel property land dimension 30m x 50m',
 u'beautiful 7 bedroom family complex 3 unfinished bedrooms upstairs separate staircase back possibility make 2 apartments',
 u'beautiful family home 3 bedroom 2 bathrooms parking space',
 u'large family complex apartment 6 master bedrooms',
 u'luxurious apartment 1 bedroom 1 bathroom beautiful garden',
 u'beautiful family complex 3 bedrooms',
 u'fully furnished holiday apartment 1 bedroom swimming pool parking space',
 u'fully furnished family complex 6 bedrooms',
 u'nice family house apartment nice garden generator room',
 u'move gorgeous 2 bedroom 2 bathroom apartment ample street parking safe neighborhood',
 u'fully furnished holiday apartment',
 u'1 bedroom luxury apartment located kotu west air condition wifi parking space',
 u'gorgeous apartment 3 bedrooms',
 u'large fam

1. read in data and set_index to id
2. clean
3. normalize numeric values
4. reduce dimension
5. calculate similarity
6. read in nn with index as id
7. sort values in descending order
8. 