In [178]:
import re
from stop_words import get_stop_words
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
from wombat.models import dbsession, engine, Item
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

en_stop = get_stop_words('en')

#res = engine.execute(canonical_query).fetchall()
items = dbsession.query(Item).filter(Item.item_type == 'dresses')\
        .filter(Item.brand != 'LENDER SUBMISSION FILL IN').filter(Item.title != 'LENDER SUBMISSION FILL IN').all()

info_blobs = []
for item in items:
    info_blobs.append([item.title, item.description])

In [179]:
def get_first_sentence(string):
    try:
        return re.split(r'(?<=[.:;])\s', string)[0]
    except TypeError:
        return ''

raw_docs = []
for i, t in enumerate(info_blobs):
    first_sentence = get_first_sentence(t[1])
    item_title = info_blobs[i][0]
    combined = ' '.join([item_title, first_sentence])
    raw_docs.append(combined.lower())

In [208]:
def clean_doc(string):
    
    # tokenize document string
    tokens = tokenizer.tokenize(string)
    
    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in en_stop]
    
    # stem tokens
    p_stemmer = PorterStemmer()
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    
    combined_tokens = ' '.join(stemmed_tokens)
    
    return combined_tokens

cleaned_docs = [clean_doc(doc) for doc in raw_docs]

input_string = 'blue blossom tibi dress'
cleaned_docs.append(input_string)
print(cleaned_docs[-2])
print(cleaned_docs[-1])
cleaned_docs

blue blossom tibi dress fit beauti perfect outdoor summer cocktail parti
blue blossom tibi dress


['ava guipur lace mini dress self portrait s ava mini dress perfect parti season team heel layer tight weather cool',
 'lurex dress tibi detail metal gold fil coup flow maxi dress bare shoulder halter neck fall loos pleat ruffl hem',
 'diamond back dress beauti navi blue lace dress diamond cutout back',
 'sparkl cocktail dress flatter scoop neck bodycon dress glisten gold beig sequin',
 'long v neck bead gown gunmet floor sweep gown adrianna papel featur v neck front back finish allov sequin bead detail sophist sparkl',
 'lace backout gown dress perfect next formal occas s lace dream back cutout back slit make easi move',
 'pink kathlin argiro cocktail dress beauti kathlin argiro dress 98 polyst pink',
 'valentina mini dress inspir travel europ mini design keep stylish bell sleev detail ad hint sex appeal lace detail',
 'jetset diari vera maxi dress maxi silhouett',
 'jetset diari zulu maxi dress maxi silhouett featur crisscross detail bodic',
 'jetset diari cirru maxi dress maxi silho

In [207]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(cleaned_docs)

similarities = cosine_similarity(tfidf_matrix[-1], tfidf_matrix)

print(tfidf_matrix)

results_num = 20

# top_d is a dictionary of (index: similarity_ranking) values
top_d = {}
    
for i, s in enumerate(similarities[0]):
    # the first one will always be the actual item itself so if we want 20 items 
    # we ask for 21 hence the + 1
    if len(top_d) <= results_num:
        top_d[i] = s
    if (min(top_d.values()) < s) and (len(top_d) > results_num + 1):
       top_d[i] = s
       top_d.pop(min(top_d, key=top_d.get), None)

top_d = { k:v for k, v in top_d.items() if v < 1 }

print(top_d)

# sort the top values into a descending list
ranked_similarities = []

while len(top_d) > 0:
    ranked_similarities.append((max(top_d, key=top_d.get), max(top_d.values())))
    top_d.pop(max(top_d, key=top_d.get))
    
relevant_items = []
for i in ranked_similarities:
    relevant_items.append(items[i[0]])

print(relevant_items)
    
#prices = []
#for item in relevant_items:
    #print(item.rent_per_week)

  (0, 357)	0.537300929099
  (0, 1906)	0.240901757722
  (0, 2349)	0.114269860505
  (0, 2773)	0.280150713753
  (0, 1304)	0.0962187651985
  (0, 3701)	0.21516090228
  (0, 3269)	0.226501617667
  (0, 3157)	0.128591901258
  (0, 3115)	0.150458077493
  (0, 3688)	0.235457461125
  (0, 4198)	0.309448755162
  (0, 1961)	0.226501617667
  (0, 2407)	0.189911924287
  (0, 4265)	0.249680339724
  (0, 4594)	0.264737774895
  (0, 1013)	0.211444182496
  (1, 1304)	0.109239077907
  (1, 2565)	0.300562061433
  (1, 4259)	0.292875116295
  (1, 1223)	0.144823193017
  (1, 2733)	0.190517678002
  (1, 1854)	0.194073624936
  (1, 1597)	0.351323326626
  (1, 1042)	0.351323326626
  (1, 1660)	0.243181287145
  :	:
  (4585, 3726)	0.179749513118
  (4585, 3698)	0.259782860603
  (4585, 187)	0.27494628426
  (4585, 327)	0.187911673609
  (4585, 1011)	0.251437826376
  (4585, 2188)	0.166853305077
  (4585, 3606)	0.279940038438
  (4585, 2398)	0.300868328216
  (4585, 2682)	0.233860816646
  (4585, 4707)	0.326790372173
  (4586, 1304)	0.080771

In [164]:

        
#print(string)
#print(get_first_sentence(''))