# Clustering Merch Data Shirts
1. read in data file and create large list of indexed data
2. remove texts that pop up in all documents (i.e. stopwords / phrases)
3. clean the documents
4. stem and tokenize the words in each document
5. run all documents through tf-idf vectorizer

To Do:
- read in datafiles better (use different delimiter?)
- clean up input documents better
- possibly create my own tfidf vectorizor that works as a sum rather than a median


## Initialize steps

First we import the necessary libraries.

In [47]:
import numpy as np
import pandas as pd
import nltk
from bs4 import BeautifulSoup
import re
import os
import codecs
from sklearn import feature_extraction
import mpld3

Import the stemmer and then tokenize and stem the text.

In [48]:
# load nltk's SnowballStemmer as variabled 'stemmer'
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if not re.search('[^a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems


def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if not re.search('[^a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens
print(tokenize_and_stem('artwork'))
# totalvocab_stemmed = []
# totalvocab_tokenized = []
# for i in documents:
#     allwords_stemmed = tokenize_and_stem(i)
#     totalvocab_stemmed.extend(allwords_stemmed)
    
#     allwords_tokenized = tokenize_only(i)
#     totalvocab_tokenized.extend(allwords_tokenized)
    
# vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)

['artwork']


In [49]:
print(tokenize_and_stem('retro'))

['retro']


Now we import the shirt data file and create documents from it. Each document is the title of the shirt followed by it's description.

In [50]:
def clean_string(document):
    document = document.lower()
    default_1 = "Lightweight, Classic fit, Double-needle sleeve and bottom hem".lower()
    default_2 = "Solid colors: 100% Cotton; Heather Grey: 90% Cotton, 10% Polyester; All Other Heathers: 50% Cotton, 50% Polyester Imported Machine wash cold with like colors, dry low heat ".lower()
    document = document.replace(default_1, '').replace(default_2,'')
    
    document.replace("tee shirt","tshirt")
    document.replace("t-shirt", 'tshirt')
    document.replace(' t shirt', 'tshirt')
    document.replace("-", " ")
    
    document.replace("shirt", "")
    document.replace("tshirt", "")
    
    document.replace("officially licensed","")
    
    if re.compile("officially licensed [\w\s]+ (apparel|shirt)").search(document):
        document = re.sub(r'officially licensed [\w\s]+ (apparel|shirt)', '', document)
    
    if re.compile("official [\w\s]+merchandise").search(document):
        document = re.sub(r'official [\w\s]+merchandise', '', document)

    if re.compile("graphic [\w\s\-]+shirt").search(document):
        document = re.sub(r'graphic [\w\s\-]+shirt', '', document)
    
    
    document = BeautifulSoup(document, 'html.parser').getText()
    
    return document

document_data_dict = {}
def read_shirt_data_file(path):
    document_data = []
    with open(path, 'r') as data_file:
        for line in data_file:
            data = {}
            if len(line.replace("\"","").split('|')) == 11:
                for item in line.replace("\"","").split('|'):
                    data[item.split(':',1)[0]] = item.split(':',1)[1]
                data['document'] = clean_string(data['title']) # + ". " + data['description'])
    #             line = line.split('|',10)
    #             new_line = []
    #             for item in line:
    #                 print(item)
    #                 new_line.append(item.split(':',1)[1])
    #             line = [item.split(':',1)[1] for item in line]
    #             line.append(clean_string(line[2] + ". " + line[10]))
                if data['asin'] not in document_data_dict:
                    document_data_dict[data['asin']] = data
                    document_data.append(data)
                
    # [salesRank (0), asin(1), title(2), imageUrl(3), trademarked(4), isMerch(5), date(6), unix(7), errorMessage(8), link(9), description(10), document(11)]
    return document_data

document_data = read_shirt_data_file("shirts_newest_nt") # read_shirt_data_file("shirts_featured_nt")
print(document_data[0])
print(len(document_data))

{'salesRank': 'NA', 'asin': 'B07X97D31R', 'title': 'Support Cancer Shirt Prostate Cancer Awareness Tshirt', 'imgUrl': 'https://m.media-amazon.com/images/I/A13usaonutL._CLa%7C2140%2C2000%7C918G0EUHpmL.png%7C0%2C0%2C2140%2C2000%2B0.0%2C0.0%2C2140.0%2C2000.0._UX342_.png', 'trademarked': 'False', 'isMerch': 'True', 'date': 'August 30, 2019', 'unix': '1567123200', 'errorMessage': '[]', 'link': 'https://www.amazon.com/dp/B07X97D31R', 'description': 'Solid colors: 100% Cotton; Heather Grey: 90% Cotton, 10% Polyester; All Other Heathers: 50% Cotton, 50% Polyester Imported Machine wash cold with like colors, dry low heat Are you a Fighter who has or is fighting Prostate Cancer? This Motivational T-shirt is perfect for you. Great shirt for your Light Blue ribbon events, Hospital Visits or To support a loved one with Cancer. This Prostate Cancer Tshirt is a great Birthday or Christmas Gift For Surviors. Show your love with a Light Blue Ribbon. Lightweight, Classic fit, Double-needle sleeve and bo

Clean the documents to make them unicode.

In [51]:
# def clean_documents(documents):
#     documents_clean = []
#     for text in documents:
#         text = BeautifulSoup(text, 'html.parser').getText()
#         documents_clean.append(text)

#     return documents_clean

# documents = clean_documents(documents)

In [52]:
# print(documents)

## Option 1: 
cosine similarity of stems and ngrams (Tf-idf and document similarity)

### Create tfidf vector from documents

In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text

with open('stop_words.txt','r') as f:
    extra_stop_words = f.read().split('\n')

print(extra_stop_words)

my_stop_words = text.ENGLISH_STOP_WORDS.union(extra_stop_words)
#print(my_stop_words)

tfidf_vectorizer = TfidfVectorizer(max_df=0.02, max_features=10000000,
                                 min_df=0, stop_words=my_stop_words,
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

%time tfidf_matrix = tfidf_vectorizer.fit_transform([document['document'] for document in document_data])
#print(tfidf_matrix[0])
print(tfidf_matrix.shape)

terms = tfidf_vectorizer.get_feature_names()
print(len(terms))
#print(stemmed_documents[99])

print(terms[247])



['logo', 'shirt', 'tshirt', 'vintag', 'retro', 'movie', 'officially', 'licensed', 'official', 'offici', 'licens', 'graphic', 'artwork', 'gift', 'men', 'women', '']


  'stop_words.' % sorted(inconsistent))


CPU times: user 4.82 s, sys: 38.1 ms, total: 4.86 s
Wall time: 4.84 s
(16010, 74640)
74640
adjust


In [54]:
# Testing cosine similarity comparisons

from sklearn.metrics.pairwise import cosine_similarity
#print(cosine_similarity(tfidf_matrix))
# print(tfidf_matrix[0])
# print(tfidf_matrix[1])
# print(tfidf_matrix[2])

start = 0
length = 1
dist = cosine_similarity(tfidf_matrix[start:start+length],tfidf_matrix)
print(dist[0])
for doc_vector_index, doc_vector in enumerate(dist):
    print("checking {} for similarities...".format(document_data[start + doc_vector_index]['asin']))
    print("{}".format(document_data[start + doc_vector_index]['document']))
    for similarity_index, doc_similarity in enumerate(doc_vector):
        if doc_similarity > .2:
            print("{} {}".format(doc_similarity, document_data[similarity_index]['document']))
        

# print(tfidf_matrix.shape)
# print(dist)

[1. 0. 0. ... 0. 0. 0.]
checking B07X97D31R for similarities...
support cancer shirt prostate cancer awareness tshirt
1.0 support cancer shirt prostate cancer awareness tshirt
0.22020866476987808 breast cancer awareness support tee
0.2165820341351916 strength against cancer vintage childhood cancer awareness t-shirt
0.21020493217342937 prostate cancer sucks dinosaur trex blue ribbon awareness
0.2140131736540844 breast cancer awareness shirt breast cancer shirts for women t-shirt
0.3377413328599886 i wear light blue for my dad prostate cancer awareness shirt
0.35034916405125105 his fight is my fight i prostate cancer awareness fight gift t-shirt
0.21563231640052763 fuck cancer tshirt awareness for cancer survivor gifts t-shirt
0.3755102199922954 i wear blue for my uncle prostate cancer awareness shirt
0.42907667964294915 wolf still here still fighting prostate cancer awareness t-shirt


### Compute cosine similarity between all doc vectors, and create niches

In [55]:
from sklearn.metrics.pairwise import cosine_similarity
import pprint
pp = pprint.PrettyPrinter(indent=4)

niches = {}

dist = cosine_similarity(tfidf_matrix)

for doc_vector_index, doc_vector in enumerate(dist):
#     if doc_vector_index > 100:
#         break

    niches[doc_vector_index] = {}
    niches[doc_vector_index]['similar_docs'] = set()
    niches[doc_vector_index]['percent_sales_ranks'] = 0.0
    niches[doc_vector_index]['average_sales_rank'] = 0
    niches[doc_vector_index]['hot'] = False
    niches[doc_vector_index]['consumed'] = False
    for similar_doc_index, similar_doc in enumerate(doc_vector):
        if similar_doc > .2:
            niches[doc_vector_index]['similar_docs'].add(similar_doc_index)
            # if doc_vector_index != similarity_index and similarity_index in niches:
                
    # if float(len(niches[doc_vector_index].intersection(niches[similarity_index])))/float(min(len(niches[similarity_index]),len(niches[doc_vector_index]))) > 0.75

    for similar_doc in niches[doc_vector_index]['similar_docs']:
        if doc_vector_index != similar_doc and similar_doc in niches and not niches[similar_doc]['consumed']:

            intersecting_docs = niches[doc_vector_index]['similar_docs'].intersection(niches[similar_doc]['similar_docs'])
            smaller_niche = min(len(niches[similar_doc]['similar_docs']),len(niches[doc_vector_index]['similar_docs']))                                                                          

            if (float(len(intersecting_docs))/float(smaller_niche)) > 0.75:
#                 print("these two share more than 75% of items")
#                 print(float(len(niches[doc_vector_index]['similar_docs'])))
#                 print(doc_vector_index)
#                 print(niches[doc_vector_index]['similar_docs'])
#                 print(document_data[doc_vector_index]['document'])
#                 print(float(len(niches[similar_doc])))
#                 print(similar_doc)
#                 print(niches[similar_doc]['similar_docs'])
#                 print(document_data[similar_doc]['document'])
#                 for item in niches[similar_doc]:
#                     print(item)
#                     print(document_data[item]['document'])
                if len(niches[doc_vector_index]['similar_docs']) < len(niches[similar_doc]['similar_docs']):
                    print("consuming {}".format(doc_vector_index))
                    niches[doc_vector_index]['consumed'] = True
                else:
                    print("consuming {}".format(similar_doc))
                    niches[similar_doc]['consumed'] = True
                
                    

# pp.pprint(niches)

    
            
            


consuming 6
consuming 26
consuming 35
consuming 41
consuming 23
consuming 38
consuming 57
consuming 48
consuming 63
consuming 60
consuming 69
consuming 58
consuming 70
consuming 80
consuming 85
consuming 3
consuming 55
consuming 51
consuming 62
consuming 95
consuming 99
consuming 107
consuming 110
consuming 82
consuming 66
consuming 124
consuming 118
consuming 119
consuming 130
consuming 121
consuming 125
consuming 89
consuming 135
consuming 91
consuming 54
consuming 157
consuming 134
consuming 158
consuming 162
consuming 15
consuming 167
consuming 147
consuming 169
consuming 155
consuming 166
consuming 174
consuming 176
consuming 177
consuming 178
consuming 168
consuming 182
consuming 190
consuming 194
consuming 161
consuming 209
consuming 212
consuming 214
consuming 215
consuming 217
consuming 198
consuming 228
consuming 233
consuming 237
consuming 165
consuming 252
consuming 164
consuming 257
consuming 259
consuming 239
consuming 264
consuming 242
consuming 271
consuming 262
consumi

consuming 1254
consuming 910
consuming 1419
consuming 1423
consuming 1206
consuming 1427
consuming 1428
consuming 1430
consuming 1431
consuming 1158
consuming 923
consuming 884
consuming 1443
consuming 1445
consuming 1447
consuming 369
consuming 1376
consuming 1453
consuming 78
consuming 1455
consuming 1456
consuming 1197
consuming 1462
consuming 625
consuming 1426
consuming 1465
consuming 874
consuming 1468
consuming 467
consuming 1469
consuming 365
consuming 1475
consuming 1476
consuming 1479
consuming 1480
consuming 1337
consuming 1401
consuming 1485
consuming 1486
consuming 1492
consuming 1494
consuming 1446
consuming 1139
consuming 1501
consuming 797
consuming 1208
consuming 1507
consuming 1508
consuming 1509
consuming 1512
consuming 1513
consuming 1515
consuming 1141
consuming 1517
consuming 1518
consuming 1523
consuming 1523
consuming 204
consuming 1525
consuming 1527
consuming 1383
consuming 1529
consuming 1538
consuming 1170
consuming 1488
consuming 1544
consuming 1545
consumi

consuming 2400
consuming 2409
consuming 391
consuming 2088
consuming 1130
consuming 2318
consuming 2416
consuming 2417
consuming 2417
consuming 2417
consuming 2418
consuming 2421
consuming 2423
consuming 2423
consuming 2306
consuming 129
consuming 2432
consuming 2433
consuming 2434
consuming 2435
consuming 2438
consuming 2350
consuming 1305
consuming 2371
consuming 2445
consuming 2446
consuming 1328
consuming 2448
consuming 265
consuming 2450
consuming 1156
consuming 1696
consuming 2457
consuming 2460
consuming 2461
consuming 2463
consuming 1651
consuming 2465
consuming 2466
consuming 2468
consuming 2468
consuming 2472
consuming 2475
consuming 1415
consuming 2477
consuming 2478
consuming 2424
consuming 2325
consuming 2483
consuming 2331
consuming 1939
consuming 532
consuming 488
consuming 2304
consuming 2491
consuming 2492
consuming 2395
consuming 2498
consuming 2481
consuming 2420
consuming 2506
consuming 2507
consuming 2509
consuming 2510
consuming 2510
consuming 2516
consuming 2521


consuming 3300
consuming 2176
consuming 3306
consuming 1296
consuming 3308
consuming 3309
consuming 2612
consuming 3312
consuming 3130
consuming 1759
consuming 3320
consuming 3321
consuming 3322
consuming 3282
consuming 3324
consuming 3326
consuming 645
consuming 3207
consuming 3329
consuming 3330
consuming 3331
consuming 3332
consuming 755
consuming 3333
consuming 3334
consuming 2112
consuming 3336
consuming 3338
consuming 3338
consuming 3338
consuming 3249
consuming 3342
consuming 1129
consuming 3345
consuming 3347
consuming 3348
consuming 2459
consuming 355
consuming 2484
consuming 3358
consuming 3358
consuming 2356
consuming 3361
consuming 2824
consuming 3135
consuming 3363
consuming 3364
consuming 3365
consuming 820
consuming 1297
consuming 3370
consuming 3370
consuming 3261
consuming 3241
consuming 3374
consuming 3377
consuming 3379
consuming 3380
consuming 3386
consuming 457
consuming 3389
consuming 3392
consuming 1791
consuming 3394
consuming 3395
consuming 3395
consuming 3396


consuming 2627
consuming 4126
consuming 4129
consuming 4130
consuming 4131
consuming 3931
consuming 4135
consuming 4137
consuming 4137
consuming 3908
consuming 4139
consuming 4139
consuming 3166
consuming 4141
consuming 4142
consuming 4144
consuming 4145
consuming 4145
consuming 4145
consuming 4146
consuming 4149
consuming 4069
consuming 3154
consuming 1720
consuming 4158
consuming 4161
consuming 4162
consuming 4164
consuming 4165
consuming 1690
consuming 4169
consuming 4173
consuming 4175
consuming 3879
consuming 4183
consuming 4185
consuming 3430
consuming 4052
consuming 4189
consuming 4184
consuming 4192
consuming 4193
consuming 1972
consuming 4035
consuming 4196
consuming 3606
consuming 4202
consuming 4204
consuming 4204
consuming 2296
consuming 4206
consuming 4207
consuming 4207
consuming 4208
consuming 4209
consuming 4209
consuming 4210
consuming 2487
consuming 4072
consuming 4213
consuming 453
consuming 2473
consuming 4217
consuming 2698
consuming 4219
consuming 4224
consuming 4

consuming 3894
consuming 3304
consuming 4824
consuming 1764
consuming 4825
consuming 4825
consuming 4825
consuming 4535
consuming 4830
consuming 4473
consuming 4832
consuming 4250
consuming 4835
consuming 4835
consuming 4837
consuming 4838
consuming 2523
consuming 4839
consuming 4840
consuming 3240
consuming 245
consuming 4843
consuming 4844
consuming 4845
consuming 4845
consuming 2119
consuming 4761
consuming 4593
consuming 4852
consuming 4853
consuming 4725
consuming 4859
consuming 4138
consuming 4861
consuming 4095
consuming 4862
consuming 4862
consuming 4863
consuming 4635
consuming 4865
consuming 4866
consuming 4870
consuming 4093
consuming 4871
consuming 4667
consuming 4407
consuming 4876
consuming 4877
consuming 4878
consuming 4557
consuming 4883
consuming 4884
consuming 4885
consuming 4886
consuming 4887
consuming 4888
consuming 4642
consuming 4857
consuming 4893
consuming 4894
consuming 4896
consuming 3740
consuming 4897
consuming 4899
consuming 4901
consuming 4901
consuming 4

consuming 5588
consuming 5591
consuming 5072
consuming 5543
consuming 5596
consuming 5597
consuming 5598
consuming 5600
consuming 5602
consuming 5602
consuming 5602
consuming 5603
consuming 5604
consuming 5605
consuming 5605
consuming 5605
consuming 5607
consuming 5384
consuming 5472
consuming 520
consuming 5611
consuming 5611
consuming 5611
consuming 5612
consuming 5612
consuming 5612
consuming 5613
consuming 5614
consuming 5614
consuming 5615
consuming 1233
consuming 1072
consuming 5619
consuming 5620
consuming 5620
consuming 5620
consuming 5381
consuming 5094
consuming 5623
consuming 5340
consuming 5373
consuming 5626
consuming 5627
consuming 2349
consuming 5375
consuming 5471
consuming 5638
consuming 5639
consuming 5595
consuming 5642
consuming 5229
consuming 5644
consuming 5645
consuming 3229
consuming 2753
consuming 5474
consuming 2151
consuming 5066
consuming 5652
consuming 5104
consuming 5654
consuming 5592
consuming 5656
consuming 5658
consuming 2235
consuming 5660
consuming 5

consuming 6229
consuming 6229
consuming 6229
consuming 6230
consuming 6231
consuming 6232
consuming 6234
consuming 6235
consuming 5481
consuming 6240
consuming 6241
consuming 6242
consuming 6243
consuming 4978
consuming 6246
consuming 6131
consuming 6097
consuming 6252
consuming 6199
consuming 6255
consuming 6256
consuming 6257
consuming 6250
consuming 5772
consuming 525
consuming 6262
consuming 6218
consuming 3427
consuming 1889
consuming 6270
consuming 6271
consuming 6274
consuming 3820
consuming 5839
consuming 6278
consuming 6041
consuming 5841
consuming 6140
consuming 6283
consuming 6283
consuming 6283
consuming 5236
consuming 5958
consuming 6289
consuming 6291
consuming 6292
consuming 6293
consuming 6295
consuming 6296
consuming 6298
consuming 6298
consuming 6299
consuming 6301
consuming 6301
consuming 6301
consuming 6302
consuming 5683
consuming 5894
consuming 6306
consuming 6307
consuming 5123
consuming 6310
consuming 4927
consuming 6312
consuming 6312
consuming 6314
consuming 6

consuming 6947
consuming 6948
consuming 6764
consuming 6953
consuming 6957
consuming 6957
consuming 6958
consuming 6759
consuming 4284
consuming 6966
consuming 6967
consuming 6968
consuming 6969
consuming 6970
consuming 6972
consuming 6973
consuming 6974
consuming 6976
consuming 6978
consuming 6978
consuming 6979
consuming 6856
consuming 6982
consuming 6752
consuming 6983
consuming 6987
consuming 6988
consuming 6992
consuming 6993
consuming 4492
consuming 6996
consuming 6996
consuming 6996
consuming 6999
consuming 6439
consuming 6840
consuming 7004
consuming 7005
consuming 7006
consuming 6869
consuming 7009
consuming 6376
consuming 191
consuming 7011
consuming 7012
consuming 7012
consuming 7012
consuming 7013
consuming 7014
consuming 7015
consuming 7016
consuming 6490
consuming 6921
consuming 5157
consuming 7025
consuming 7025
consuming 7026
consuming 2167
consuming 4798
consuming 7031
consuming 6522
consuming 7033
consuming 7035
consuming 5675
consuming 7044
consuming 5618
consuming 5

consuming 7698
consuming 7698
consuming 7698
consuming 7699
consuming 7700
consuming 7700
consuming 7407
consuming 7701
consuming 7703
consuming 7704
consuming 7705
consuming 6891
consuming 7539
consuming 7710
consuming 7710
consuming 7710
consuming 7711
consuming 7713
consuming 7714
consuming 7715
consuming 7657
consuming 5647
consuming 7722
consuming 7724
consuming 7232
consuming 7727
consuming 7729
consuming 4875
consuming 7731
consuming 7732
consuming 7734
consuming 7735
consuming 7736
consuming 7737
consuming 7738
consuming 7739
consuming 7741
consuming 7744
consuming 7746
consuming 53
consuming 6820
consuming 7753
consuming 7754
consuming 7755
consuming 7755
consuming 7757
consuming 7759
consuming 7759
consuming 7141
consuming 6702
consuming 7767
consuming 4728
consuming 3981
consuming 7769
consuming 7770
consuming 7349
consuming 4288
consuming 7773
consuming 7775
consuming 7776
consuming 7779
consuming 7781
consuming 7781
consuming 7782
consuming 7782
consuming 7782
consuming 77

consuming 8359
consuming 8312
consuming 8361
consuming 8363
consuming 8364
consuming 8190
consuming 8369
consuming 8369
consuming 7624
consuming 8372
consuming 8343
consuming 8375
consuming 8376
consuming 8377
consuming 8377
consuming 8381
consuming 6146
consuming 136
consuming 3458
consuming 8384
consuming 3738
consuming 8386
consuming 8389
consuming 482
consuming 8392
consuming 5819
consuming 8395
consuming 8396
consuming 8398
consuming 7116
consuming 7515
consuming 8403
consuming 8406
consuming 8406
consuming 8322
consuming 8411
consuming 8411
consuming 8413
consuming 8414
consuming 8415
consuming 8416
consuming 6011
consuming 8418
consuming 8418
consuming 8419
consuming 8419
consuming 8422
consuming 8423
consuming 6907
consuming 8426
consuming 8428
consuming 8428
consuming 3046
consuming 8431
consuming 2338
consuming 8434
consuming 8435
consuming 8436
consuming 8438
consuming 8441
consuming 8441
consuming 8441
consuming 8407
consuming 3328
consuming 8447
consuming 8267
consuming 81

consuming 5729
consuming 9094
consuming 9094
consuming 9094
consuming 9095
consuming 9097
consuming 9098
consuming 9099
consuming 9100
consuming 2666
consuming 7449
consuming 8025
consuming 8258
consuming 9105
consuming 8490
consuming 7499
consuming 2524
consuming 9108
consuming 9109
consuming 9111
consuming 9112
consuming 5326
consuming 9117
consuming 322
consuming 6413
consuming 9120
consuming 2127
consuming 9123
consuming 9123
consuming 8995
consuming 9125
consuming 7596
consuming 4599
consuming 9135
consuming 9136
consuming 9137
consuming 9139
consuming 8866
consuming 8859
consuming 9143
consuming 9144
consuming 9145
consuming 9147
consuming 9148
consuming 375
consuming 9152
consuming 9152
consuming 9153
consuming 9155
consuming 9009
consuming 9160
consuming 9163
consuming 9163
consuming 9165
consuming 9167
consuming 9168
consuming 9169
consuming 9170
consuming 8648
consuming 9172
consuming 9172
consuming 8879
consuming 9175
consuming 9176
consuming 9176
consuming 9177
consuming 18

consuming 9797
consuming 9797
consuming 9588
consuming 9798
consuming 9798
consuming 9799
consuming 9697
consuming 9803
consuming 9805
consuming 8433
consuming 9808
consuming 9810
consuming 6775
consuming 9276
consuming 9440
consuming 9814
consuming 9814
consuming 9730
consuming 9670
consuming 9818
consuming 9818
consuming 9819
consuming 8280
consuming 9823
consuming 9826
consuming 9817
consuming 9828
consuming 9830
consuming 9833
consuming 3742
consuming 9836
consuming 2695
consuming 9841
consuming 9842
consuming 9844
consuming 248
consuming 2727
consuming 9850
consuming 9851
consuming 6511
consuming 9853
consuming 9835
consuming 9856
consuming 9843
consuming 5862
consuming 9860
consuming 9824
consuming 9866
consuming 9867
consuming 1425
consuming 9868
consuming 8922
consuming 9870
consuming 9848
consuming 9872
consuming 9873
consuming 7841
consuming 2348
consuming 9880
consuming 2830
consuming 9882
consuming 3270
consuming 9857
consuming 9885
consuming 199
consuming 7454
consuming 98

consuming 2351
consuming 10553
consuming 4155
consuming 10556
consuming 10558
consuming 10561
consuming 9029
consuming 10562
consuming 976
consuming 10564
consuming 3911
consuming 10567
consuming 10567
consuming 7881
consuming 10569
consuming 2361
consuming 10551
consuming 10573
consuming 10574
consuming 10575
consuming 9642
consuming 8924
consuming 10532
consuming 10577
consuming 10578
consuming 1595
consuming 9686
consuming 2557
consuming 10580
consuming 512
consuming 9862
consuming 1037
consuming 6448
consuming 2783
consuming 2676
consuming 10486
consuming 10585
consuming 10586
consuming 10586
consuming 10166
consuming 10591
consuming 10592
consuming 10593
consuming 4228
consuming 10597
consuming 9286
consuming 10602
consuming 10602
consuming 10603
consuming 10604
consuming 10605
consuming 10607
consuming 10001
consuming 10610
consuming 10611
consuming 10612
consuming 10304
consuming 10144
consuming 10615
consuming 6273
consuming 10618
consuming 10619
consuming 10620
consuming 10621

consuming 10696
consuming 11235
consuming 3703
consuming 11240
consuming 11240
consuming 11241
consuming 11242
consuming 743
consuming 9906
consuming 8412
consuming 8986
consuming 6339
consuming 11252
consuming 11259
consuming 11261
consuming 146
consuming 1710
consuming 8119
consuming 6838
consuming 4672
consuming 7762
consuming 2022
consuming 9064
consuming 9084
consuming 4315
consuming 11267
consuming 3378
consuming 11269
consuming 11270
consuming 9227
consuming 11272
consuming 11274
consuming 11274
consuming 717
consuming 11278
consuming 11279
consuming 11279
consuming 11178
consuming 11281
consuming 11284
consuming 11285
consuming 11286
consuming 11287
consuming 11289
consuming 11292
consuming 11293
consuming 3652
consuming 10018
consuming 11297
consuming 11298
consuming 11299
consuming 11299
consuming 11183
consuming 11135
consuming 1377
consuming 11305
consuming 7497
consuming 11307
consuming 11308
consuming 9786
consuming 9309
consuming 11315
consuming 11317
consuming 11318
con

consuming 11323
consuming 8558
consuming 11185
consuming 7785
consuming 11937
consuming 11937
consuming 11300
consuming 11939
consuming 11506
consuming 10450
consuming 11942
consuming 11901
consuming 11946
consuming 11949
consuming 11950
consuming 11951
consuming 11952
consuming 7875
consuming 11954
consuming 11461
consuming 11956
consuming 4633
consuming 11958
consuming 11731
consuming 11960
consuming 7167
consuming 939
consuming 11963
consuming 292
consuming 11966
consuming 11966
consuming 11967
consuming 11968
consuming 11618
consuming 11970
consuming 10868
consuming 11972
consuming 11972
consuming 11972
consuming 6630
consuming 11974
consuming 11974
consuming 11974
consuming 11975
consuming 11976
consuming 11976
consuming 11530
consuming 11979
consuming 11982
consuming 11983
consuming 11984
consuming 8824
consuming 11986
consuming 11986
consuming 2841
consuming 11990
consuming 11604
consuming 11993
consuming 11994
consuming 11994
consuming 11959
consuming 11969
consuming 11998
cons

consuming 12483
consuming 12492
consuming 12625
consuming 12626
consuming 12627
consuming 12628
consuming 12630
consuming 12632
consuming 12633
consuming 12634
consuming 12634
consuming 10624
consuming 12636
consuming 12636
consuming 9467
consuming 12639
consuming 12642
consuming 12643
consuming 12644
consuming 12645
consuming 12647
consuming 12649
consuming 6903
consuming 12650
consuming 12652
consuming 12131
consuming 12654
consuming 12604
consuming 4971
consuming 12657
consuming 4566
consuming 12615
consuming 12660
consuming 12662
consuming 12662
consuming 6608
consuming 12665
consuming 12666
consuming 3446
consuming 12667
consuming 12668
consuming 12669
consuming 12623
consuming 12569
consuming 12674
consuming 12676
consuming 12678
consuming 12679
consuming 12682
consuming 12683
consuming 12685
consuming 12157
consuming 12687
consuming 12687
consuming 12688
consuming 7444
consuming 12690
consuming 12691
consuming 12504
consuming 12691
consuming 12691
consuming 12637
consuming 10449

consuming 12924
consuming 13079
consuming 13255
consuming 13256
consuming 13257
consuming 13145
consuming 13261
consuming 13261
consuming 13261
consuming 13261
consuming 13263
consuming 13265
consuming 13254
consuming 13267
consuming 13267
consuming 13010
consuming 13269
consuming 13269
consuming 13271
consuming 13273
consuming 13276
consuming 13279
consuming 13222
consuming 13282
consuming 13282
consuming 13282
consuming 13282
consuming 13283
consuming 13284
consuming 13284
consuming 13284
consuming 13285
consuming 13285
consuming 2984
consuming 11319
consuming 13289
consuming 13292
consuming 13294
consuming 13295
consuming 13027
consuming 13299
consuming 13300
consuming 13300
consuming 13301
consuming 13302
consuming 7376
consuming 12191
consuming 13304
consuming 13305
consuming 13305
consuming 13247
consuming 13307
consuming 12025
consuming 13308
consuming 12192
consuming 3807
consuming 4085
consuming 13311
consuming 13194
consuming 13313
consuming 13314
consuming 13316
consuming 13

consuming 10582
consuming 13868
consuming 13870
consuming 13870
consuming 13870
consuming 13871
consuming 13871
consuming 13871
consuming 13873
consuming 13875
consuming 13875
consuming 13875
consuming 6826
consuming 12473
consuming 13878
consuming 13878
consuming 13878
consuming 13879
consuming 9940
consuming 13880
consuming 13886
consuming 13887
consuming 13887
consuming 3633
consuming 13890
consuming 13890
consuming 13890
consuming 13893
consuming 13893
consuming 13894
consuming 13895
consuming 13896
consuming 13897
consuming 13897
consuming 13898
consuming 13899
consuming 13531
consuming 8355
consuming 13904
consuming 13905
consuming 13907
consuming 13908
consuming 13909
consuming 10782
consuming 9897
consuming 12986
consuming 13916
consuming 13381
consuming 8512
consuming 13921
consuming 13922
consuming 13741
consuming 4621
consuming 5182
consuming 5665
consuming 5240
consuming 13929
consuming 9602
consuming 13861
consuming 4200
consuming 13576
consuming 11268
consuming 13935
cons

consuming 14462
consuming 14464
consuming 14465
consuming 14465
consuming 14465
consuming 14465
consuming 14466
consuming 14467
consuming 6443
consuming 14469
consuming 14470
consuming 14470
consuming 14471
consuming 12670
consuming 14473
consuming 14474
consuming 14477
consuming 14478
consuming 14478
consuming 14479
consuming 14480
consuming 14481
consuming 14133
consuming 14484
consuming 14485
consuming 9030
consuming 13484
consuming 13627
consuming 14488
consuming 14491
consuming 14492
consuming 14492
consuming 14492
consuming 14494
consuming 14495
consuming 4178
consuming 14498
consuming 14499
consuming 14502
consuming 14504
consuming 13016
consuming 14506
consuming 14507
consuming 13615
consuming 13620
consuming 14509
consuming 13435
consuming 14513
consuming 14514
consuming 14515
consuming 14516
consuming 11944
consuming 14516
consuming 14516
consuming 14518
consuming 14519
consuming 14520
consuming 14521
consuming 14312
consuming 14524
consuming 14524
consuming 5821
consuming 14

consuming 5330
consuming 14965
consuming 15020
consuming 15021
consuming 15021
consuming 15021
consuming 15021
consuming 15022
consuming 15023
consuming 15024
consuming 15024
consuming 14942
consuming 15026
consuming 15027
consuming 15027
consuming 14880
consuming 15029
consuming 5974
consuming 11249
consuming 4914
consuming 14862
consuming 15035
consuming 14995
consuming 10209
consuming 15041
consuming 15041
consuming 15042
consuming 13632
consuming 13491
consuming 13589
consuming 12601
consuming 5078
consuming 15051
consuming 15052
consuming 9973
consuming 970
consuming 14883
consuming 15057
consuming 15057
consuming 15057
consuming 14611
consuming 15061
consuming 15062
consuming 15062
consuming 15064
consuming 15066
consuming 15068
consuming 9432
consuming 12229
consuming 4954
consuming 15079
consuming 15081
consuming 15081
consuming 15082
consuming 15082
consuming 15083
consuming 15083
consuming 15084
consuming 7955
consuming 14700
consuming 15086
consuming 15045
consuming 15089
co

consuming 15610
consuming 15610
consuming 15612
consuming 15615
consuming 15616
consuming 15617
consuming 15617
consuming 15437
consuming 15619
consuming 15618
consuming 15322
consuming 15376
consuming 15407
consuming 15625
consuming 15050
consuming 15627
consuming 15628
consuming 14609
consuming 15630
consuming 15632
consuming 15633
consuming 15634
consuming 15512
consuming 15637
consuming 15638
consuming 15638
consuming 15033
consuming 710
consuming 15640
consuming 15641
consuming 15641
consuming 14763
consuming 15221
consuming 15644
consuming 15511
consuming 15647
consuming 15648
consuming 15650
consuming 15654
consuming 12019
consuming 15656
consuming 15656
consuming 15658
consuming 15659
consuming 15660
consuming 15352
consuming 15585
consuming 664
consuming 15056
consuming 9214
consuming 15657
consuming 14783
consuming 14724
consuming 15670
consuming 4645
consuming 15673
consuming 15673
consuming 15674
consuming 15492
consuming 15674
consuming 15675
consuming 15285
consuming 1348

### Analyze and export niches to textfile

In [61]:
def analyze_niche(niche):
    total_sales_rank = 0
    num_sales_ranks = 0
    for document in niche['similar_docs']:
        if document_data[document]['salesRank'] != "NA":
            num_sales_ranks += 1
            total_sales_rank += int(document_data[document]['salesRank'])
            if 'best_sales_rank' in niche and int(document_data[document]['salesRank']) < int(niche['best_sales_rank']):
                niche['best_sales_rank'] = document_data[document]['salesRank']
            elif 'best_sales_rank' not in niche:
                niche['best_sales_rank'] = document_data[document]['salesRank']
                
    niche['percent_sales_ranks'] = float(num_sales_ranks) / float(len(niche['similar_docs']))
    if num_sales_ranks > 0:
        niche['average_sales_rank'] = float(total_sales_rank) / float(num_sales_ranks)

def hot_niche(niche):
    if len(niche['similar_docs']) > 5:

        if niche['percent_sales_ranks'] > 0.8 and niche['average_sales_rank'] < 1500000:
            return True
    
        if niche['percent_sales_ranks'] > 0.5 and niche['average_sales_rank'] < 900000:
            return True
    
    return False

num_hot_niches = 0
for niche in niches:
    if len(niches[niche]['similar_docs']) > 0 and niches[niche]['consumed'] is False:
        analyze_niche(niches[niche])
        if hot_niche(niches[niche]):
            num_hot_niches += 1
            niches[niche]['hot'] = True

print(num_hot_niches)
with open("niches.txt", 'w') as f:
    for niche in niches:
        if niches[niche]['hot'] and not niches[niche]['consumed']:
            f.write("number of documents in cluster: " + str(len(niches[niche]['similar_docs'])) + '\n')
            f.write("percent sales ranks: " + str(round(niches[niche]['percent_sales_ranks'],2)) + "\n")
            f.write("average sales rank: " + str(round(niches[niche]['average_sales_rank'],2)) + "\n")
            f.write("best sales rank: " + str(niches[niche]['best_sales_rank']) + "\n")
            for document in niches[niche]['similar_docs']:
                f.write(document_data[document]['asin'] + ', ')
            f.write("\n")
            for document in niches[niche]['similar_docs']:
                f.write(document_data[document]['document'] + "\n")
            f.write("\n")

61


## Option 2: 
soft cosine similarity using fasttext similarity matrix of single tokens (not good for mba because the exact text needs to be the same)

In [None]:
import gensim
# upgrade gensim if you can't import softcossim
from gensim.matutils import softcossim 
from gensim import corpora
import gensim.downloader as api
from gensim.utils import simple_preprocess # all this does is split a string into an array and clean out punctuation
print(gensim.__version__)
#> '3.6.0'

# Download the FastText model
og_fasttext_model300 = api.load('fasttext-wiki-news-subwords-300')
print("done")

In [None]:
print(documents[0])
stemmed_documents = [tokenize_and_stem(document) for document in documents ]
print(stemmed_documents[0])

# simple_preprocess

# Prepare a dictionary and a corpus.
dictionary = corpora.Dictionary(stemmed_documents) # should be using just documents
print(dictionary)
# Prepare the similarity matrix
similarity_matrix = fasttext_model300.similarity_matrix(dictionary, tfidf=None, threshold=0.0, exponent=2.0, nonzero_limit=100)
print(similarity_matrix)
# Convert the sentences into bag-of-words vectors.
sent_1 = dictionary.doc2bow(simple_preprocess(documents[0]))
print(sent_1)

In [None]:
print(dictionary.token2id)

In [None]:
print(similarity_matrix[7])

In [None]:
sent_1 = dictionary.doc2bow(simple_preprocess(documents[0]))
print(sent_1)
sent_2 = dictionary.doc2bow(simple_preprocess(documents[16]))
print(documents[0])
print(documents[18])
print(softcossim(sent_1, sent_2, similarity_matrix))

## K-means clustering

K-means isn't necessarily the best. You must initialize with a pre-determined number of clusters

In [None]:
from sklearn.cluster import KMeans

num_clusters = 5

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

## Hierarchial document clustering

In [None]:
from scipy.cluster.hierarchy import ward, dendrogram
import matplotlib.pyplot as plt


linkage_matrix = ward(dist) #define the linkage_matrix using ward clustering pre-computed distances

fig, ax = plt.subplots(figsize=(15, 20)) # set size
ax = dendrogram(linkage_matrix, orientation="right", labels=asins);

plt.tick_params(\
    axis= 'x',          # changes apply to the x-axis
    which='both',      # both major and minor ticks are affected
    bottom='off',      # ticks along the bottom edge are off
    top='off',         # ticks along the top edge are off
    labelbottom='off')

plt.tight_layout() #show plot with tight layout

#uncomment below to save figure
plt.savefig('ward_clusters.png', dpi=200) #save figure as ward_clusters