# Clustering Merch Data Shirts
1. read in data file and create large list of indexed data
2. remove texts that pop up in all documents (i.e. stopwords / phrases)
3. clean the documents
4. stem and tokenize the words in each document
5. run all documents through tf-idf vectorizer

To Do:
- read in datafiles better (use different delimiter?)
- clean up input documents better
- possibly create my own tfidf vectorizor that works as a sum rather than a median


## Initialize steps

First we import the necessary libraries.

In [2]:
import nltk
from nltk.stem.snowball import SnowballStemmer
from bs4 import BeautifulSoup
import re
import os
from sklearn import feature_extraction

Import the stemmer and then tokenize and stem the text.

In [3]:
# load nltk's SnowballStemmer as variabled 'stemmer'
stemmer = SnowballStemmer("english")

def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if not re.search('[^a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems


def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation) NOTE: this might be bad
    for token in tokens:
        if not re.search('[^a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens
print(tokenize_and_stem('artwork'))


['artwork']


Now we import the shirt data file and create documents from it. Each document is the title of the shirt followed by it's description.

In [4]:
def clean_string(document):
    document = document.lower()
    default_1 = "Lightweight, Classic fit, Double-needle sleeve and bottom hem".lower()
    default_2 = "Solid colors: 100% Cotton; Heather Grey: 90% Cotton, 10% Polyester; All Other Heathers: 50% Cotton, 50% Polyester Imported Machine wash cold with like colors, dry low heat ".lower()
    document = document.replace(default_1, '').replace(default_2,'')
    
    document.replace("tee shirt","tshirt")
    document.replace("t-shirt", 'tshirt')
    document.replace(' t shirt', 'tshirt')
    document.replace("-", " ")
    
    document.replace("shirt", "")
    document.replace("tshirt", "")
    
    document.replace("officially licensed","")
    
    if re.compile("officially licensed [\w\s]+ (apparel|shirt)").search(document):
        document = re.sub(r'officially licensed [\w\s]+ (apparel|shirt)', '', document)
    
    if re.compile("official [\w\s]+merchandise").search(document):
        document = re.sub(r'official [\w\s]+merchandise', '', document)

    if re.compile("graphic [\w\s\-]+shirt").search(document):
        document = re.sub(r'graphic [\w\s\-]+shirt', '', document)
    
    
    document = BeautifulSoup(document, 'html.parser').getText()
    
    return document

document_data_dict = {}
def read_shirt_data_file(path):
    document_data = []
    with open(path, 'r') as data_file:
        for line in data_file:
            data = {}
            if len(line.replace("\"","").split('|')) == 11:
                for item in line.replace("\"","").split('|'):
                    data[item.split(':',1)[0]] = item.split(':',1)[1]
                data['document'] = clean_string(data['title']) # + ". " + data['description'])

                # Keep track of document data by asin
                if data['asin'] not in document_data_dict:
                    document_data_dict[data['asin']] = data
                    document_data.append(data)
                
    return document_data

document_data = read_shirt_data_file("shirts_newest_nt") # read_shirt_data_file("shirts_featured_nt")
print(document_data[0])
print(len(document_data))

{'salesRank': 'NA', 'asin': 'B07X97D31R', 'title': 'Support Cancer Shirt Prostate Cancer Awareness Tshirt', 'imgUrl': 'https://m.media-amazon.com/images/I/A13usaonutL._CLa%7C2140%2C2000%7C918G0EUHpmL.png%7C0%2C0%2C2140%2C2000%2B0.0%2C0.0%2C2140.0%2C2000.0._UX342_.png', 'trademarked': 'False', 'isMerch': 'True', 'date': 'August 30, 2019', 'unix': '1567123200', 'errorMessage': '[]', 'link': 'https://www.amazon.com/dp/B07X97D31R', 'description': 'Solid colors: 100% Cotton; Heather Grey: 90% Cotton, 10% Polyester; All Other Heathers: 50% Cotton, 50% Polyester Imported Machine wash cold with like colors, dry low heat Are you a Fighter who has or is fighting Prostate Cancer? This Motivational T-shirt is perfect for you. Great shirt for your Light Blue ribbon events, Hospital Visits or To support a loved one with Cancer. This Prostate Cancer Tshirt is a great Birthday or Christmas Gift For Surviors. Show your love with a Light Blue Ribbon. Lightweight, Classic fit, Double-needle sleeve and bo

## Option 1: 
cosine similarity of stems and ngrams (Tf-idf and document similarity)

### Create tfidf vector from documents

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text

with open('stop_words.txt','r') as f:
    extra_stop_words = f.read().split('\n')

print(extra_stop_words)

my_stop_words = text.ENGLISH_STOP_WORDS.union(extra_stop_words)
#print(my_stop_words)

tfidf_vectorizer = TfidfVectorizer(max_df=0.02, max_features=10000000,
                                 min_df=0, stop_words=my_stop_words,
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

%time tfidf_matrix = tfidf_vectorizer.fit_transform([document['document'] for document in document_data])
#print(tfidf_matrix[0])
print(tfidf_matrix.shape)

terms = tfidf_vectorizer.get_feature_names()
print(len(terms))
#print(stemmed_documents[99])

print(terms[247])



['logo', 'shirt', 'tshirt', 'vintag', 'retro', 'movie', 'officially', 'licensed', 'official', 'offici', 'licens', 'graphic', 'artwork', 'gift', 'men', 'women', '']


  'stop_words.' % sorted(inconsistent))


CPU times: user 4.65 s, sys: 42.7 ms, total: 4.7 s
Wall time: 4.69 s
(16010, 74640)
74640
adjust


In [54]:
# Testing cosine similarity comparisons

from sklearn.metrics.pairwise import cosine_similarity
#print(cosine_similarity(tfidf_matrix))
# print(tfidf_matrix[0])
# print(tfidf_matrix[1])
# print(tfidf_matrix[2])

start = 0
length = 1
dist = cosine_similarity(tfidf_matrix[start:start+length],tfidf_matrix)
print(dist[0])
for doc_vector_index, doc_vector in enumerate(dist):
    print("checking {} for similarities...".format(document_data[start + doc_vector_index]['asin']))
    print("{}".format(document_data[start + doc_vector_index]['document']))
    for similarity_index, doc_similarity in enumerate(doc_vector):
        if doc_similarity > .2:
            print("{} {}".format(doc_similarity, document_data[similarity_index]['document']))
        

# print(tfidf_matrix.shape)
# print(dist)

[1. 0. 0. ... 0. 0. 0.]
checking B07X97D31R for similarities...
support cancer shirt prostate cancer awareness tshirt
1.0 support cancer shirt prostate cancer awareness tshirt
0.22020866476987808 breast cancer awareness support tee
0.2165820341351916 strength against cancer vintage childhood cancer awareness t-shirt
0.21020493217342937 prostate cancer sucks dinosaur trex blue ribbon awareness
0.2140131736540844 breast cancer awareness shirt breast cancer shirts for women t-shirt
0.3377413328599886 i wear light blue for my dad prostate cancer awareness shirt
0.35034916405125105 his fight is my fight i prostate cancer awareness fight gift t-shirt
0.21563231640052763 fuck cancer tshirt awareness for cancer survivor gifts t-shirt
0.3755102199922954 i wear blue for my uncle prostate cancer awareness shirt
0.42907667964294915 wolf still here still fighting prostate cancer awareness t-shirt


### Compute cosine similarity between all doc vectors, and create niches

In [35]:
from sklearn.metrics.pairwise import cosine_similarity
import pprint
pp = pprint.PrettyPrinter(indent=4)

niches = {}

# dist = cosine_similarity(tfidf_matrix)

for doc_vector_index, doc_vector in enumerate(dist):
#     if doc_vector_index > 100:
#         break

    niches[doc_vector_index] = {}
    niches[doc_vector_index]['similar_docs'] = set()
    niches[doc_vector_index]['percent_sales_ranks'] = 0.0
    niches[doc_vector_index]['average_sales_rank'] = 0
    niches[doc_vector_index]['hot'] = False
    niches[doc_vector_index]['consumed'] = False
    for similar_doc_index, similar_doc in enumerate(doc_vector):
        if similar_doc > .25:
            niches[doc_vector_index]['similar_docs'].add(similar_doc_index)
            # if doc_vector_index != similarity_index and similarity_index in niches:
                
    # if float(len(niches[doc_vector_index].intersection(niches[similarity_index])))/float(min(len(niches[similarity_index]),len(niches[doc_vector_index]))) > 0.75
    if int(doc_vector_index) == 2719:
        print("now analyzing 2719")
        print(niches[doc_vector_index]['similar_docs'])
        print(document_data[doc_vector_index]['document'])
        print(niches[1841]['similar_docs'])
        print(document_data[1841]['document'])
    for similar_doc in niches[doc_vector_index]['similar_docs']:
        if int(doc_vector_index) == 2719 and int(similar_doc) == 1841:
            print("now analyzing 2749")
        if doc_vector_index != similar_doc and similar_doc in niches and not niches[similar_doc]['consumed']:

            intersecting_docs = niches[doc_vector_index]['similar_docs'].intersection(niches[similar_doc]['similar_docs'])
            smaller_niche = min(len(niches[similar_doc]['similar_docs']),len(niches[doc_vector_index]['similar_docs']))                                                                          

            if (float(len(intersecting_docs))/float(smaller_niche)) > 0.75:
#                 print("these two share more than 75% of items")
#                 print(float(len(niches[doc_vector_index]['similar_docs'])))
#                 print(doc_vector_index)
#                 print(niches[doc_vector_index]['similar_docs'])
#                 print(document_data[doc_vector_index]['document'])
#                 print(float(len(niches[similar_doc]['similar_docs'])))
#                 print(similar_doc)
#                 print(niches[similar_doc]['similar_docs'])
#                 print(document_data[similar_doc]['document'])
#                 for item in niches[similar_doc]:
#                     print(item)
#                     print(document_data[item]['document'])
                if len(niches[doc_vector_index]['similar_docs']) >= len(niches[similar_doc]['similar_docs']):
                    niches[doc_vector_index]['consumed'] = True
                else:
                    niches[similar_doc]['consumed'] = True


                    

# pp.pprint(niches)

    
            
            


now analyzing 2719
{13567, 10021, 5990, 8007, 5128, 2506, 1005, 7597, 6132, 6777, 2719}
straight outta 2016 3rd birthday 3 years age vintage gifts t-shirt
{1841, 13567}
established since september 2006 straight outta 13 years old


### Analyze and export niches to textfile

In [36]:
def analyze_niche(niche):
    total_sales_rank = 0
    num_sales_ranks = 0
    for document in niche['similar_docs']:
        if document_data[document]['salesRank'] != "NA":
            num_sales_ranks += 1
            total_sales_rank += int(document_data[document]['salesRank'])
            if 'best_sales_rank' in niche and int(document_data[document]['salesRank']) < int(niche['best_sales_rank']):
                niche['best_sales_rank'] = document_data[document]['salesRank']
            elif 'best_sales_rank' not in niche:
                niche['best_sales_rank'] = document_data[document]['salesRank']
                
    niche['percent_sales_ranks'] = float(num_sales_ranks) / float(len(niche['similar_docs']))
    if num_sales_ranks > 0:
        niche['average_sales_rank'] = float(total_sales_rank) / float(num_sales_ranks)

def hot_niche(niche):
    if len(niche['similar_docs']) > 5:

        if niche['percent_sales_ranks'] > 0.8 and niche['average_sales_rank'] < 1500000:
            return True
    
        if niche['percent_sales_ranks'] > 0.5 and niche['average_sales_rank'] < 900000:
            return True
    
    return False

num_hot_niches = 0
for niche in niches:
    if len(niches[niche]['similar_docs']) > 0 and niches[niche]['consumed'] is False:
        analyze_niche(niches[niche])
        if hot_niche(niches[niche]):
            num_hot_niches += 1
            niches[niche]['hot'] = True

print(num_hot_niches)
with open("niches.txt", 'w') as f:
    for niche in niches:
        if niches[niche]['hot'] and not niches[niche]['consumed']:
            f.write('index: ' + str(niche) + '\n')
            f.write("number of documents in cluster: " + str(len(niches[niche]['similar_docs'])) + '\n')
            f.write("percent sales ranks: " + str(round(niches[niche]['percent_sales_ranks'],2)) + "\n")
            f.write("average sales rank: " + str(round(niches[niche]['average_sales_rank'],2)) + "\n")
            f.write("best sales rank: " + str(niches[niche]['best_sales_rank']) + "\n")
            for document in niches[niche]['similar_docs']:
                f.write(document_data[document]['asin'] + ', ')
            f.write("\n")
            for document in niches[niche]['similar_docs']:
                f.write(document_data[document]['document'] + "\n")
            f.write("\n")

63
