In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.cluster import KMeans

In [2]:
# Basic NLP with NLTK
from nltk.corpus import brown
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from string import punctuation
import json
import nltk
# Fetch every NLTK resource the cells below rely on so the notebook survives
# Restart & Run All on a fresh environment. Already-installed packages are
# only reported as up to date, so re-running this cell is cheap.
nltk.download('brown')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')  # required by stopwords.words('english') below
nltk.download('punkt')      # required by word_tokenize
# NOTE(review): newer NLTK releases may ask for 'punkt_tab' /
# 'averaged_perceptron_tagger_eng' instead - confirm against the installed version.

# en.json presumably holds a JSON collection of English stopwords - verify.
# The encoding is pinned so the result does not depend on the platform default.
with open("en.json", encoding="utf-8") as json_data:
    en_json = json.load(json_data)

stopwords_json_en = set(en_json)
stopwords_nltk_en = set(stopwords.words('english'))
stopwords_punct = set(punctuation)  # single punctuation characters only
# Combine the stopwords.
stoplist_combined = set.union(stopwords_json_en, stopwords_nltk_en, stopwords_punct)

porter = PorterStemmer()
wnl = WordNetLemmatizer()


[nltk_data] Downloading package brown to /Users/yue/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/yue/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/yue/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Load the raw dataset; the first CSV column becomes the row index.
data = pd.read_csv(
    'dataset.csv',
    index_col=0,
)
def rebuild(data):
    """Collapse the raw table into one ``name``/``content`` record per row.

    ``content`` is the row's ``description`` when it is present. When the
    description is missing, it is assembled from the category labels
    (``label_1``..``label_3``, used only if ``label_1`` is present) followed
    by the name, separated by single spaces. Missing labels or a missing name
    are left out of the assembled text instead of raising ``TypeError``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide ``name`` and ``description`` columns. The label columns
        are accessed only for rows whose description is missing.

    Returns
    -------
    pandas.DataFrame
        Columns ``name`` and ``content`` on a fresh 0..n-1 index, in the same
        row order as ``data``. An empty input yields an empty frame that
        still carries both columns.
    """
    label_cols = ('label_1', 'label_2', 'label_3')
    records = []
    # Walk the rows positionally: the frame's index labels need not be 0..n-1
    # (the notebook loads it with index_col=0), so label-based .at[i, ...]
    # lookups over range(n) are not safe.
    for row in data.to_dict('records'):
        name = row['name']
        desc = row['description']
        # pd.isna covers float('nan'), numpy NaN scalars and None alike.
        if pd.isna(desc):
            parts = []
            if not pd.isna(row['label_1']):
                parts = [row[col] for col in label_cols if not pd.isna(row[col])]
            if not pd.isna(name):
                parts.append(name)
            desc = ' '.join(str(part) for part in parts)
        records.append({'name': name, 'content': desc})
    return pd.DataFrame(records, columns=['name', 'content'])

def penn2morphy(penntag):
    """Convert a Penn Treebank POS tag to a WordNet POS tag.

    Only the first two characters of the tag are inspected, so inflected
    variants such as 'NNS' or 'VBD' map like their base tag.

    Parameters
    ----------
    penntag : str
        Penn Treebank tag, e.g. 'NN', 'JJR', 'VBZ'.

    Returns
    -------
    str
        'n', 'a', 'v' or 'r'. Any unrecognised tag, and any non-string
        input, falls back to 'n'.
    """
    morphy_tag = {'NN': 'n', 'JJ': 'a',
                  'VB': 'v', 'RB': 'r'}
    # Explicit guard instead of a bare ``except``: the noun fallback for odd
    # input is kept, but unrelated exceptions are no longer swallowed.
    if not isinstance(penntag, str):
        return 'n'
    return morphy_tag.get(penntag[:2], 'n')

def lemmatize_sent(text):
    """Tokenize and POS-tag ``text``, then return each token's lowercased lemma.

    The input is a string; the result is a list of strings, one per token.
    """
    lemmas = []
    # Tagging runs on the original casing; tokens are lowercased only when lemmatized.
    for token, penn_tag in pos_tag(word_tokenize(text)):
        wordnet_pos = penn2morphy(penn_tag)
        lemmas.append(wnl.lemmatize(token.lower(), pos=wordnet_pos))
    return lemmas

def to_count_vec(string):
    """Turn a document into the list of lemmas kept for vectorization.

    Input: str, i.e. a document/sentence.
    Output: list(str), i.e. its lemmas minus stopwords and all-digit tokens.
    """
    kept = []
    for lemma in lemmatize_sent(string):
        # Skip anything on the combined stoplist as well as purely numeric tokens.
        if lemma in stoplist_combined or lemma.isdigit():
            continue
        kept.append(lemma)
    return kept

def my_cluster(dataframe, num = 100, random_state = 0):
    """Cluster the rows of ``dataframe`` by the TF-IDF of their ``content``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must provide ``name`` and ``content`` columns.
    num : int, default 100
        Number of k-means clusters.
    random_state : int or None, default 0
        Seed passed to ``KMeans`` so repeated runs yield the same cluster
        assignment. Pass ``None`` for the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        Columns ``name``, ``content`` and ``cluster``, sorted by ``cluster``
        in descending order.
    """
    # Cast to unicode strings so every entry is a str for the analyzer.
    documents = dataframe['content'].values.astype('U')

    # to_count_vec is passed as the analyzer, so it supplies the tokens
    # (lemmatized, stopwords removed) that CountVectorizer counts.
    vectorizer = CountVectorizer(analyzer=to_count_vec)
    counts = vectorizer.fit_transform(documents)
    tfidf = TfidfTransformer(smooth_idf=False).fit_transform(counts)

    km = KMeans(n_clusters=num, random_state=random_state)
    km.fit(tfidf)

    frame = pd.DataFrame(
        {'name': dataframe['name'], 'content': documents, 'cluster': km.labels_.tolist()},
        columns=['name', 'content', 'cluster'],
    )
    return frame.sort_values(by='cluster', ascending=False)

In [4]:
# Flatten the raw table into name/content records, then split them into 300 clusters.
df = rebuild(data)
g_frame = my_cluster(df, num=300)

In [5]:
# Show the clustered frame; my_cluster returns it ordered by descending cluster id.
# NOTE(review): the saved output tops out at cluster 99 although 300 clusters
# were requested in the previous cell - it looks stale; re-run to confirm.
g_frame

Unnamed: 0,name,content,cluster
17493,sea cuisine squid tubes frozen,Frozen Frozen Seafood Frozen Prawns & Squid se...,99
15305,kiwi crush frozen fruit drink kiwi gold concen...,Frozen Frozen Fruit & Drink Frozen Fruit Drink...,99
17450,magnum ice cream chocolate raspberry,Frozen Ice Cream & Sorbet Tubs magnum ice crea...,99
18322,streets magnum mini ice cream double caramel e...,Frozen Ice Cream & Sorbet Single Serve & Multi...,99
18317,talleys corn cobs,"Frozen Frozen Vegetables Frozen Peas, Corn & C...",99
...,...,...,...
7445,clairol herbal essence conditioner normal,Health & Beauty Hair Care Condition & Shine cl...,0
8295,tresemme conditioner hydration boost,Health & Beauty Hair Care Condition & Shine tr...,0
8312,clairol herbal essence conditioner colour,Health & Beauty Hair Care Condition & Shine cl...,0
8723,tresemme conditioner keratin smooth,Health & Beauty Hair Care Condition & Shine tr...,0
