In [18]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [23]:
import pandas as pd
import gzip
import json
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
stopwords = stopwords.words('english')
import re
import spacy
from string import punctuation
import numpy as np
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim.downloader

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [45]:
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
punctuation = punctuation + '—–«»·№˚¼°-;&'

In [46]:
def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield json.loads(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

In [47]:
parse('Luxury_Beauty.json.gz')

<generator object parse at 0x7f0bfc1c1190>

In [48]:
reviews = getDF('/content/drive/MyDrive/Luxury_Beauty.json.gz')

In [49]:
good_reviews = reviews[['reviewText', 'summary', 'asin']]

In [50]:
parse('meta_Luxury_Beauty.json.gz')

<generator object parse at 0x7f0ba0aff0b0>

In [51]:
meta = getDF('/content/drive/MyDrive/meta_Luxury_Beauty.json.gz')

In [52]:
good_meta = meta[['description', 'title', 'asin']]

In [54]:
all_data = pd.DataFrame()
all_data = pd.merge(good_reviews, good_meta, on='asin')
all_data['reviewText'] = all_data['reviewText'].fillna(np.nan)
all_data['summary'] = all_data['summary'].fillna(np.nan)
all_data['description'] = all_data['description'].fillna(np.nan)
all_data['title'] = all_data['title'].fillna(np.nan)
all_data['reviewText'] = all_data['reviewText'].dropna()
all_data['summary'] = all_data['summary'].dropna()
all_data['description'] = all_data['description'].dropna()
all_data['title'] = all_data['title'].dropna()
all_data = all_data[~all_data['description'].isna()]
all_data = all_data[~all_data['summary'].isna()]
all_data = all_data[~all_data['reviewText'].isna()]
all_data = all_data[~all_data['title'].isna()]

In [55]:
desc = []
for description in all_data['description']:
    descriptions = ' '.join(description)
    desc.append(descriptions)
all_data['description'] = desc

In [56]:
def preprocessing(data, *columns):
  for column in columns:
    print(column)
    all_obj = []
    for obj in tqdm(data[column]):
      lemmatized_obj = ' '.join([token.lemma_ for token in nlp(obj)])
      lemmatized_obj = lemmatized_obj.lower()
      for i in punctuation:
        lemmatized_obj = lemmatized_obj.replace(i, ' ')
        lemmatized_obj = lemmatized_obj.replace('  ', ' ')
      all_obj.append(lemmatized_obj)
    data[column] = all_obj
  return data

In [57]:
data_preprocessed = preprocessing(all_data.groupby('asin').first().reset_index(), 'description', 'summary', 'reviewText', 'title')

description


100%|██████████| 12111/12111 [02:44<00:00, 73.57it/s]


summary


100%|██████████| 12111/12111 [00:37<00:00, 321.59it/s]


reviewText


100%|██████████| 12111/12111 [01:18<00:00, 154.08it/s]


title


100%|██████████| 12111/12111 [00:41<00:00, 288.43it/s]


In [58]:
data_preprocessed

Unnamed: 0,asin,reviewText,summary,description,title
0,B00004U9V2,i buy two of these 8 5 fl oz hand cream and ne...,dispenser do not work,after a long day of handle thorny situation ou...,crabtree amp evelyn gardener s ultra moisturis...
1,B0000531EN,i have only use it once so far so not sure how...,over price,if you have not experience the pleasure of bat...,ahava bath salts
2,B0000532JH,they add ingredient so it be not 100 pure mud ...,product have get bad,rich black mineral mud harvest from the bank o...,ahava dead sea mineral mud 8 5 oz pack of 4
3,B00005A77F,this have get to be the very good soap for gar...,good soap go,this liquid soap with convenient pump dispense...,crabtree amp evelyn hand soap gardeners 10 1 f...
4,B00005NDTD,love this lotion,four star,remember why you love your favorite blanket th...,soy milk hand crme
...,...,...,...,...,...
12106,B01HIQEOLO,color be perfect go on smooth,five star,cnd shellac be design to be use as a system fe...,cnd shellac leather satchel
12107,B01HIQHQU0,not impressed with this very dull gray blue co...,blah,cnd shellac be design to be use as a system fe...,cnd shellac power polish denim patch
12108,B01HIQIEYC,i receive lot of compliment when i wear this c...,fantastic color,cnd craft culture collection patina buckle di...,cnd shellac power polish patina buckle
12109,B01HJ2UY0W,the perfume be good but the spray head break off,spray head break off within a month,the i be juicy couture girl be once again take...,juicy couture i love juicy couture 1 7 fl oz p...


**Способы для поиска упоминания товаров в отзывах**

1.   Использовать шаблоны, которые задают грамматические и синтаксические правила, осовывающиеся на частях речи;
2.   Поиск по n-граммам через формирование шаблонов, содержащих категории. Но категории часто оказываются пустыми -> работать с ними не стоит. Можно посмотреть на сами категории, то есть что из себя представляют, какрй ассортимент охватывают и тд;
3. Создать словари с ключевыми словами из пересечения description и title. Если ключевое слово description есть в title - включаем его в словарь. Словарь можно дополнить путем использования эмбеддингов: ищем похожие слова и добавляем в словарь. Тут могут возникнуть проблемы с ключевыми словами и частотностью отдельных слов в целом.



**Создаем словарь с ключевыми словами**

In [59]:
 vectorizer = TfidfVectorizer()

Попробуем использовать Tf-Idf, так как в первой домашней работе он лучше всего справился с выделением ключевых слов.

In [60]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

In [61]:
def tfidf_keywords(text):
  if len(text) <= 1:
    return 'NO KEYWORDS'
  else:
    X = vectorizer.fit_transform(text)
    feature_names = np.array(vectorizer.get_feature_names())
    denselist = np.array(X.todense())
    indices = denselist.argsort()[0]
    text_keywords = list(feature_names[indices[::-1][:20]])
    return text_keywords

In [62]:
tfidf_keywords_list = []
for text in tqdm(data_preprocessed['reviewText']):
  tfidf_keywords_list.append(tfidf_keywords(text.split()))

100%|██████████| 12111/12111 [00:18<00:00, 670.61it/s]


In [63]:
df_analysis = pd.DataFrame()
df_analysis['title'] = data_preprocessed['title']
df_analysis['keywords'] = tfidf_keywords_list
df_analysis = df_analysis.loc[df_analysis['keywords'] != 'NO KEYWORDS'].reset_index()
df_analysis = df_analysis.drop(columns='index')
df_analysis

Unnamed: 0,title,keywords
0,crabtree amp evelyn gardener s ultra moisturis...,"[work, do, neither, much, it, hand, get, fl, d..."
1,ahava bath salts,"[will, could, here, have, great, good, give, f..."
2,ahava dead sea mineral mud 8 5 oz pack of 4,"[they, weird, not, add, and, anymore, be, have..."
3,crabtree amp evelyn hand soap gardeners 10 1 f...,"[this, wrong, for, in, house, hope, have, hand..."
4,soy milk hand crme,"[love, this, lotion]"
...,...,...
11823,cnd shellac leather satchel,"[color, smooth, perfect, on, go, be]"
11824,cnd shellac power polish denim patch,"[not, with, impressed, blue, but, cnd, color, ..."
11825,cnd shellac power polish patina buckle,"[your, you, and, angle, at, color, compliment,..."
11826,juicy couture i love juicy couture 1 7 fl oz p...,"[the, spray, perfume, off, head, good, but, br..."


In [64]:
kw_freq = {}
for i in range(len(df_analysis)):
  for kw in df_analysis['keywords'][i]:
    if kw in df_analysis['title'][i].split():
      if kw in kw_freq.keys():
        kw_freq[kw] += 1
      else:
        kw_freq[kw] = 1

In [65]:
sorted_kw = sorted(kw_freq.items(), key=lambda kv: kv[1], reverse=True)
sorted_kw

[('hair', 446),
 ('for', 362),
 ('cream', 231),
 ('and', 226),
 ('eye', 161),
 ('brush', 161),
 ('lip', 132),
 ('shampoo', 127),
 ('color', 122),
 ('conditioner', 115),
 ('hand', 109),
 ('nail', 101),
 ('gel', 101),
 ('skin', 93),
 ('body', 92),
 ('lotion', 84),
 ('face', 83),
 ('in', 72),
 ('iron', 68),
 ('the', 68),
 ('cleanser', 64),
 ('be', 59),
 ('dry', 58),
 ('polish', 54),
 ('dryer', 54),
 ('foundation', 50),
 ('candle', 50),
 ('curl', 49),
 ('mask', 49),
 ('oil', 48),
 ('soap', 47),
 ('with', 47),
 ('mascara', 39),
 ('spray', 39),
 ('of', 37),
 ('concealer', 35),
 ('blush', 34),
 ('gloss', 34),
 ('moisturizer', 32),
 ('lipstick', 31),
 ('balm', 30),
 ('jane', 28),
 ('iredale', 28),
 ('hold', 28),
 ('zoya', 28),
 ('lavender', 28),
 ('occitane', 27),
 ('eyeliner', 27),
 ('elemis', 26),
 ('oz', 26),
 ('lash', 25),
 ('kit', 25),
 ('evelyn', 24),
 ('facial', 24),
 ('butter', 24),
 ('crabtree', 23),
 ('makeup', 23),
 ('powder', 23),
 ('bath', 23),
 ('black', 23),
 ('serum', 23),
 ('b

Кажется, что наиболее подходящие слова:

1. 'cream', 231
2. 'brush', 161
3. 'shampoo', 127
4. 'conditioner', 115
5. 'gel', 101
6. 'lotion', 84
7. 'cleanser', 64
8. 'polish', 54
9. 'dryer', 54
10. 'foundation', 50
11. 'candle', 50
12. 'mask', 49
13. 'oil', 48
14. 'soap', 47
15. 'mascara', 39
16. 'spray', 39
17. 'concealer', 35
18. 'blush', 34
19. 'gloss', 34
20. 'moisturizer', 32
21. 'lipstick', 31
22. 'balm', 30

In [67]:
lux_list = ['cream', 'brush', 'shampoo', 'conditioner', 'gel', 'lotion', 'cleanser', 
            'polish', 'dryer', 'foundation', 'candle', 'mask', 'oil', 'soap', 
            'mascara', 'spray', 'concealer', 'blush', 'gloss', 'moisturizer', 'lipstick', 'balm']

In [68]:
ft = gensim.downloader.load('fasttext-wiki-news-subwords-300')



In [69]:
full_lux = []
for lux in lux_list:
  if lux not in full_lux:
    full_lux.append(lux)
    index = 0
    if lux in ft:
      while index != -1:
        simmilar = ft.most_similar(lux)[index][0]
        if lux == preprocessing(simmilar):
          index += 1
        else:
          full_lux.append(simmilar)
          index = -1

In [70]:
full_lux = set(full_lux)

**Биграммы**

In [71]:
bigrams = []

for i, review in enumerate(tqdm(data_preprocessed['reviewText'])):
  review = review.split()
  for lux in full_lux:
    line = []
    for i, word in enumerate(review):
      if word == lux:
        line.append(i)
    if line:
      for i in line:
        if i != 0:
          bigrams.append(review[(i - 1):(i + 1)])
        if i != len(review) - 1:
          bigrams.append(review[i:(i + 2)])

100%|██████████| 12111/12111 [00:03<00:00, 3689.73it/s]


In [72]:
bigrams

[['hand', 'cream'],
 ['cream', 'and'],
 ['hand', 'cream'],
 ['cream', 'be'],
 ['good', 'soap'],
 ['soap', 'for'],
 ['evelyn', 'soap'],
 ['soap', 'for'],
 ['pump', 'soap'],
 ['soap', 'could'],
 ['wrong', 'soap'],
 ['soap', 'hope'],
 ['this', 'lotion'],
 ['different', 'lotion'],
 ['lotion', 'but'],
 ['try', 'cream'],
 ['cream', 'natural'],
 ['mineral', 'cream'],
 ['cream', 'yoga'],
 ['generic', 'moisturizer'],
 ['moisturizer', 'for'],
 ['regular', 'moisturizer'],
 ['moisturizer', 'it'],
 ['blow', 'dryer'],
 ['dryer', 'speed'],
 ['counter', 'cream'],
 ['cream', 'i'],
 ['nail', 'polish'],
 ['polish', 'once'],
 ['nail', 'polish'],
 ['polish', 'it'],
 ['this', 'polish'],
 ['polish', 'under'],
 ['this', 'polish'],
 ['polish', 'i'],
 ['good', 'polish'],
 ['polish', 'i'],
 ['this', 'polish'],
 ['polish', 'as'],
 ['glitter', 'polish'],
 ['polish', 'this'],
 ['other', 'polish'],
 ['polish', 'but'],
 ['the', 'mascara'],
 ['mascara', 'when'],
 ['night', 'cream'],
 ['cream', 'it'],
 ['from', 'founda

In [73]:
bigram_scores = nltk.collocations.BigramAssocMeasures()
bigrams_finder = nltk.collocations.BigramCollocationFinder.from_documents(bigrams)
bigrams_finder.apply_freq_filter(10)

In [74]:
DICE = bigrams_finder.score_ngrams(bigram_scores.dice)
PMI = bigrams_finder.score_ngrams(bigram_scores.pmi)
CHI_SQ = bigrams_finder.score_ngrams(bigram_scores.chi_sq)

In [75]:
for i in DICE[:20]:
  print(i)

(('lip', 'balm'), 0.2980769230769231)
(('lip', 'gloss'), 0.296551724137931)
(('nail', 'polish'), 0.28950276243093925)
(('hair', 'dryer'), 0.21267893660531698)
(('blow', 'dryer'), 0.19950124688279303)
(('this', 'shampoo'), 0.13285024154589373)
(('shower', 'gel'), 0.12857142857142856)
(('shampoo', 'and'), 0.11399832355406538)
(('the', 'brush'), 0.09992313604919292)
(('and', 'conditioner'), 0.08982683982683982)
(('the', 'shampoo'), 0.08571428571428572)
(('brush', 'be'), 0.08539944903581267)
(('hair', 'spray'), 0.08442211055276382)
(('this', 'lotion'), 0.08288639687957094)
(('this', 'cream'), 0.079816813869807)
(('the', 'lotion'), 0.07666506947771921)
(('a', 'moisturizer'), 0.07333333333333333)
(('my', 'foundation'), 0.0718562874251497)
(('lotion', 'be'), 0.07095610342754059)
(('facial', 'cleanser'), 0.07085346215780998)


In [76]:
for j in PMI[:20]:
  print(j)

(('blow', 'dryer'), 6.3572658975193)
(('lip', 'gloss'), 6.018686025479756)
(('mud', 'mask'), 5.825736508378585)
(('lip', 'balm'), 5.7676444604767525)
(('shower', 'gel'), 5.311053643353663)
(('nail', 'polish'), 5.209566950592166)
(('liquid', 'foundation'), 5.100108058022174)
(('hair', 'dryer'), 5.082102269967857)
(('gentle', 'cleanser'), 5.073587181769481)
(('polish', 'remover'), 5.070961712362658)
(('essential', 'oil'), 4.916076110384489)
(('tinted', 'moisturizer'), 4.8518706541164605)
(('an', 'oil'), 4.723431032442093)
(('coconut', 'oil'), 4.6750680108806915)
(('spray', 'bottle'), 4.459095027939842)
(('pearson', 'brush'), 4.328236848907768)
(('brush', 'head'), 4.190733325157831)
(('bristle', 'brush'), 4.135591770965373)
(('facial', 'cleanser'), 4.073587181769481)
(('badger', 'brush'), 4.065202443073973)


In [77]:
for k in CHI_SQ[:20]:
  print(k)

(('nail', 'polish'), 4729.864038979352)
(('lip', 'balm'), 3300.130300048315)
(('blow', 'dryer'), 3242.769145090593)
(('lip', 'gloss'), 2728.6367056919976)
(('shower', 'gel'), 1737.7617434083265)
(('hair', 'dryer'), 1686.1477007373014)
(('mud', 'mask'), 891.049640592884)
(('eye', 'cream'), 755.1504891417104)
(('hand', 'cream'), 571.4149568828194)
(('polish', 'remover'), 552.2534434075402)
(('liquid', 'foundation'), 526.5064172610499)
(('shaving', 'cream'), 460.7244495848057)
(('night', 'cream'), 459.11346896139077)
(('hair', 'spray'), 410.84426968196817)
(('gentle', 'cleanser'), 355.4041610487329)
(('an', 'oil'), 354.29325188072164)
(('spray', 'bottle'), 351.0389078383762)
(('essential', 'oil'), 350.4384088964927)
(('body', 'lotion'), 338.87659856843874)
(('facial', 'cleanser'), 334.58045441241484)


**Группировка коллокаций - биграммы**

In [78]:
collocations_bigrams = {}
for lux in full_lux:
  collocations_bigrams[lux] = []
for bigram in DICE:
  words = set(list(bigram[0])) & full_lux
  for word in words:
    collocations_bigrams[word].append(' '.join(bigram[0]))
for lux in full_lux:
  collocations_bigrams[lux] = list(set(collocations_bigrams[lux]))
for word in ['cream', 'brush', 'shampoo', 'conditioner', 'gel', 'lotion', 'cleanser']:
  print(word)
  for i in collocations_bigrams[word]:
    print('\t'+i)

cream
	shaving cream
	face cream
	and cream
	cream have
	cream this
	cream and
	cc cream
	day cream
	cream for
	cream i
	a cream
	cream in
	the cream
	night cream
	cream do
	cream which
	body cream
	cream blush
	hand cream
	lotion cream
	cream the
	cream but
	cream be
	cream to
	cream it
	cream with
	neck cream
	eye cream
	cream that
	cream on
	bb cream
	this cream
	cream cleanser
	shave cream
brush
	brush head
	brush i
	good brush
	bristle brush
	to brush
	brush be
	foundation brush
	brush the
	these brush
	a brush
	my brush
	brush my
	pearson brush
	brush have
	brush it
	brush that
	badger brush
	and brush
	brush will
	the brush
	brush on
	brush do
	brush in
	blush brush
	brush to
	brush but
	brush from
	brush and
	brush for
	great brush
	this brush
shampoo
	the shampoo
	shampoo be
	shampoo it
	a shampoo
	shampoo in
	shampoo conditioner
	great shampoo
	shampoo leave
	shampoo for
	good shampoo
	shampoo to
	of shampoo
	other shampoo
	to shampoo
	shampoo i
	shampoo and
	shampoo but
	thi

Кажется, что особых различий между DICE, PMI и CHI_SQ нет. На мой взгляд DICE работает немного лучше, если посмотреть, например, на cleanser. Особенно для cream он выделяет неплохие сочетания: назначения, разные прилагательные отношения к продукты, прилагательные, относящиеся к описанию самого продукта и тд.

**Триграммы**

In [79]:
trigrams = []

for i, review in enumerate(tqdm(data_preprocessed['reviewText'])):
  review = review.split()
  for lux in full_lux:
    line = []
    for i, word in enumerate(review):
      if word == lux:
        line.append(i)
    if line:
      for i in line:
        if i != 0:
          trigrams.append(review[(i - 1):(i + 2)])

100%|██████████| 12111/12111 [00:02<00:00, 4140.91it/s]


In [80]:
trigrams

[['hand', 'cream', 'and'],
 ['hand', 'cream', 'be'],
 ['good', 'soap', 'for'],
 ['evelyn', 'soap', 'for'],
 ['pump', 'soap', 'could'],
 ['wrong', 'soap', 'hope'],
 ['this', 'lotion'],
 ['different', 'lotion', 'but'],
 ['try', 'cream', 'natural'],
 ['mineral', 'cream', 'yoga'],
 ['generic', 'moisturizer', 'for'],
 ['regular', 'moisturizer', 'it'],
 ['blow', 'dryer', 'speed'],
 ['counter', 'cream', 'i'],
 ['nail', 'polish', 'once'],
 ['nail', 'polish', 'it'],
 ['this', 'polish', 'under'],
 ['this', 'polish', 'i'],
 ['good', 'polish', 'i'],
 ['this', 'polish', 'as'],
 ['glitter', 'polish', 'this'],
 ['other', 'polish', 'but'],
 ['the', 'mascara', 'when'],
 ['night', 'cream', 'it'],
 ['from', 'foundation', 'up'],
 ['silkcoat', 'balm', 'i'],
 ['silkcoat', 'balm', 'only'],
 ['silkcoat', 'balm', 'it'],
 ['good', 'lotion', 'ever'],
 ['of', 'foundation', 'and'],
 ['my', 'cleanser', 'this'],
 ['this', 'soap', 'be'],
 ['too', 'soap', 'disappear'],
 ['novelty', 'soap'],
 ['algae', 'cleanser', 'fir

In [81]:
trigrams_scores = nltk.collocations.TrigramAssocMeasures()
trigrams_finder = nltk.collocations.TrigramCollocationFinder.from_documents(trigrams)
trigrams_finder.apply_freq_filter(5)

In [82]:
JAC = trigrams_finder.score_ngrams(trigrams_scores.jaccard)
PMI = trigrams_finder.score_ngrams(trigrams_scores.pmi)
CHI_SQ = trigrams_finder.score_ngrams(trigrams_scores.chi_sq)

In [83]:
for l in JAC[:20]:
  print(l)

(('nail', 'polish', 'remover'), 0.02689486552567237)
(('lip', 'gloss', 'do'), 0.01718213058419244)
(('the', 'brush', 'be'), 0.015116811726981219)
(('the', 'shampoo', 'and'), 0.014446227929373997)
(('nail', 'polish', 'i'), 0.012476007677543186)
(('this', 'shampoo', 'be'), 0.012304779933743492)
(('nail', 'polish', 'but'), 0.011904761904761904)
(('blow', 'dryer', 'i'), 0.010856453558504222)
(('the', 'lotion', 'be'), 0.009571788413098237)
(('lip', 'balm', 'i'), 0.009302325581395349)
(('nail', 'polish', 'and'), 0.008955223880597015)
(('this', 'shampoo', 'for'), 0.008094981111710739)
(('a', 'spray', 'bottle'), 0.007905138339920948)
(('hair', 'spray', 'or'), 0.007886435331230283)
(('this', 'brush', 'be'), 0.0078125)
(('lip', 'balm', 'and'), 0.007805724197745013)
(('nail', 'polish', 'be'), 0.007759456838021339)
(('this', 'lotion', 'be'), 0.007731958762886598)
(('a', 'moisturizer', 'that'), 0.0075107296137339056)
(('good', 'lotion', 'ever'), 0.00744047619047619)


In [84]:
for m in PMI[:20]:
  print(m)

(('nail', 'polish', 'remover'), 12.33175337282545)
(('lip', 'gloss', 'do'), 10.871682313229591)
(('blow', 'dryer', 'i'), 9.885350880430416)
(('good', 'lotion', 'ever'), 9.72046620838524)
(('a', 'spray', 'bottle'), 9.642293347668023)
(('hair', 'spray', 'or'), 8.681650147722422)
(('lip', 'gloss', 'be'), 8.567084096880144)
(('lip', 'balm', 'i'), 8.473147215829744)
(('neck', 'cream', 'i'), 8.410180684660943)
(('nail', 'polish', 'but'), 8.386894927017913)
(('lip', 'balm', 'and'), 8.11688342597607)
(('the', 'spray', 'bottle'), 7.89740809020558)
(('lip', 'gloss', 'the'), 7.839739420726836)
(('lip', 'balm', 'be'), 7.808092196383939)
(('shower', 'gel', 'be'), 7.76622036765621)
(('hair', 'dryer', 'i'), 7.646713128904082)
(('the', 'brush', 'itself'), 7.517904041050514)
(('nail', 'polish', 'i'), 7.510985274742552)
(('hair', 'dryer', 'and'), 7.342916758944543)
(('shower', 'gel', 'and'), 7.227014690693387)


In [85]:
for n in CHI_SQ[:20]:
  print(n)

(('nail', 'polish', 'remover'), 62637.1268762811)
(('lip', 'gloss', 'do'), 12515.28569333872)
(('blow', 'dryer', 'i'), 11536.98772387564)
(('nail', 'polish', 'but'), 8622.472852264187)
(('nail', 'polish', 'i'), 8174.849276576276)
(('nail', 'polish', 'and'), 7378.429605284583)
(('nail', 'polish', 'be'), 7249.469548080384)
(('nail', 'polish', 'it'), 7224.762907657892)
(('lip', 'balm', 'i'), 6684.117692018052)
(('lip', 'balm', 'and'), 6245.846976421951)
(('lip', 'balm', 'be'), 5439.943329796501)
(('lip', 'gloss', 'be'), 5126.601215773154)
(('a', 'spray', 'bottle'), 5019.063809101074)
(('good', 'lotion', 'ever'), 4966.210830762864)
(('lip', 'gloss', 'the'), 4434.252742331756)
(('hair', 'dryer', 'i'), 3292.2922536057977)
(('hair', 'dryer', 'and'), 3148.0092526044245)
(('shower', 'gel', 'be'), 3133.3581567824754)
(('shower', 'gel', 'and'), 2815.887079795251)
(('hair', 'spray', 'or'), 2537.4466906685857)


**Группировка коллокаций - триграммы**

In [86]:
collocations_trigrams = {}
for lux in full_lux:
  collocations_trigrams[lux] = []
for trigram in PMI:
  words = set(list(trigram[0])) & full_lux
  for word in words:
    collocations_trigrams[word].append(' '.join(trigram[0]))
for lux in full_lux:
  collocations_trigrams[lux] = list(set(collocations_trigrams[lux]))
for word in ['cream', 'brush', 'shampoo', 'conditioner', 'gel', 'lotion', 'cleanser']:
  print(word)
  for i in collocations_trigrams[word]:
    print('\t'+i)

cream
	this cream have
	hand cream i
	hand cream be
	this cream it
	this cream on
	night cream i
	this cream do
	eye cream be
	eye cream i
	this cream be
	eye cream and
	shaving cream and
	the cream and
	a cream that
	night cream and
	the cream be
	this cream for
	the cream have
	the cream i
	this cream i
	shaving cream i
	shave cream be
	shave cream and
	neck cream i
brush
	the brush i
	the brush be
	the brush and
	the brush itself
	this brush be
	the brush that
shampoo
	this shampoo have
	this shampoo i
	the shampoo conditioner
	this shampoo my
	this shampoo be
	dry shampoo i
	this shampoo it
	this shampoo work
	the shampoo and
	this shampoo and
	this shampoo do
	good shampoo i
	the shampoo be
	this shampoo for
conditioner
	the shampoo conditioner
	this conditioner be
	and conditioner i
	this conditioner i
	the conditioner be
gel
	the gel be
	shower gel and
	shower gel be
lotion
	this lotion i
	body lotion i
	this lotion it
	good lotion ever
	this lotion be
	the lotion be
	the lotion

В данном случае JAC справился не очень хорошо, остальные примерно одинаково, но, кажется, что результаты PMI выглядят получше. Опять же можно наблюдать лучшие результаты на cream. Выделяет триграммы с назначением, отношением к продукту, но довольно много предлогов и прочих таких вещей. Еще выделяет вид продукта (например ночной крем).