### <font color='green'>Importing data from GCP</font> 

In [1]:
from google.colab import auth
# Run Colab's user authentication flow before issuing the gcloud/gsutil
# commands below (presumably what grants them access to the bucket).
auth.authenticate_user()

# https://cloud.google.com/resource-manager/docs/creating-managing-projects
# Project id interpolated into the shell command that sets the gcloud
# `core/project` property.
project_id = 'dataimpact-rd'
!gcloud config set project {project_id}


Updated property [core/project].


In [None]:
# Download the Amazon reviews CSV from the Google Cloud Storage bucket into /tmp,
# where the loading loop expects to find '/tmp/amazon_ml_opinions_topics.csv'.
!gsutil cp gs://di_data_sas/EN/US/Amazon/Data/periode_11/amazon_ml_opinions_topics.csv /tmp/amazon_ml_opinions_topics.csv
  
# Print the header and first rows to make sure the transfer worked.
!head -n 5 /tmp/amazon_ml_opinions_topics.csv

Copying gs://di_data_sas/EN/US/Amazon/Data/periode_11/amazon_ml_opinions_topics.csv...
\ [1 files][  1.7 GiB/  1.7 GiB]   57.5 MiB/s                                   
Operation completed over 1 objects/1.7 GiB.                                      
asin,average,review_body,review_date,review_likes,review_rating,review_title,five_star,four_star,one_star,pp_date,three_star,two_star,refpe,text_clean,title_clean,ml_score,text,ml_topic,opinion
B071F4PVKJ,0.0,the seam on the left sleeve broke in 2 days.  it was likely just an anomaly because the knitting is very high quality otherwise.,2019-01-04,0,2.0,"overall nice, but mine had a bad seam",,,,2019-11-31,,,1840002706176,seam left sleeve broke days likely anomaly knitting high quality otherwise,overall nice mine bad seam,-1.0,seam left sleeve broke days likely anomaly knitting high quality otherwise overall nice mine bad seam,['delivery'],['']
B071F4PVKJ,0.0,loved everything about this sweater..it looks adorable on my dog and it was everyth

In [None]:
# Copy the Walmart CSV to /tmp; the local name gains a '_topics' suffix so it
# matches the '<retailer>_ml_opinions_topics.csv' pattern used when loading.
!gsutil cp gs://di_data_sas/EN/US/Walmart/Data/2020_periode_1/walmart_ml_opinions.csv /tmp/walmart_ml_opinions_topics.csv


Copying gs://di_data_sas/EN/US/Walmart/Data/2020_periode_1/walmart_ml_opinions.csv...
| [1 files][415.6 MiB/415.6 MiB]                                                
Operation completed over 1 objects/415.6 MiB.                                    


In [None]:
# Copy the Target CSV to /tmp; the local name gains a '_topics' suffix so it
# matches the '<retailer>_ml_opinions_topics.csv' pattern used when loading.
!gsutil cp  gs://di_data_sas/EN/US/Target/Data/2020_periode_1/target_ml_opinions.csv /tmp/target_ml_opinions_topics.csv

Copying gs://di_data_sas/EN/US/Target/Data/2020_periode_1/target_ml_opinions.csv...
- [1 files][278.7 MiB/278.7 MiB]                                                
Operation completed over 1 objects/278.7 MiB.                                    


In [None]:
# Copy the Asda CSV to /tmp; the local name gains a '_topics' suffix so it
# matches the '<retailer>_ml_opinions_topics.csv' pattern used when loading.
!gsutil cp gs://di_data_sas/EN/UK/Asda/Data/2020_periode_1/asda_ml_opinions.csv /tmp/asda_ml_opinions_topics.csv

Copying gs://di_data_sas/EN/UK/Asda/Data/2020_periode_1/asda_ml_opinions.csv...
/ [1 files][481.1 MiB/481.1 MiB]                                                
Operation completed over 1 objects/481.1 MiB.                                    


In [None]:
# Copy the Morrisons CSV to /tmp; the local name gains a '_topics' suffix so it
# matches the '<retailer>_ml_opinions_topics.csv' pattern used when loading.
!gsutil cp gs://di_data_sas/EN/UK/Morrisons/Data/2020_periode_1/morrisons_ml_opinions.csv /tmp/morrisons_ml_opinions_topics.csv

Copying gs://di_data_sas/EN/UK/Morrisons/Data/2020_periode_1/morrisons_ml_opinions.csv...
- [1 files][ 17.8 MiB/ 17.8 MiB]                                                
Operation completed over 1 objects/17.8 MiB.                                     


In [None]:
# Copy the Ocado CSV to /tmp; the local name gains a '_topics' suffix so it
# matches the '<retailer>_ml_opinions_topics.csv' pattern used when loading.
!gsutil cp gs://di_data_sas/EN/UK/Ocado/Data/2020_periode_1/ocado_ml_opinions.csv  /tmp/ocado_ml_opinions_topics.csv

Copying gs://di_data_sas/EN/UK/Ocado/Data/2020_periode_1/ocado_ml_opinions.csv...
\ [1 files][214.1 MiB/214.1 MiB]                                                
Operation completed over 1 objects/214.1 MiB.                                    


### <font color='green'>Setting the data types to optimize memory usage</font> 

In [2]:
# Column -> dtype mapping handed to pandas.read_csv through its `dtype` argument.
# Numeric columns are stored as float16 and `ml_topic` as a categorical to keep
# the frames small; every other column stays a generic object column.
# NOTE(review): float16 carries roughly three significant digits — confirm that
# is sufficient for the star-count columns.
edited_types = dict(
    asin='object',
    average='float16',
    review_body='object',
    review_date='object',
    review_likes='object',
    review_rating='float16',
    review_title='object',
    five_star='float16',
    four_star='float16',
    one_star='float16',
    pp_date='object',
    three_star='float16',
    two_star='float16',
    refpe='object',
    text_clean='object',
    title_clean='object',
    ml_score='float16',
    text='object',
    ml_topic='category',
    opinion='object',
)


### <font color='green'>Loading data into pandas DataFrames</font> 

In [3]:
import pandas as pnd
import numpy as np
import matplotlib.pyplot as plt


In [None]:
# Retailers whose CSV was copied to /tmp as '<lowercase name>_ml_opinions_topics.csv'.
list_retailers = ['Amazon' , 'Asda', 'Morrisons', 'Ocado', 'Target', 'Walmart']

# One DataFrame per retailer, keyed by retailer name; only the first 20000 rows
# of each file are read, using the dtypes declared in `edited_types`.
retailers = {}
for retailer in list_retailers:
    retailers[retailer] = pnd.read_csv(
        ''.join(['/tmp/', retailer.lower(), '_ml_opinions_topics.csv']),
        dtype=edited_types,
        nrows=20000,
    )

# Stack every retailer's raw review text into a single Series with a fresh 0..n-1 index.
to_concat = [frame['review_body'] for frame in retailers.values()]
data = pnd.concat(to_concat, ignore_index=True)

In [None]:
import pickle

# SECURITY: unpickling can execute arbitrary code — only load 'topics.p' and
# 'targets.p' if they come from a trusted source.
# `with` closes each file handle as soon as it has been read (the handles were
# previously left open); encoding='latin1' is kept unchanged.
with open('topics.p', 'rb') as pickle_file:
    topics = pickle.load(pickle_file, encoding='latin1')
with open('targets.p', 'rb') as pickle_file:
    targets = pickle.load(pickle_file, encoding='latin1')

In [None]:
# Quick inspection of the unpickled objects. Only the value of the last
# expression is rendered as the cell output; the first two lines are evaluated
# but their results are discarded.
topics[list(topics.keys())[0]]
targets
topics['competition']

['allergens', 'competition', 'delivery', 'packaging', 'price', 'taste']

In [4]:
%%HTML
<style type="text/css">
/* Draw a black border and black text on every cell of rendered DataFrames.
   The selectors must target the td/th elements of the table; the previous
   `morrisons` selector is not an HTML element and matched nothing. */
table.dataframe td, table.dataframe th {
    border: 1px  black solid !important;
  color: black !important;
}
</style>

In [None]:
# Show up to 65 rows when a DataFrame is rendered.
pnd.options.display.max_rows = 65
# `morrisons` is needed here, but the cell that creates it sits further down the
# notebook, so a fresh "Restart & Run All" failed with a NameError. Load it here
# with the same arguments so this cell runs top-to-bottom on its own.
morrisons = pnd.read_csv('/tmp/morrisons_ml_opinions_topics.csv', dtype=edited_types, nrows=90000)
# Concrete scalar type of the first value in the review_rating column.
float_type = type(morrisons.review_rating[0])
def background_color(val):
    """Return a CSS ``background-color`` declaration chosen by comparing ``val``.

    The comparisons run in this order: equal to ``object`` -> yellow, equal to
    the module-level ``float_type`` -> pink, not equal to ``object`` -> crimson.
    The orange fallback is reached only when ``val`` compares neither equal nor
    unequal to ``object``.
    """
    template = 'background-color: {}'
    if val == object:
        return template.format('yellow')
    if val == float_type:
        return template.format('pink')
    if val != object:
        return template.format('crimson')
    return template.format('orange')
      
# Colour the first rows cell by cell: float values get a green background, every
# other value gets crimson with large white text.
# The previous test `x == float` compared each cell value with the `float` class
# itself, which is never equal, so every cell fell into the crimson branch.
# isinstance is used instead; np.floating also covers NumPy float scalars such
# as the float16 columns requested in `edited_types`.
morrisons.head().style.applymap(
    lambda x: 'background-color: green'
    if isinstance(x, (float, np.floating))
    else 'background-color: crimson; color : white ;font-size:150%'
)




Unnamed: 0,review_body,pp_date,review_date,asin,review_rating,review_title,refpe,text_clean,title_clean,ml_score,text,ml_topic,review,opinion
0,there are amazing!,2020-01-31,2020-01-02,372074011,5.0,"unusual, tasty and always on offer!",1826001062136,amazing,unusual tasty always offer,1.0,amazing unusual tasty always offer,['taste'],"there are amazing! unusual, tasty and always on offer!",[]
1,just fantastic flavour and well worth the money.,2020-01-31,2019-12-17,372074011,5.0,fantastic,1826001062136,fantastic flavour worth money,fantastic,1.0,fantastic flavour worth money fantastic,"['price', 'taste']",just fantastic flavour and well worth the money. fantastic,['just fantastic flavour']
2,the holes are too big!,2020-01-31,2019-12-02,292889011,2.0,the holes are too big!,1826000032459,holes big,holes big,-1.0,holes big holes big,[],the holes are too big! the holes are too big!,[]
3,"so disappointing! bland, bland, bland! where's the rhubarb and gin! rubbish!",2020-01-31,2020-01-06,450481011,1.0,no flavour!,1826001616475,disappointing bland bland bland rhubarb gin rubbish,flavour,-1.0,disappointing bland bland bland rhubarb gin rubbish flavour,['taste'],"so disappointing! bland, bland, bland! where's the rhubarb and gin! rubbish! no flavour!",[]
4,no taste at all do not waste money on this should be removed from sale,2020-01-31,2020-01-04,450481011,1.0,no taste,1826001616475,taste not waste money removed sale,taste,-1.0,taste not waste money removed sale taste,"['price', 'taste']",no taste at all do not waste money on this should be removed from sale no taste,[]


In [None]:
# Load at most the first 90000 rows of the Morrisons CSV, applying the dtypes
# declared in `edited_types`.
morrisons = pnd.read_csv('/tmp/morrisons_ml_opinions_topics.csv', dtype=edited_types, nrows=90000)

In [None]:
# Preview the first rows of the Morrisons frame to check the columns that were loaded.
morrisons.head()

Unnamed: 0,review_body,pp_date,review_date,asin,review_rating,review_title,refpe,text_clean,title_clean,ml_score,text,ml_topic,review,opinion
0,there are amazing!,2020-01-31,2020-01-02,372074011,5.0,"unusual, tasty and always on offer!",1826001062136,amazing,unusual tasty always offer,1.0,amazing unusual tasty always offer,['taste'],"there are amazing! unusual, tasty and always o...",[]
1,just fantastic flavour and well worth the money.,2020-01-31,2019-12-17,372074011,5.0,fantastic,1826001062136,fantastic flavour worth money,fantastic,1.0,fantastic flavour worth money fantastic,"['price', 'taste']",just fantastic flavour and well worth the mone...,['just fantastic flavour']
2,the holes are too big!,2020-01-31,2019-12-02,292889011,2.0,the holes are too big!,1826000032459,holes big,holes big,-1.0,holes big holes big,[],the holes are too big! the holes are too big!,[]
3,"so disappointing! bland, bland, bland! where'...",2020-01-31,2020-01-06,450481011,1.0,no flavour!,1826001616475,disappointing bland bland bland rhubarb gin ru...,flavour,-1.0,disappointing bland bland bland rhubarb gin ru...,['taste'],"so disappointing! bland, bland, bland! where'...",[]
4,no taste at all do not waste money on this sho...,2020-01-31,2020-01-04,450481011,1.0,no taste,1826001616475,taste not waste money removed sale,taste,-1.0,taste not waste money removed sale taste,"['price', 'taste']",no taste at all do not waste money on this sho...,[]


### <font color='green'>Topic modeling</font> 

### <font color='green'>LDA</font> 

In [None]:
from gensim.models import LdaModel
from gensim import corpora
import nltk
from string import punctuation
from nltk.tokenize import TreebankWordTokenizer

In [None]:
# Fetch the NLTK stop-word list (a no-op when it is already downloaded).
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))
to_be_removed = list(en_stop) + list(punctuation)
# Frozen set copy used only for membership tests: looking a token up in the
# list above is linear in its length and was repeated for every token.
removal_lookup = frozenset(to_be_removed)

tok = TreebankWordTokenizer()
# Tokenizing + removing stopwords: one token list per distinct `text_clean`
# value, dropping tokens whose lowercase form is a stop word or a punctuation
# character.
# NOTE(review): dropna() without `subset` discards rows that have a missing
# value in ANY column, not only in `text_clean` — confirm this is intended.
text_data = [
    [token for token in tok.tokenize(document) if token.lower() not in removal_lookup]
    for document in morrisons.dropna().text_clean.drop_duplicates()
]
print(text_data[0])

# Map tokens to integer ids, then encode each document as a bag of words.
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]

# LDA training is stochastic; a fixed random_state makes the three topics
# reproducible across kernel restarts.
ldamodel = LdaModel(corpus, id2word=dictionary, num_topics=3, random_state=42)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
['amazing']


  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

In [None]:
!pip install pyLDAvis


Collecting pyLDAvis
[?25l  Downloading https://files.pythonhosted.org/packages/a5/3a/af82e070a8a96e13217c8f362f9a73e82d61ac8fff3a2561946a97f96266/pyLDAvis-2.1.2.tar.gz (1.6MB)
[K     |████████████████████████████████| 1.6MB 5.7MB/s 
Collecting funcy
  Downloading https://files.pythonhosted.org/packages/66/89/479de0afbbfb98d1c4b887936808764627300208bb771fcd823403645a36/funcy-1.15-py2.py3-none-any.whl
Building wheels for collected packages: pyLDAvis
  Building wheel for pyLDAvis (setup.py) ... [?25l[?25hdone
  Created wheel for pyLDAvis: filename=pyLDAvis-2.1.2-py2.py3-none-any.whl size=97712 sha256=77af88a1577a2805be4ad3d0c99d3e243a5f06565c356d95eb34ed1e0567aa11
  Stored in directory: /root/.cache/pip/wheels/98/71/24/513a99e58bb6b8465bae4d2d5e9dba8f0bef8179e3051ac414
Successfully built pyLDAvis
Installing collected packages: funcy, pyLDAvis
Successfully installed funcy-1.15 pyLDAvis-2.1.2


### <font color='green'>Displaying the topics</font> 

In [None]:
import pyLDAvis.gensim
# Build the visualisation data from the trained model, the bag-of-words corpus
# and the token dictionary, then render it inline as the cell output.
lda_display = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
pyLDAvis.display(lda_display)

### <font color='green'>RAKE</font> 