In [1]:
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import pyLDAvis.gensim

Using Theano backend.


Couldn't import dot_parser, loading of dot files will not be possible.


### Media

In [3]:
# Media records: read both text-data CSVs and discard their "Unnamed: 0" column.
med_a = pd.read_csv("LDA/01_media_text_data.csv").drop("Unnamed: 0", axis=1)
med_b = pd.read_csv("LDA/01_media_cleanup_text_data.csv").drop("Unnamed: 0", axis=1)

# Mapping tables are renamed positionally to (img, <url column>).
map_med_a = pd.read_csv("LDA/mapping_dict_mediaurl.csv")
map_med_a.columns = ["img", "media_url"]

map_med_b = pd.read_csv("LDA/mapping_dict_media_cleanup.csv")
map_med_b.columns = ["img", "med_cleanup_url"]

# Attach the URL column to each record set through the shared "img" key.
med_a = pd.merge(med_a, map_med_a, on="img")
med_b = pd.merge(med_b, map_med_b, on="img")

In [4]:
# Collapse med_b's "text" column into a boolean "has text" flag.
# fillna("") keeps a missing value from raising inside a len() call; such
# rows are simply flagged False.
med_b["text"] = med_b["text"].fillna("").astype(str).str.len() > 0

# Positional rename so med_b's URL column is called "media_url", matching med_a.
med_b.columns = ["celebrity", "img", "label", "text", "media_url"]

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# Rows sharing a media_url are collapsed to their first occurrence.
all_media = pd.concat([med_a, med_b]).drop_duplicates(subset="media_url")

# Generic column names shared with the thumbnail frame built later.
all_media.columns = ["celebrity", "img", "label", "text", "url"]

all_media["img_class"] = "media"

### Thumbs

In [5]:
# Thumbnail records: read both text-data CSVs and discard their "Unnamed: 0" column.
thumb_a = pd.read_csv("LDA/01_thumb_text_data.csv")
thumb_b = pd.read_csv("LDA/01_thumb_cleanup_text_data.csv")

thumb_a = thumb_a.drop("Unnamed: 0", axis=1)
thumb_b = thumb_b.drop("Unnamed: 0", axis=1)

# Turn thumb_b's "text" column into a boolean "has text" flag.
# The previous code called isnull() on the whole frame, which yields a
# DataFrame that cannot be assigned to a single column.
# NOTE(review): notnull() is used so that True means "text present", matching
# the media cell (len(x) > 0) and the later step that appends 'Text' when the
# flag is truthy. Confirm this is the intended polarity.
thumb_b["text"] = thumb_b["text"].notnull()

In [6]:
# Mapping tables for thumbnails, renamed positionally to (img, thumb_url).
map_thumb_a = pd.read_csv("LDA/mapping_dict_thumbnail.csv")
map_thumb_b = pd.read_csv("LDA/mapping_dict_thumb_cleanup.csv")

map_thumb_a.columns = ["img", "thumb_url"]
map_thumb_b.columns = ["img", "thumb_url"]

# Attach the URL column to each record set through the shared "img" key.
thumb_a = thumb_a.merge(map_thumb_a, on="img")
thumb_b = thumb_b.merge(map_thumb_b, on="img")

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# Rows sharing a thumb_url are collapsed to their first occurrence.
all_thumb = pd.concat([thumb_a, thumb_b]).drop_duplicates(subset="thumb_url")

# Generic column names shared with the media frame.
all_thumb.columns = ["celebrity", "img", "label", "text", "url"]
all_thumb["img_class"] = "thumb"

### All Images

In [7]:
# Stack media and thumbnail rows into one frame. pd.concat replaces
# DataFrame.append (removed in pandas 2.0); the original row indexes are kept.
all_images = pd.concat([all_media, all_thumb])

In [8]:
# Replace missing labels with an empty string. Assign the result back rather
# than calling fillna(inplace=True) on a column selection: the chained
# in-place form is not guaranteed to update the parent frame (it is a no-op
# under pandas copy-on-write), which would leave NaN to be stringified as 'nan'.
all_images['label'] = all_images['label'].fillna('')

# Append a marker word to the label of every row whose flag column is truthy:
# 'Celebrity' for the "celebrity" flag, then 'Text' for the "text" flag.
for flag_col, marker in (('celebrity', 'Celebrity'), ('text', 'Text')):
    all_images['label'] = all_images.apply(
        lambda row, flag_col=flag_col, marker=marker: (
            ' '.join([str(row['label']), marker]) if row[flag_col] else str(row['label'])
        ),
        axis=1,
    )

In [9]:
# The flag columns have been folded into "label"; remove them from the frame in place.
all_images.drop(labels=["celebrity", "text"], axis="columns", inplace=True)

In [10]:
# Persist the combined image table; the row index is written too (to_csv default).
all_images.to_csv(path_or_buf="LDA/01_lda_data.csv")

### LDA Model

In [11]:
# Text-preprocessing tools used when turning labels into token lists.
p_stemmer = PorterStemmer()                  # Porter stemmer instance
en_stop = get_stop_words('en')               # English stop words
tokenizer = RegexpTokenizer(pattern=r'\w+')  # a token is a run of \w characters

In [12]:
# One document per distinct (img, label) pair; rows with a missing value are discarded.
doc_set = (
    all_images
    .loc[:, ["img", "label"]]
    .dropna()
    .drop_duplicates()
)

In [13]:
# Frozen copy of the stop words for constant-time membership tests
# (en_stop itself is left untouched).
_stop_lookup = frozenset(en_stop)


def _label_to_tokens(label):
    """Turn one label string into a list of stemmed, stop-word-free tokens.

    The label is lower-cased, split with ``tokenizer``, filtered against the
    English stop words, and each remaining token is stemmed with ``p_stemmer``.
    """
    tokens = tokenizer.tokenize(label.lower())
    return [p_stemmer.stem(token) for token in tokens if token not in _stop_lookup]


# One token list per document, in doc_set row order.
texts = [_label_to_tokens(label) for label in doc_set["label"]]

In [14]:
# Preview the token lists of the first five documents.
texts[:5]

[['celebr', 'text'],
 ['cosmet', 'mascara', 'text'],
 ['sailboat', 'transport', 'vessel', 'watercraft', 'boat', 'yacht', 'dinghi'],
 ['human',
  'peopl',
  'person',
  'face',
  'portrait',
  'blond',
  'femal',
  'girl',
  'woman',
  'text'],
 ['celebr', 'text']]

In [15]:
# Build the token <-> id vocabulary from every token list.
dictionary = corpora.Dictionary(texts)
# Convert each document to its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, texts))

In [16]:
# Bag-of-words of the first document, as (token id, count) pairs.
print(corpus[0])

[(0, 1), (1, 1)]


In [17]:
# Persist the vocabulary and the bag-of-words corpus (Matrix Market format)
# so the model can be rebuilt or visualised without re-tokenising.
dictionary.save('LDA/label_corpus.dict')
corpora.MmCorpus.serialize(fname='LDA/label_corpus.mm', corpus=corpus)

In [19]:
# Fit a 15-topic LDA model with 20 passes over the corpus; random_state is
# fixed so repeated runs give the same topics. The fitted model is saved to disk.
model = models.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=15,
    passes=20,
    random_state=128,
)
model.save('LDA/labels_lda_15.model')

### Load saved model, corpus, dictionary and visualize

In [2]:
# Reload the saved artifacts from the same 'LDA/' directory the earlier cells
# wrote them to. The previous paths had no directory prefix, so a fresh
# "Restart & Run All" could not find the files it had just saved.
dictionary = corpora.Dictionary.load('LDA/label_corpus.dict')
corpus = corpora.MmCorpus('LDA/label_corpus.mm')
model = models.ldamodel.LdaModel.load('LDA/labels_lda_15.model')

In [3]:
# Prepare the pyLDAvis data for the fitted model and render it inline.
labels_vis = pyLDAvis.gensim.prepare(
    topic_model=model,
    corpus=corpus,
    dictionary=dictionary,
)
pyLDAvis.display(labels_vis)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]
