## Notebook tasks for <b>cleaning</b> and <b>exploratory data analysis</b> for our thumbnail images:
### - Creates categorical value for Celebrity or Not
### - Concatenates all Labels
### - Concatenates all Text
### - Does LDA and divides into 14 topics
#### 00 = every day lifestyle
#### 01 = face shot/closeup
#### 02 = floral
#### 03 = women's daily fashion "swimwear" "boots" "heel"
#### 04 = speakers before a crowd, formal events
#### 05 = interior design, lifestyle
#### 06 = outdoors, scenic views
#### 07 = cosmetics and glam "lipstick", "dress", "mascara", "music", "hat", "instrument"
#### 08 = red carpet premier, "drink", "beverage"
#### 09 = rugged lifestyle "road", "sign", "dirt", "plaid", "gravel"
#### 10 = cuisine, food, fine dining
#### 11 = mixed media "poster" "collage" "paper" "flier"
#### 12 = life luxury milestones/feminine lense "hair" "afro" "newborn" "bride" "underwear" "lingerie" "linen" "kid" "bling" "ring" "yacht"
#### 13 = artistic form "dance" "pose" "tango" "paint"
#### 14 = sports and fitness
#### left out dummie variable: there are images that have no labels.
### - Trains model so that other pictures can be put into these topics
### - Celebrity column T/F
### - Text column T/F

Brought to you by Natalie Olivo
<a href = https://www.linkedin.com/in/natalie-olivo-82548951/>LinkedIn</a>
<a href = https://nmolivo.github.io/NMOstatic/>Website</a>
<a href = https://medium.com/@NatalieOlivo>Blog</a>
<a href = https://github.com/nmolivo>GitHub</a>

In [1]:
import pandas as pd

In [5]:
media_imgs_long = pd.read_csv("../assets/media_url_link_long.csv")

In [9]:
media_imgs_long.drop("Unnamed: 0", axis =1, inplace = True)

In [13]:
# for-loop to drop all "Human", "People", "Person" Label rows where the image 
# contains a celebrity.
for img in media_imgs_long["img"]:
    if len(media_imgs_long.loc[(media_imgs_long["img"] == img) & (media_imgs_long["type"]=="Celebrity")])>0:
        media_imgs_long = media_imgs_long.loc[~((media_imgs_long['img'] == img) 
                          & (media_imgs_long['label'].isin(['Human', 'People', 'Person'])))]

In [15]:
# for-loop to drop all "Label" rows below 90% confidence if there is a celebrity
for img in media_imgs_long["img"]:
    if len(media_imgs_long.loc[(media_imgs_long["img"] == img) & (media_imgs_long["type"]=="Celebrity")])>0:
        media_imgs_long = media_imgs_long.loc[~((media_imgs_long['img'] == img) 
                          & (media_imgs_long['type'].isin(['Label'])) & (media_imgs_long['confidence']<90))]

In [16]:
# for loop to drop all "Label", "Sticker", "Text" label rows where image contains text.
for img in media_imgs_long["img"]:
    if len(media_imgs_long.loc[(media_imgs_long["img"] == img) & (media_imgs_long["type"]=="Text")])>0:
        media_imgs_long = media_imgs_long.loc[~((media_imgs_long['img'] == img) 
                          & (media_imgs_long['label'].isin(['Label', 'Sticker', 'Text'])))]

In [17]:
import numpy as np

In [19]:
media_new = []
for img in media_imgs_long['img'].unique():
    img_dict = {'img': img}
    if len(media_imgs_long[(media_imgs_long['img']==img) & (media_imgs_long['type']=='Label')])>0:
        img_dict['label'] = ' '.join(media_imgs_long.loc[(media_imgs_long['img']==img) & (media_imgs_long['type']=='Label'), 'label'].tolist())
    else:
        img_dict['label'] = None
    if len(media_imgs_long[(media_imgs_long['img']==img) & (media_imgs_long['type']=='Text')])>0:
        text = [str(detected_text) 
                for detected_text in media_imgs_long.loc[(media_imgs_long['img']==img) & (media_imgs_long['type']=='Text'), 'label'].tolist()]
        img_dict['text'] = ' '.join(text)
    else:
        img_dict['text'] = None
    img_dict['celebrity'] = len(media_imgs_long[(media_imgs_long['img']==img) & (media_imgs_long['type']=='Celebrity')])>0
    media_new.append(img_dict)
media_new_df = pd.DataFrame(media_new)

In [21]:
media_new_df

Unnamed: 0,celebrity,img,label,text
0,False,img_00000,Human People Person,
1,True,img_00001,,I 1L A I 1L A
2,True,img_00005,,AIR te AIR te
3,False,img_00008,Human People Person Lip Mouth,
4,False,img_00010,Cosmetics Mascara,tarte fed to tarte fed to
5,False,img_00011,Food Leisure Activities Meal Picnic Conifer Fl...,
6,False,img_00012,Aircraft Airplane Jet Transportation,heD jetB e York 586 jetB heD e York 586
7,False,img_00014,Promontory Cliff Outdoors,
8,False,img_00016,Sailboat Transportation Vessel Watercraft Boat...,
9,False,img_00017,Afro Hairstyle Hair,c c


In [22]:
media_new_df["text"] = [False if x == None else True for x in media_new_df["text"]]

In [23]:
media_new_df.to_csv("media_text_data.csv")

In [24]:
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim
import pyLDAvis.gensim

Using Theano backend.


Couldn't import dot_parser, loading of dot files will not be possible.


In [25]:
tokenizer = RegexpTokenizer(r'\w+')

In [26]:
# create English stop words list
en_stop = get_stop_words('en')

In [27]:
# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()  

In [28]:
doc_set = media_new_df.loc[:,["img", "label"]]

In [29]:
# compile sample documents into a list
doc_set.dropna(inplace=True)

In [30]:
texts = []

# loop through document list
for i in doc_set.label:
    
    # clean and tokenize document string
    raw = i.lower()
    tokens = tokenizer.tokenize(raw)

    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in en_stop]
    
    # stem tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    
    # add tokens to list
    texts.append(stemmed_tokens)

In [31]:
# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(texts)

In [32]:
corpus = [dictionary.doc2bow(text) for text in texts]

In [35]:
ldamodel = LdaModel.load(‘labels_lda_14’, mmap='r')

SyntaxError: invalid character in identifier (<ipython-input-35-860abcc35ef4>, line 1)

In [None]:
doc_label_topic_new = []
for for thumb_labels in thumbnail:
    topics = topics = sorted(ldamodel[dictionary.doc2bow(thumb_labels)], key=lambda x: -x[1])
    doc_label_topic_new.append({'img': doc_set['img'][i], 'label_topic': topics[0][0], 'label_topic_prob': topics[0][1]})
doc_label_topic_new_df = pd.DataFrame(doc_label_topic)

In [None]:
doc_set.reset_index(inplace=True)
doc_label_topic = []
for i, text in enumerate(corpus):
    topics = sorted(ldamodel[text], key=lambda x: -x[1])
    doc_label_topic.append({'img': doc_set['img'][i], 'label_topic': topics[0][0], 'label_topic_prob': topics[0][1]})
doc_label_topic_df = pd.DataFrame(doc_label_topic)

In [None]:
media_new_df = thumb_new_df.merge(doc_label_topic_df, on='img', how='left')
media_new_df.head()