## Notebook tasks for <b>cleaning</b> and <b>exploratory data analysis</b> for our thumbnail images:
### - Creates categorical value for Celebrity or Not
### - Concatenates all Labels
### - Concatenates all Text
### - Does LDA and divides into 14 topics
#### 00 = every day lifestyle
#### 01 = face shot/closeup
#### 02 = floral
#### 03 = women's daily fashion "swimwear" "boots" "heel"
#### 04 = speakers before a crowd, formal events
#### 05 = interior design, lifestyle
#### 06 = outdoors, scenic views
#### 07 = cosmetics and glam "lipstick", "dress", "mascara", "music", "hat", "instrument"
#### 08 = red carpet premier, "drink", "beverage"
#### 09 = rugged lifestyle "road", "sign", "dirt", "plaid", "gravel"
#### 10 = cuisine, food, fine dining
#### 11 = mixed media "poster" "collage" "paper" "flier"
#### 12 = life luxury milestones/feminine lense "hair" "afro" "newborn" "bride" "underwear" "lingerie" "linen" "kid" "bling" "ring" "yacht"
#### 13 = artistic form "dance" "pose" "tango" "paint"
#### 14 = sports and fitness
#### left out dummie variable: there are images that have no labels.
### - Trains model so that other pictures can be put into these topics
### - Celebrity column T/F
### - Text column T/F

Brought to you by Natalie Olivo
<a href = https://www.linkedin.com/in/natalie-olivo-82548951/>LinkedIn</a>
<a href = https://nmolivo.github.io/NMOstatic/>Website</a>
<a href = https://medium.com/@NatalieOlivo>Blog</a>
<a href = https://github.com/nmolivo>GitHub</a>

In [1]:
import pandas as pd

In [2]:
thumb_imgs_long = pd.read_csv("Project_TrackMaven/assets/thumbnail_link_long.csv")

In [3]:
thumb_imgs_long.head()

Unnamed: 0.1,Unnamed: 0,img,type,label,confidence
0,0,img_00000,Label,Human,98.962936
1,1,img_00000,Label,People,98.962921
2,2,img_00000,Label,Person,98.962936
3,3,img_00000,Label,Clothing,94.796982
4,4,img_00000,Label,Coat,94.796982


In [4]:
df = pd.read_csv("Project_TrackMaven/gitignore/newtweets_10percent.csv")

In [5]:
mapping_dict = pd.read_csv("Project_TrackMaven/assets/mapping_dict_thumbnail.csv")

In [6]:
mapping_dict.columns = [["img", "link_thumbnail"]]

In [7]:
df.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', 'brand', 'engagement',
       'has_spend', 'id', 'impact', 'share_token', 'timestamp', 'type', 'urls',
       'favorite_count', 'hashtags', 'link_description', 'link_thumbnail',
       'link_title', 'link_url', 'links', 'media_url', 'retweet_count', 'text',
       'tweet_id', 'tweet_url',
       'channel_info_{'type': ['link'], 'channel': 'twitter'}',
       'channel_info_{'type': ['photo', 'link'], 'channel': 'twitter'}',
       'channel_info_{'type': ['photo'], 'channel': 'twitter'}',
       'channel_info_{'type': ['text only'], 'channel': 'twitter'}'],
      dtype='object')

In [8]:
df = df[["brand", "engagement", "impact", "timestamp", "favorite_count", "hashtags", "retweet_count", "link_thumbnail"]]

In [9]:
df.head()

Unnamed: 0,brand,engagement,impact,timestamp,favorite_count,hashtags,retweet_count,link_thumbnail
0,137326,316,2.613319,2017-11-16 02:53:24,269,,47,https://assets.vogue.com/photos/5a0ca9458c64c4...
1,137329,82,2.000099,2017-11-16 02:23:40,54,,28,https://media.wmagazine.com/photos/59dd2ae5a46...
2,137322,59,0.26398,2017-11-16 02:00:19,48,,11,https://assets.teenvogue.com/photos/5a0b79da1d...
3,137325,102,0.831855,2017-11-16 00:20:05,76,,26,https://media.vanityfair.com/photos/5a0cb19d70...
4,137329,21,0.540106,2017-11-16 00:00:30,16,,5,https://media.wmagazine.com/photos/5a0a7396e61...


In [10]:
thumb = thumb_imgs_long.drop("Unnamed: 0", 1)

In [11]:
thumb.head()

Unnamed: 0,img,type,label,confidence
0,img_00000,Label,Human,98.962936
1,img_00000,Label,People,98.962921
2,img_00000,Label,Person,98.962936
3,img_00000,Label,Clothing,94.796982
4,img_00000,Label,Coat,94.796982


In [12]:
# for-loop to drop all "Human", "People", "Person" Label rows where the image 
# contains a celebrity.
for img in mapping_dict["img"]:
    if len(thumb.loc[(thumb["img"] == img) & (thumb["type"]=="Celebrity")])>0:
        thumb = thumb.loc[~((thumb['img'] == img) 
                          & (thumb['label'].isin(['Human', 'People', 'Person'])))]

In [13]:
# for-loop to drop all "Label" rows below 90% confidence if there is a celebrity
for img in mapping_dict["img"]:
    if len(thumb.loc[(thumb["img"] == img) & (thumb["type"]=="Celebrity")])>0:
        thumb = thumb.loc[~((thumb['img'] == img) 
                          & (thumb['type'].isin(['Label'])) & (thumb['confidence']<90))]

In [14]:
# for loop to drop all "Label", "Sticker", "Text" label rows where image contains text.
for img in mapping_dict["img"]:
    if len(thumb.loc[(thumb["img"] == img) & (thumb["type"]=="Text")])>0:
        thumb = thumb.loc[~((thumb['img'] == img) 
                          & (thumb['label'].isin(['Label', 'Sticker', 'Text'])))]

In [15]:
import numpy as np

In [17]:
thumb.head(10)

Unnamed: 0,img,type,label,confidence
3,img_00000,Label,Clothing,94.796982
4,img_00000,Label,Coat,94.796982
5,img_00000,Label,Overcoat,94.796982
6,img_00000,Label,Suit,94.796982
9,img_00000,Celebrity,Leonardo DiCaprio,99.999084
13,img_00001,Celebrity,Bernardino Luini,99.999336
17,img_00002,Text,are sure?,89.819656
18,img_00002,Text,you,97.977188
19,img_00002,Text,are,87.376259
20,img_00002,Text,you,97.977188


In [49]:
thumb_new = []
for img in thumb['img'].unique():
    img_dict = {'img': img}
    if len(thumb[(thumb['img']==img) & (thumb['type']=='Label')])>0:
        img_dict['label'] = ' '.join(thumb.loc[(thumb['img']==img) & (thumb['type']=='Label'), 'label'].tolist())
    else:
        img_dict['label'] = None
    if len(thumb[(thumb['img']==img) & (thumb['type']=='Text')])>0:
        text = [str(detected_text) 
                for detected_text in thumb.loc[(thumb['img']==img) & (thumb['type']=='Text'), 'label'].tolist()]
        img_dict['text'] = ' '.join(text)
    else:
        img_dict['text'] = None
    img_dict['celebrity'] = len(thumb[(thumb['img']==img) & (thumb['type']=='Celebrity')])>0
    thumb_new.append(img_dict)
thumb_new_df = pd.DataFrame(thumb_new)

In [50]:
thumb_new_df["text"] = [False if x == None else True for x in thumb_new_df["text"]]

In [51]:
thumb_new_df.to_csv("thumb_text_data.csv")

In [52]:
thumb_new_df.head()

Unnamed: 0,celebrity,img,label,text
0,True,img_00000,Clothing Coat Overcoat Suit,False
1,True,img_00001,,False
2,False,img_00002,,True
3,True,img_00003,,False
4,True,img_00004,,True


In [53]:
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim
import pyLDAvis.gensim

In [54]:
tokenizer = RegexpTokenizer(r'\w+')

In [55]:
# create English stop words list
en_stop = get_stop_words('en')

In [56]:
# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()  

In [57]:
doc_set = thumb_new_df.loc[:,["img", "label"]]

In [58]:
# compile sample documents into a list
doc_set.dropna(inplace=True)

In [59]:
texts = []

# loop through document list
for i in doc_set.label:
    
    # clean and tokenize document string
    raw = i.lower()
    tokens = tokenizer.tokenize(raw)

    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in en_stop]
    
    # stem tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    
    # add tokens to list
    texts.append(stemmed_tokens)

In [60]:
# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(texts)

In [61]:
corpus = [dictionary.doc2bow(text) for text in texts]

In [78]:
# generate LDA model
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=14, id2word = dictionary, passes=20)

In [79]:
print(ldamodel.print_topics(num_topics=14, num_words=5))

[(0, '0.129*"cloth" + 0.115*"coat" + 0.095*"overcoat" + 0.093*"suit" + 0.074*"crowd"'), (1, '0.227*"plant" + 0.132*"flora" + 0.098*"jar" + 0.098*"pot" + 0.098*"potteri"'), (2, '0.178*"red" + 0.175*"carpet" + 0.170*"premier" + 0.041*"drink" + 0.040*"beverag"'), (3, '0.111*"outdoor" + 0.081*"natur" + 0.065*"water" + 0.047*"build" + 0.045*"hous"'), (4, '0.299*"person" + 0.299*"human" + 0.292*"peopl" + 0.029*"face" + 0.023*"portrait"'), (5, '0.095*"hair" + 0.065*"afro" + 0.065*"hairstyl" + 0.056*"logo" + 0.056*"trademark"'), (6, '0.135*"poster" + 0.078*"collag" + 0.069*"paper" + 0.061*"glass" + 0.048*"flyer"'), (7, '0.159*"furnitur" + 0.081*"chair" + 0.074*"build" + 0.062*"couch" + 0.061*"room"'), (8, '0.100*"room" + 0.066*"indoor" + 0.064*"furnitur" + 0.060*"design" + 0.060*"vehicl"'), (9, '0.177*"food" + 0.060*"cup" + 0.048*"meal" + 0.048*"dessert" + 0.047*"produc"'), (10, '0.099*"cloth" + 0.088*"cosmet" + 0.078*"music" + 0.061*"hat" + 0.041*"activ"'), (11, '0.103*"danc" + 0.080*"activ" 

In [80]:
#this line is commented out so we don't re-save over our LDA model on images.
#ldamodel.save('labels_lda_14.model')

In [81]:
doc_set.reset_index(inplace=True)
doc_label_topic = []
for i, text in enumerate(corpus):
    topics = sorted(ldamodel[text], key=lambda x: -x[1])
    doc_label_topic.append({'img': doc_set['img'][i], 'label_topic': topics[0][0], 'label_topic_prob': topics[0][1]})
doc_label_topic_df = pd.DataFrame(doc_label_topic)

In [82]:
ldamodel[corpus[0]]

[(0, 0.81428565415081477),
 (1, 0.014285714285714429),
 (2, 0.014285714285714674),
 (3, 0.014285714285714525),
 (4, 0.014285714303874215),
 (5, 0.014285719741457294),
 (6, 0.014285714285714719),
 (7, 0.014285714285714691),
 (8, 0.014285714285714507),
 (9, 0.014285714285714729),
 (10, 0.014285736453065155),
 (11, 0.014285730740129772),
 (12, 0.014285714285715321),
 (13, 0.014285730324940703)]

In [83]:
doc_label_topic_df.head()

Unnamed: 0,img,label_topic,label_topic_prob
0,img_00000,0,0.814286
1,img_00006,4,0.438776
2,img_00007,0,0.767857
3,img_00009,11,0.517857
4,img_00012,2,0.55195


In [84]:
thumb_new_df = thumb_new_df.merge(doc_label_topic_df, on='img', how='left')
thumb_new_df.head()

Unnamed: 0,celebrity,img,label,text,label_topic_x,label_topic_prob_x,0.0,1.0,2.0,3.0,...,6.0,7.0,8.0,9.0,10.0,11.0,12.0,13.0,label_topic_y,label_topic_prob_y
0,True,img_00000,Clothing Coat Overcoat Suit,False,13.0,0.814286,0,0,0,0,...,0,0,0,0,0,0,0,1,0.0,0.814286
1,True,img_00001,,False,,,0,0,0,0,...,0,0,0,0,0,0,0,0,,
2,False,img_00002,,True,,,0,0,0,0,...,0,0,0,0,0,0,0,0,,
3,True,img_00003,,False,,,0,0,0,0,...,0,0,0,0,0,0,0,0,,
4,True,img_00004,,True,,,0,0,0,0,...,0,0,0,0,0,0,0,0,,


In [85]:
vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]


In [86]:
pyLDAvis.display(vis)

In [71]:
# generate LDA model
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=25, id2word = dictionary, passes=20)

In [72]:
vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
pyLDAvis.display(vis)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]


In [87]:
#we picked 14 topics. 
thumb_new_df.head()

Unnamed: 0,celebrity,img,label,text,label_topic_x,label_topic_prob_x,0.0,1.0,2.0,3.0,...,6.0,7.0,8.0,9.0,10.0,11.0,12.0,13.0,label_topic_y,label_topic_prob_y
0,True,img_00000,Clothing Coat Overcoat Suit,False,13.0,0.814286,0,0,0,0,...,0,0,0,0,0,0,0,1,0.0,0.814286
1,True,img_00001,,False,,,0,0,0,0,...,0,0,0,0,0,0,0,0,,
2,False,img_00002,,True,,,0,0,0,0,...,0,0,0,0,0,0,0,0,,
3,True,img_00003,,False,,,0,0,0,0,...,0,0,0,0,0,0,0,0,,
4,True,img_00004,,True,,,0,0,0,0,...,0,0,0,0,0,0,0,0,,


In [88]:
thumb_new_df.head(10)

Unnamed: 0,celebrity,img,label,text,label_topic_x,label_topic_prob_x,0.0,1.0,2.0,3.0,...,6.0,7.0,8.0,9.0,10.0,11.0,12.0,13.0,label_topic_y,label_topic_prob_y
0,True,img_00000,Clothing Coat Overcoat Suit,False,13.0,0.814286,0,0,0,0,...,0,0,0,0,0,0,0,1,0.0,0.814286
1,True,img_00001,,False,,,0,0,0,0,...,0,0,0,0,0,0,0,0,,
2,False,img_00002,,True,,,0,0,0,0,...,0,0,0,0,0,0,0,0,,
3,True,img_00003,,False,,,0,0,0,0,...,0,0,0,0,0,0,0,0,,
4,True,img_00004,,True,,,0,0,0,0,...,0,0,0,0,0,0,0,0,,
5,True,img_00005,,True,,,0,0,0,0,...,0,0,0,0,0,0,0,0,,
6,False,img_00006,Human People Person Coffee Cup Cup,True,5.0,0.438776,0,0,0,0,...,0,0,0,0,0,0,0,0,4.0,0.438776
7,True,img_00007,Audience Crowd Speech,False,6.0,0.767857,0,0,0,0,...,1,0,0,0,0,0,0,0,0.0,0.767857
8,True,img_00009,Dye Paint Container,False,6.0,0.506547,0,0,0,0,...,1,0,0,0,0,0,0,0,11.0,0.517857
9,True,img_00010,,False,,,0,0,0,0,...,0,0,0,0,0,0,0,0,,


In [89]:
thumb_new_df = thumb_new_df.join(pd.get_dummies(thumb_new_df["label_topic"]))

KeyError: 'label_topic'

In [76]:
image_df = thumb_new_df.drop(["label", "label_topic"], axis =1)

In [90]:
image_df = image_df.drop("label_topic_prob", axis=1)

In [91]:
image_df.to_csv("thumb_text_data.csv")

In [92]:
image_df

Unnamed: 0,celebrity,img,text,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,12.0,13.0
0,True,img_00000,False,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,True,img_00001,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,False,img_00002,True,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,True,img_00003,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,True,img_00004,True,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,True,img_00005,True,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,False,img_00006,True,0,0,0,0,0,1,0,0,0,0,0,0,0,0
7,True,img_00007,False,0,0,0,0,0,0,1,0,0,0,0,0,0,0
8,True,img_00009,False,0,0,0,0,0,0,1,0,0,0,0,0,0,0
9,True,img_00010,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0
