In [76]:
import json
import spacy
import regex as re
import math
import numpy as np
import string
import unicodedata
from spacy.tokenizer import Tokenizer
from collections import Counter, defaultdict
from html.parser import HTMLParser
from IPython.display import clear_output

In [13]:
nlp = spacy.load('en_core_web_lg')

# To treat hyphenated word as tokens
infixes = tuple([r"'s\b", r"(?<!\d)\.(?!\d)"]) +  nlp.Defaults.prefixes
infix_re = spacy.util.compile_infix_regex(infixes)

def custom_tokenizer(nlp):
    return Tokenizer(nlp.vocab, infix_finditer=infix_re.finditer)

nlp.tokenizer = custom_tokenizer(nlp)

In [54]:
def update_progress(progress):
    bar_length = 20
    if isinstance(progress, int):
        progress = float(progress)
    if not isinstance(progress, float):
        progress = 0
    if progress < 0:
        progress = 0
    if progress >= 1:
        progress = 1

    block = int(round(bar_length * progress))
    clear_output(wait = True)
    text = "Progress: [{0}] {1:.1f}%".format( "#" * block + "-" * (bar_length - block), progress * 100)
    print(text)

In [2]:
# Path for file dset_dataloader.json
def open_json(path):
    f = open(path) 
    data = json.load(f) 
    f.close()
    return data 

def save_json(file_path, data):
    out_file = open(file_path, "w")
    json.dump(data, out_file)
    out_file.close()

def flatten(S):
    if S == []:
        return S
    if isinstance(S[0], list):
        return flatten(S[0]) + flatten(S[1:])
    return S[:1] + flatten(S[1:])

In [21]:
image_sents_p = '../../json_files/cleaned/ADARI_v2/furniture/ADARI_furniture_sents.json'
vocab_p = '../../json_files/cleaned/ADARI_v2/furniture/ADARI_furniture_vocab.json'

In [22]:
image_sents = open_json(image_sents_p)
vocab = open_json(vocab_p)

### Document frequency

###### Number of documents containing each word 

In [72]:
def document_frequency(image_sents):
    """
    Takes in the image sents dictionary (image: (list of sentences,list of POS)) and returns for all words and for all adjs:
     - document frequency: number of documents containing each word (dictionary word: N) 
     - term frequency: term frequency of each word in each document divided by total words in each document
    """
    # document frequency
    df_all = defaultdict(int) 
    df_adjs = defaultdict(int)
    
    # term frequency
    tf_all = defaultdict(int)
    tf_adjs = defaultdict(int)
    
    ii = 0
    total_steps = len(image_sents)
    # For each entry (doc)
    for image_name, list_of_sents_pos in image_sents.items():
        update_progress(ii/total_steps)
        #if ii == 1: break
        text = '. '.join(list_of_sents_pos[0])
        doc = nlp(text)
        
        tokens = []
        token_adjs = []
        
        for token in doc:
            if token.is_stop == False and token.is_punct == False:
                tokens.append(str(token.lemma_))
                if token.pos_ == 'ADJ':
                    token_adjs.append(str(token.lemma_))
        
        # This chunk is for term frequency counts
        total_words_doc = len(tokens)
        total_adjs_doc = len(token_adjs)
        count_words = Counter(tokens)
        count_adjs = Counter(token_adjs)
        
        tf_all[image_name] = {word:count_words[word]/total_words_doc for word in count_words.keys()}
        tf_adjs[image_name] = {adj:count_adjs[adj]/total_adjs_doc for adj in count_adjs.keys()}
        
        # This chunk is for document frequency counts
        tokens_unique = list(set(tokens))
        token_adjs_unique = list(set(token_adjs))
        
        for token in tokens_unique:
            df_all[token] += 1 
        for token in token_adjs_unique:
            df_adjs[token] += 1
        
        ii += 1
        
    return df_all, df_adjs, tf_all, tf_adjs

In [73]:
df_all, df_adjs, tf_all, tf_adjs = document_frequency(image_sents)

Progress: [####################] 100.0%


In [86]:
def tf_idf(df_all, df_adjs, tf_all, tf_adjs):
    """
    df_all: dictionary word: number of documents containing that word (dictionary word: N)
    df_adjs: dictionary word: number of documents containing that adj (dictionary adj: N)
    tf_all: dictionary image_name: {word: word_counts/total words}
    tf_adjs: dictionary image_name: {adj: adj_counts/total adjs}
    
    Returns:
    tf-idf_all: dictionary with image_name: {word:tf-idf score}
    tf-idf_adj: dictionary with image_name: {adj: tf-idf score}
    """
    total_number_docs = len(tf_all.keys())
    tf_idf_all = defaultdict(float)
    tf_idf_adjs = defaultdict(float)
    
    # For all words
    ii = 0
    total_steps = len(tf_all.keys())
    for key, val in tf_all.items():
        update_progress(ii/total_steps)
        #if ii == 1: break
        # Each image name is going to have each label with the tf-idf value 
        tf_idf_all[key] = {w:val[w] * np.log((total_number_docs+1)/(df_all[w]+1)) +1 for w in val}
        ii += 1
    
    # For all adjs
    ii = 0
    for key, val in tf_adjs.items():
        update_progress(ii/total_steps)
        tf_idf_adjs[key] = {a:val[a] * np.log((total_number_docs+1)/(df_adjs[a]+1)) +1 for a in val}
        ii += 1
        
    return tf_idf_all, tf_idf_adjs

In [87]:
tf_idf_all, tf_idf_adjs = tf_idf(df_all, df_adjs, tf_all, tf_adjs)

Progress: [####################] 99.1%


In [90]:
a = tf_idf_adjs['62a2e48f4b0d47f7159c3d68a79da7551dda6615.jpg']

In [91]:
{k: v for k, v in sorted(a.items(), key=lambda item: item[1], reverse=True)}

{'well-priced': 1.4601795334745078,
 'modern': 1.437268881379797,
 'country-friendly': 1.279781542251678,
 'three-piece': 1.2202387457839898,
 'well-made': 1.2113859443097397,
 'cute': 1.1962245369147158,
 'mutual': 1.1781724147232506,
 'high-backed': 1.175097892794079,
 'spacious': 1.1734642239934738,
 'occasional': 1.136851968095442,
 'three-legged': 1.1317698303913002,
 'online': 1.1258884898495598,
 'certain': 1.1217065656943046,
 'useful': 1.1201441264330274,
 'contemporary': 1.1154466936543481,
 'practical': 1.108760730747139,
 'physical': 1.1054532275017581,
 'main': 1.0947655277308481,
 'international': 1.0864011442774633,
 'old': 1.0729663560424263,
 'circular': 1.0711460994741182,
 'low': 1.068873806001108,
 'simple': 1.0446401563072765,
 'small': 1.040189851657591}

In [93]:
save_file_path = '../../json_files/cleaned/ADARI_v2/furniture/'
save_json(save_file_path + 'ADARI_furniture_tfidf_words.json', tf_idf_all)
save_json(save_file_path + 'ADARI_furniture_tfidf_adjs.json', tf_idf_adjs)

In [98]:
def select_top_K_adjs_dict(tf_idf_adjs, K):
    tf_idf_K = dict()
    ii = 0
    for image_name, val in tf_idf_adjs.items():
        s = {k: v for k, v in sorted(val.items(), key=lambda item: item[1], reverse=True)}
        tf_idf_K[image_name] = [adj for adj in list(s.keys())[:K]]
        ii +=1
    return tf_idf_K

In [99]:
topK_tf_idf = select_top_K_adjs_dict(tf_idf_adjs, 3)

In [100]:
topK_tf_idf

{'62a2e48f4b0d47f7159c3d68a79da7551dda6615.jpg': ['well-priced',
  'modern',
  'country-friendly'],
 '3523c82bd34c61967adf37ec2e29d48a5e7ec8ac.jpg': ['well-priced',
  'modern',
  'country-friendly'],
 '3bd82bce2d8355cde7d3e1ccf5a65fcb2646ed9b.jpg': ['work-from-home',
  'glossy',
  'green-stained'],
 '85f5902ce0b71a59bdf0266972245961178177b9.jpg': ['work-from-home',
  'glossy',
  'green-stained'],
 '44103e41775ea9df62261028e397c0507c862094.jpg': ['work-from-home',
  'glossy',
  'green-stained'],
 '9e9de9bd9a69686d309dcd33b7fbd753789d9e4a.jpg': ['work-from-home',
  'glossy',
  'green-stained'],
 'c4cd583f469f212c36c9d5515c622f053aaab6d0.jpg': ['work-from-home',
  'glossy',
  'green-stained'],
 '54f06bffd316bf19db9de2a3ad3a3c1f2ee9147f.jpg': ['work-from-home',
  'glossy',
  'green-stained'],
 'aaeadb2ee7f9ddeb1961219db61dd7b30c882fdc.jpg': ['modulus',
  'fermented',
  'predominant'],
 '8b98cdc23ac5e9b3d0387eb8857ee88deaccf279.jpg': ['modulus',
  'fermented',
  'predominant'],
 'b8f756b4f6

In [101]:
save_json(save_file_path + 'ADARI_furniture_tfidf_top3adjs.json', topK_tf_idf)