In [1]:
import json
import spacy
import unicodedata
from collections import Counter
from itertools import chain

In [33]:
# Plain English pipeline; only the commented-out stop-word line below refers to it
# in the cells shown here.
nlp = spacy.load('en_core_web_sm')
# Second, independent copy of the same model; word_tokenize() runs texts through this one.
nlp2 = spacy.load('en_core_web_sm')
# Adds spaCy's built-in "merge_entities" component to nlp2 only.
# NOTE(review): create_pipe(name) followed by add_pipe(component) looks like the
# spaCy v2 calling convention; spaCy v3 expects nlp2.add_pipe("merge_entities").
# Confirm which spaCy version this notebook is pinned to.
merge_ents = nlp2.create_pipe("merge_entities")
nlp2.add_pipe(merge_ents)
#all_stop_words = nlp.Defaults.stop_words

In [61]:
# Input paths of the cleaned JSON files (relative to the notebook's working directory).
furniture_path = "../json_files/cleaned/furniture_cleaned.json"
fashion_path = "../json_files/cleaned/fashion_cleaned.json"
# NOTE(review): this name lacks the `_path` suffix used by the two variables above.
wearable_tech = "../json_files/cleaned/wearable_tech_cleaned.json"

In [62]:
# Output paths.
# *_tokenized: complete file path the tokenized dataset is written to.
# *_contexts:  path prefix only; run() appends "_<window size>.json" to it,
#              producing one context file per window size.
furniture_tokenized = "../json_files/tokenized/furniture_tokenized.json"
furniture_contexts = "../json_files/context_words/furniture_context"

fashion_tokenized = "../json_files/tokenized/fashion_tokenized.json"
fashion_contexts = "../json_files/context_words/fashion_context"

wearable_tokenized = "../json_files/tokenized/wearable_tech_tokenized.json"
wearable_contexts = "../json_files/context_words/wearable_tech_context"

In [63]:
def open_json(path):
    """
    Reads the JSON file at `path` and returns the parsed data.

    Raises whatever open() or json.load() raise (missing file, invalid JSON).
    """
    # The context manager closes the file even when json.load raises;
    # the encoding is fixed so parsing does not depend on the platform locale.
    with open(path, encoding="utf-8") as f:
        return json.load(f)

def save_json(file_path, data):
    """
    Serializes `data` as JSON into `file_path`, overwriting an existing file.

    Returns nothing.
    """
    # The context manager flushes and closes the file even when json.dump raises.
    with open(file_path, "w", encoding="utf-8") as out_file:
        json.dump(data, out_file)

In [64]:
def word_tokenize(text):
    """
    Runs `text` through the nlp2 pipeline and describes each token as a
    dictionary with the keys 'word', 'tag' and 'pos'.
    Tokens tagged '_SP' are left out.
    Returns the list of dictionaries in token order.
    """
    described = []
    for token in nlp2(text):
        if token.tag_ == '_SP':
            continue
        described.append({'word': token.text, 'tag': token.tag_, 'pos': token.pos_})
    return described

def tokenize_json(json_file):
    """
    Tokenizes the 'text' field of every entry in json_file except the last one.
    Returns a list with one word_tokenize() result per processed entry.
    """
    # NOTE(review): the final entry is never tokenized; confirm this is
    # intentional (e.g. a trailing non-article record) and not an off-by-one.
    last_index = len(json_file) - 1
    return [
        word_tokenize(article['text'])
        for i, article in enumerate(json_file)
        if i != last_index
    ]

In [65]:
def flatten_(l):
    """
    Flattens `l` by exactly one level: the items of every element that is a
    list are spliced into the result, any other element (tuples included)
    is kept as it is.
    Returns a new list.
    """
    flat = []
    for item in l:
        flat.extend(item if isinstance(item, list) else [item])
    return flat

def create_contexts(dataset, window_size, outpath_tokenization, outpath_context, save_tok=True):
    """
    Takes in the processed dataset containing {word, tag, POS}.
    For every adjective (pos == 'ADJ') a context is built: up to window_size
    tokens before it, the adjective itself marked as (word, 'root'), and up
    to window_size tokens after it. Windows are cut off at the start and end
    of the article.

    dataset: list of articles, each a list of token dictionaries.
    window_size: maximum number of tokens kept on each side of the adjective.
    outpath_tokenization: file the dataset is saved to when save_tok is True.
    outpath_context: file the list of contexts is saved to.
    save_tok: whether the dataset itself is saved as well.

    Returns the list with all contexts (the same list that is saved).
    """
    all_contexts = []
    for article in dataset:                           # iterates over all articles
        for i, word in enumerate(article):            # iterates over all words in each article
            if word['pos'] != 'ADJ':                  # only adjectives get a context
                continue
            # max() keeps the start index from going negative near the
            # beginning of the article, where a negative index would wrap around.
            left = article[max(0, i - window_size):i]
            # A slice end beyond the article length is clamped by Python,
            # so no explicit check is needed near the end.
            right = article[i + 1:i + 1 + window_size]
            # The root marker is a tuple; json writes it as a two-element list.
            all_contexts.append([*left, (word, 'root'), *right])
    if save_tok:
        save_json(outpath_tokenization, dataset)
    save_json(outpath_context, all_contexts)
    return all_contexts

def run(path, out_token, out_context, window_sizes=(2, 3, 5, 7, 9)):
    """
    Tokenizes the JSON file at `path` and saves the adjective contexts for
    every window size.

    path: JSON file to read.
    out_token: file the tokenized dataset is saved to.
    out_context: path prefix of the context files; "_<size>.json" is appended
        for each window size.
    window_sizes: window sizes to create context files for.
    """
    # 1) Open file
    json_file = open_json(path)

    # 2) Tokenize file
    tokenized_file = tokenize_json(json_file)

    # 3) Get contexts and save
    for i, size in enumerate(window_sizes):
        context_path = out_context + "_{}.json".format(size)
        # The tokenized data is the same for every window size, so it is
        # only written on the first pass.
        create_contexts(tokenized_file, size, out_token, context_path, save_tok=(i == 0))

In [66]:
# Tokenize the furniture dataset and write its tokenized file plus one
# context file per window size.
run(furniture_path, furniture_tokenized, furniture_contexts)

In [67]:
# Load the window-size-9 context file for inspection.
# A dedicated name is used so the `furniture_contexts` path prefix (an input
# of run()) is not overwritten; otherwise re-running the run() cell after this
# one would write to paths like "..._context_9.json_2.json".
furniture_context_9_path = "../json_files/context_words/furniture_context_9.json"
a = open_json(furniture_context_9_path)

In [69]:
# First stored context; the adjective it was built around appears as the
# [token, 'root'] pair.
a[0]

[{'word': 'the', 'tag': 'DT', 'pos': 'DET'},
 {'word': 'VDF', 'tag': 'NNP', 'pos': 'PROPN'},
 {'word': 'x', 'tag': 'SYM', 'pos': 'SYM'},
 {'word': 'Sight', 'tag': 'NNP', 'pos': 'PROPN'},
 {'word': 'Unseen', 'tag': 'NNP', 'pos': 'PROPN'},
 {'word': 'collaboration', 'tag': 'NN', 'pos': 'NOUN'},
 {'word': ',', 'tag': ',', 'pos': 'PUNCT'},
 {'word': 'which', 'tag': 'WDT', 'pos': 'DET'},
 {'word': 'balances', 'tag': 'VBZ', 'pos': 'VERB'},
 [{'word': 'angular', 'tag': 'JJ', 'pos': 'ADJ'}, 'root'],
 {'word': ',', 'tag': ',', 'pos': 'PUNCT'},
 {'word': 'architectural', 'tag': 'JJ', 'pos': 'ADJ'},
 {'word': 'shapes', 'tag': 'NNS', 'pos': 'NOUN'},
 {'word': 'with', 'tag': 'IN', 'pos': 'ADP'},
 {'word': 'playful', 'tag': 'JJ', 'pos': 'ADJ'},
 {'word': ',', 'tag': ',', 'pos': 'PUNCT'},
 {'word': 'feminine', 'tag': 'JJ', 'pos': 'ADJ'},
 {'word': 'touches', 'tag': 'NNS', 'pos': 'NOUN'},
 {'word': '.', 'tag': '.', 'pos': 'PUNCT'}]