In [216]:
import json
import spacy
import unicodedata
from collections import Counter
from html.parser import HTMLParser

In [217]:
# Shared spaCy pipeline; Parser.parse_example calls it to tag each text segment.
nlp = spacy.load("en_core_web_sm")

In [218]:
# Raw scraped datasets, one JSON file per category, all in the same folder
# (relative to the notebook's working directory).
JSON_DIR = "../dataset/json_files"
fashion_path = JSON_DIR + "/fashion_.json"
furniture_path = JSON_DIR + "/furniture_.json"
wearable_tech_path = JSON_DIR + "/wearable_tech.json"

In [219]:
class MLStripper(HTMLParser):
    """HTML parser that keeps only the text data it is handed, dropping tags."""

    def __init__(self):
        # HTMLParser.__init__ already resets the parser state, so no separate
        # reset() call is required afterwards.
        super().__init__()
        # Text chunks reported by the parser so far, in document order.
        self.fed = []

    def handle_data(self, d):
        """Store one chunk of text data reported by the parser."""
        self.fed += [d]

    def get_data(self):
        """Return every stored chunk concatenated into a single string."""
        chunks = self.fed
        return "".join(chunks)

In [250]:
class Parser(object):
    """Clean a scraped JSON dataset and collect adjective statistics.

    The input file is expected to contain a list of examples, each a dict
    providing the keys ``'text'``, ``'images'``, ``'title'`` and ``'date'``
    (the keys ``run`` reads). ``run`` writes a cleaned copy of the dataset,
    followed by one extra entry with adjective counts, to ``out_path``.
    """

    def __init__(self, path, out_path):
        """Load the dataset stored at ``path``.

        Args:
            path: location of the raw JSON file to read.
            out_path: location where ``run`` writes the cleaned JSON file.
        """
        self.path = path
        self.out_path = out_path
        self.stripper = MLStripper()
        self.data = self.open_json(path)

    def open_json(self, path):
        """Read the JSON file at ``path`` and return the decoded object."""
        # The context manager closes the file even if decoding fails.
        with open(path, encoding="utf-8") as f:
            return json.load(f)

    def clean_html(self, text):
        """Strip HTML markup from a list of text fragments.

        Args:
            text: iterable of strings; the fragments are joined with a
                single space before parsing.

        Returns:
            The text content of the joined fragments, NFKD-normalized.
        """
        stripper = self.stripper
        # Start from a clean parser: anything left unparsed by an earlier
        # call must not be prepended to this input.
        stripper.reset()
        stripper.fed = []
        try:
            stripper.feed(' '.join(text))
            # close() forces the parser to emit text it is still buffering
            # (for example a trailing "AT&T", which feed() alone holds back
            # while waiting for a possible character reference).
            stripper.close()
            data = stripper.get_data()
        finally:
            # Never leave collected chunks behind for the next call.
            stripper.fed = []
        return unicodedata.normalize("NFKD", data)

    def clean_images(self, images):
        """Return the ``'path'`` entry of every image record in ``images``."""
        return [image['path'] for image in images]

    def clean_text(self, text):
        """
        Input: text is a list with sentences (the raw text from webscraping).
        Returns: a string with the sentences concatenated (no separator).
        """
        return "".join(text)

    def get_quotes(self, text):
        """
        Split ``text`` into the parts inside and outside double quotes.

        Only the straight double-quote character (") is recognized.

        Input:    cleaned text as string
        Returns:  two lists, ``(quotes, no_quotes)``: the text found between
                  matched pairs of quotes, and the remaining text segments.
                  If the last quote has no closing partner, the text after
                  it is returned as the final ``no_quotes`` segment.
        """
        # Splitting once on the quote character yields alternating
        # outside/inside segments: even positions are outside quotes, odd
        # positions are inside.
        parts = text.split('"')

        # An even number of parts means an odd number of quote characters,
        # i.e. the final opening quote is never closed. The segment after it
        # is not a real quotation.
        dangling = parts.pop() if len(parts) % 2 == 0 else None

        quotes = parts[1::2]
        no_quotes = parts[0::2]
        if dangling is not None:
            no_quotes.append(dangling)

        return quotes, no_quotes

    def extract_adjectives(self, doc):
        """Return the lower-cased text of every token in ``doc`` tagged 'ADJ'.

        ``doc`` is iterated for tokens exposing ``pos_`` and ``text``.
        """
        return [token.text.lower() for token in doc if token.pos_ == 'ADJ']

    def has_text(self, example):
        """Return False when ``example['text']`` is an empty list, else True."""
        return example['text'] != []

    def has_images(self, example):
        """Return False when ``example['images']`` is an empty list, else True."""
        return example['images'] != []

    def parse_example(self, list_of_sentences):
        """
        Input: list with strings
        Returns: list with the adjectives found in all strings, in order
        """
        adjectives = []
        for sent in list_of_sentences:
            # `nlp` is the module-level spaCy pipeline.
            adjectives.extend(self.extract_adjectives(nlp(sent)))
        return adjectives

    def save_json(self, file_path, data):
        """Serialize ``data`` as JSON into ``file_path`` (overwriting it)."""
        # The context manager flushes and closes the file even on errors.
        with open(file_path, "w", encoding="utf-8") as out_file:
            json.dump(data, out_file)

    def run(self):
        """Clean every usable example and write the result to ``out_path``.

        Examples whose text or image list is empty are skipped. Each kept
        example is stored with its cleaned text, image paths, title, date,
        quoted / unquoted segments and the adjectives found in each group.
        A final entry holds the adjective counts over the whole dataset:
        ``'reporters'`` counts adjectives outside quotes, ``'authors'``
        counts adjectives inside quotes.
        """
        new_data = []
        reporters_counts = Counter()
        authors_counts = Counter()

        for example in self.data:
            if not (self.has_text(example) and self.has_images(example)):
                continue

            cleaned_text = self.clean_html(example['text'])
            quotes, no_quotes = self.get_quotes(cleaned_text)
            adj_quotes = self.parse_example(quotes)
            adj_no_quotes = self.parse_example(no_quotes)

            new_data.append({
                'text': cleaned_text,
                'images': self.clean_images(example['images']),
                'title': example['title'],
                'date': example['date'],
                'quotes': quotes,
                'no_quotes': no_quotes,
                'adj_quotes': adj_quotes,
                'adj_no_quotes': adj_no_quotes,
            })

            reporters_counts.update(adj_no_quotes)
            authors_counts.update(adj_quotes)

        # The statistics are stored as the last element of the output list.
        new_data.append({
            'reporters': reporters_counts,
            'authors': authors_counts,
        })

        self.save_json(self.out_path, new_data)

In [251]:
# Output locations for the cleaned datasets. Relative paths (in the same style
# as the input paths) keep the notebook runnable on other machines and avoid
# embedding a user-specific home directory.
# NOTE(review): assumes the previously hard-coded absolute
# ".../PHD-CD Research/dataset/json_files" folder is the same directory as
# "../dataset/json_files" relative to this notebook — confirm.
fashion_new_path = "../dataset/json_files/fashion_cleaned.json"
wearable_tech_new_path = "../dataset/json_files/wearable_tech_cleaned.json"
furniture_new_path = "../dataset/json_files/furniture_cleaned.json"


In [254]:
# Build the parser for the furniture dataset; constructing it loads the raw JSON.
p = Parser(path=furniture_path, out_path=furniture_new_path)

In [255]:
# Clean the loaded dataset and write the result to the parser's output path.
p.run()