In [None]:
import hashlib
import json
import random
from dataclasses import asdict

from xid import XID

from library.types import *
from library.utils import *

In [None]:
def _load_json(path: str):
    """Read one JSON file and return the parsed Python object.

    The file is decoded explicitly as UTF-8 instead of relying on the
    platform's locale-dependent default encoding, so non-ASCII review text
    is read the same way on every machine.
    """
    with open(path, "r", encoding="utf-8") as file:
        return json.load(file)


# Raw review exports, one JSON file per source platform.
trustpilot_feedbacks = _load_json("artifacts/feedbacks/trustpilot.json")
app_store_feedbacks = _load_json("artifacts/feedbacks/app_store.json")
play_store_feedbacks = _load_json("artifacts/feedbacks/play_store.json")
amazon_feedbacks = _load_json("artifacts/feedbacks/amazon.json")

In [None]:
# Accumulates one dict per feedback (asdict(Feedback(...))) across all sources.
feedbacks = list()

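# TODO: Enhance the cleaning done by clean_content() below; it currently only trims whitespace and drops a title that the body already starts with.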


def clean_content(title: str, body: str) -> str:
    """Merge a review's title and body into a single content string.

    Both parts are stripped of surrounding whitespace first. If either part
    is empty the other one is returned on its own. If the body already
    starts with the title (compared after removing trailing punctuation
    and similar marks from the title, e.g. a truncating "..."), only the
    body is returned so the text is not repeated. Otherwise the result is
    the title, a newline, and the body.

    Note that a title made up only of the stripped characters reduces to
    the empty string, which every body starts with, so such a title is
    dropped as well.

    Args:
        title: Review title; may be empty.
        body: Review text; may be empty.

    Returns:
        The combined content string.
    """
    # Characters removed from the END of the title before the prefix check.
    trailing_marks = '!"#%&\'()*,./:;?@[\\]^_`{|}~\xA0¡¦§¨©ª«¬\xAD®¯²³´µ¶·¸¹º»¿‐‑‒–—―‖‗‘’‚‛“”„‟†‡•‣․‥…‧‰‱′″‴‵‶‷‸‹›※‼‽‾‿⁀⁁⁂⁃⁄⁅⁆⁇⁈⁉⁊⁋⁌⁍⁎⁏⁐⁑⁒⁓⁔⁕⁖⁗⁘⁙⁚⁛⁜⁝⁞™'

    heading = title.strip()
    text = body.strip()

    # At most one part has content: return it ("" when both are empty).
    if not (heading and text):
        return text or heading

    heading_stem = heading.rstrip(trailing_marks)
    already_in_body = text.startswith(heading_stem)
    return text if already_in_body else heading + "\n" + text


def _build_feedback(source: str, customer: str, title: str, body: str) -> dict:
    """Turn one raw review into a ``Feedback`` record serialized as a dict.

    Args:
        source: Label of the originating platform (e.g. ``"TRUSTPILOT"``).
        customer: Reviewer name, already defaulted by the caller when missing.
        title: Review title, ``""`` when absent.
        body: Review text, ``""`` when absent.

    Returns:
        ``asdict(Feedback(...))`` whose ``id`` is a fresh XID string and whose
        ``hash`` is the SHA-256 hex digest of ``source + customer + content``,
        where ``content`` is ``clean_content(title, body)``.
    """
    content = clean_content(title, body)

    # The digest is the deduplication key used further down. The three parts
    # are concatenated without a separator; that input format is deliberately
    # left unchanged so the resulting hashes stay the same as before.
    digest = hashlib.sha256((source + customer + content).encode()).hexdigest()

    return asdict(Feedback(
        id=XID().string(),
        hash=digest,
        context="",
        categories=[],
        source=source,
        customer=customer,
        content=content,
    ))


# One entry per platform: (source label, raw records, key inside
# record["user_profile"] whose value is used as the customer name).
# The order here determines the order in which records are appended.
_FEEDBACK_SOURCES = (
    ("TRUSTPILOT", trustpilot_feedbacks, "name"),
    ("APP_STORE", app_store_feedbacks, "profile_name"),
    ("PLAY_STORE", play_store_feedbacks, "profile_name"),
    ("AMAZON", amazon_feedbacks, "name"),
)

for source, raw_feedbacks, name_key in _FEEDBACK_SOURCES:
    for raw_feedback in raw_feedbacks:
        feedbacks.append(_build_feedback(
            source,
            # Falsy values (None, "") fall back to placeholders.
            raw_feedback["user_profile"][name_key] or "UNKNOWN",
            raw_feedback["title"] or "",
            raw_feedback["review_text"] or "",
        ))

# Deduplicate by hash, keeping the first record seen for each hash. A dict
# preserves insertion order, so the surviving records keep their relative order.
feedback_by_hash = {}
for feedback in feedbacks:
    feedback_by_hash.setdefault(feedback["hash"], feedback)
unique_feedbacks = list(feedback_by_hash.values())

print(f"{len(feedbacks) - len(unique_feedbacks)} feedbacks duplicated")

feedbacks = unique_feedbacks

# Mix the sources together. No seed is set, so the order differs between runs.
random.shuffle(feedbacks)

In [None]:
# Persist the deduplicated, shuffled feedback dicts as 2-space-indented JSON.
with open("artifacts/feedbacks/cleaned.json", mode="w") as cleaned_file:
    json.dump(feedbacks, fp=cleaned_file, indent=2)