In [1]:
import pandas as pd
import ast
import openai
from transformers import GPT2Tokenizer
import json

import sys
sys.path.append('../backend')
from preprocessorV2 import *
from topic_modelling_LDA import fit_lda

import warnings
warnings.filterwarnings("ignore") 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the scraped product reviews and keep only the two columns used below:
# 'data' (string repr of a dict, parsed later with ast.literal_eval) and
# 'nama_produk' (product name).
df = pd.read_csv('data/product_review.csv')[['data', 'nama_produk']]
df.tail()

Unnamed: 0,data,nama_produk
546,"{'review': {0: 'sesuai', 1: 'barang sesuai, pe...",MANS SEARCH FOR MEANING
547,{'review': {0: 'Barang diterima dalam kondisi ...,Knorr Chicken Seasoning Powder 200g Free Sendo...
548,{'review': {0: 'pesanan sampai kemasannya aman...,Twin Pack: Tropicana Slim Diabtx 50 Sch - Peng...
549,"{'review': {0: 'good seller, pengiriman cepat'...",TOLAK ANGIN PLUS MADU BOX 12 PCS
550,{'review': {0: 'Seneeeng banget akhirnya paket...,Paket Biskuit Slai Olai All Variant Free Tote Bag


# Data Cleaning

In [3]:
def preprocessor(data):
    """Run the review-cleaning pipeline over a collection of raw reviews.

    The four steps come from the local ``preprocessorV2`` module and are
    applied in order: ``clean_data`` -> ``tokenizer`` -> ``remove_stop_words``
    -> ``bigram``. Their exact behaviour is defined in that module.

    Parameters
    ----------
    data : iterable
        Raw review texts (``prepare_data`` passes ``dict.values()``).

    Returns
    -------
    Whatever ``bigram`` returns; ``prepare_data`` indexes it positionally.
    """
    # Import the helpers explicitly instead of relying on the notebook-global
    # names from the star import: a later cell rebinds the global ``tokenizer``
    # to a GPT-2 tokenizer, which would otherwise be picked up here when the
    # cleaning cells are re-run out of order.
    from preprocessorV2 import clean_data, tokenizer, remove_stop_words, bigram

    data = clean_data(data)
    data = tokenizer(data)
    data = remove_stop_words(data)
    data = bigram(data)

    return data

def prepare_data(df):
    """Summarise every product as rating counts plus one LDA topic per star.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'data'`` column whose cells are the string repr of a
        dict with ``'review'`` and ``'rating'`` entries (each itself a dict of
        values), and a ``'nama_produk'`` column.

    Returns
    -------
    list of dict
        One dict per input row, in row order, with keys ``'nama_produk'``,
        ``'jumlah_rating'`` (rating -> number of reviews) and ``'topic'``
        (rating -> topic produced by ``fit_lda``; ratings for which
        ``fit_lda`` fails are simply absent).
    """
    products = []
    # Iterate positionally so the function also works on frames whose index is
    # not the default 0..n-1 range (e.g. after filtering or sorting).
    for raw_record, nama_produk in zip(df['data'], df['nama_produk']):
        record = ast.literal_eval(raw_record)
        processed = preprocessor(record['review'].values())

        reviews = pd.DataFrame({
            "preprocessed": [processed[j] for j in range(len(processed))],
            "rating": list(record['rating'].values()),
        })
        n_rating = reviews['rating'].value_counts().to_dict()

        topic = {}
        for k in range(1, 6):
            try:
                data_train = reviews[reviews['rating'] == k]['preprocessed']
                _, t = fit_lda(data_train)
            except Exception:
                # Best effort: a star rating for which no topic can be fitted
                # (presumably too few or no reviews) is left out of the result.
                continue
            topic[k] = t

        products.append({
            'nama_produk': nama_produk,
            'jumlah_rating': n_rating,
            'topic': topic,
        })
    return products

def generate_prompt(data):
    """Render one text prompt per product summary.

    Parameters
    ----------
    data : sequence of dict
        Items with keys ``'nama_produk'``, ``'jumlah_rating'`` and ``'topic'``
        (the structure returned by ``prepare_data``).

    Returns
    -------
    list of str
        Prompts in the same order as ``data``. Each prompt starts with the
        product name, rating counts and topics, followed by a fixed
        Indonesian-language instruction paragraph.
    """
    def _render(entry):
        nama_produk, jumlah_rating, topic = (
            entry['nama_produk'],
            entry['jumlah_rating'],
            entry['topic'],
        )
        # The continuation lines keep their leading whitespace inside the
        # string on purpose: downstream cells slice the prompt by fixed offsets.
        return f"""nama {nama_produk}
        jumlah review per bintang (1-5) : {jumlah_rating}
        topik per bintang : {topic}

        buatkan 1 paragraf yang menggambarkan produk tersebut berdasarkan hasil topik dari model LDA. Topik tersebut dihasilkan dari analisis review-review yang diberikan oleh pelanggan di sebuah marketplace. Pertimbangkan bobot setiap bintang dilihat dari jumlah reviewnya. berikanlah pertimbangan atau kelebihan dan kekurangan dari produk ini. Anggap bahwa yang membaca paragraf ini adalah orang awam, dan berikan informasi yang relevan dan sesuai dengan topik tersebut. Tulislah dengan gaya yang kreatif dan mudah dipahami. tidak perlu menyebutkan angka agar memudahkan pembaca memahami."""

    return [_render(data[idx]) for idx in range(len(data))]

In [4]:
# Build the per-product summaries (rating counts + LDA topics) and attach
# them to the frame; the last line previews the first summary.
preprocessed_data = prepare_data(df)
df['preprocessed'] = preprocessed_data
preprocessed_data[:1]

[{'nama_produk': 'Waterspot Remover - Obat Anti Jamur Penghilang Noda Kerak Kaca Mobil - 250ML + Lap',
  'jumlah_rating': {5: 120, 4: 18, 3: 15, 1: 7, 2: 3},
  'topic': {1: 'kaca sprayer fungsi paket buruk spray kering dipake dikit utama',
   3: 'spray rusak semprotannya barang nyemprot botolnya cuci jamur dipake kaca',
   4: 'berfungsi semprotan dicoba proses botol bagus semprotannya sayang rusak coba',
   5: 'barang sesuai dicoba cepat semoga sesuai_pesanan seller coba kaca_kinclong packing'}}]

In [5]:
# Turn every product summary into a prompt string and store it alongside
# the source row; the last line previews the first prompt.
texts = generate_prompt(preprocessed_data)
df['text'] = texts
texts[:1]

["nama Waterspot Remover - Obat Anti Jamur Penghilang Noda Kerak Kaca Mobil - 250ML + Lap\n        jumlah review per bintang (1-5) : {5: 120, 4: 18, 3: 15, 1: 7, 2: 3}\n        topik per bintang : {1: 'kaca sprayer fungsi paket buruk spray kering dipake dikit utama', 3: 'spray rusak semprotannya barang nyemprot botolnya cuci jamur dipake kaca', 4: 'berfungsi semprotan dicoba proses botol bagus semprotannya sayang rusak coba', 5: 'barang sesuai dicoba cepat semoga sesuai_pesanan seller coba kaca_kinclong packing'}\n\n        buatkan 1 paragraf yang menggambarkan produk tersebut berdasarkan hasil topik dari model LDA. Topik tersebut dihasilkan dari analisis review-review yang diberikan oleh pelanggan di sebuah marketplace. Pertimbangkan bobot setiap bintang dilihat dari jumlah reviewnya. berikanlah pertimbangan atau kelebihan dan kekurangan dari produk ini. Anggap bahwa yang membaca paragraf ini adalah orang awam, dan berikan informasi yang relevan dan sesuai dengan topik tersebut. Tulis

In [6]:
# Sanity check: the frame now carries the 'preprocessed' and 'text' columns.
df.head()

Unnamed: 0,data,nama_produk,preprocessed,text
0,"{'review': {0: 'spray tdk bisa digunakan,caira...",Waterspot Remover - Obat Anti Jamur Penghilang...,{'nama_produk': 'Waterspot Remover - Obat Anti...,nama Waterspot Remover - Obat Anti Jamur Pengh...
1,"{'review': {0: 'barang cepat diproses, selamat...","NPURE NONI PROBIOTICS ""BALANCE ME"" FINE TONER","{'nama_produk': 'NPURE NONI PROBIOTICS ""BALANC...","nama NPURE NONI PROBIOTICS ""BALANCE ME"" FINE T..."
2,{'review': {0: 'Udah langganan disini. Seneng ...,Cough & Flu Essential Oil Blend Batuk Pilek W ...,{'nama_produk': 'Cough & Flu Essential Oil Ble...,nama Cough & Flu Essential Oil Blend Batuk Pil...
3,"{'review': {0: 'terimakasih👍👍', 1: 'Paket dite...",Palmolive Naturals Milk & Orchid Sabun Mandi 1...,{'nama_produk': 'Palmolive Naturals Milk & Orc...,nama Palmolive Naturals Milk & Orchid Sabun Ma...
4,"{'review': {0: 'Pengiriman cepat, packing rapi...",Slimsure Burn Fat Capsule (Menghancurkan dan M...,{'nama_produk': 'Slimsure Burn Fat Capsule (Me...,nama Slimsure Burn Fat Capsule (Menghancurkan ...


In [7]:
# GPT-2 tokenizer, used below only to count tokens for cost estimation.
# NOTE(review): this rebinds the notebook-global name `tokenizer`, shadowing
# any earlier global of the same name (e.g. one brought in by the star import
# at the top). Code that resolves `tokenizer` from globals will see the GPT-2
# tokenizer once this cell has run.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [8]:
# Count tokens for every prompt with the notebook-global `tokenizer`, store
# the counts on the frame, then print total, mean and the estimated prompt
# cost at 0.00002 USD per token (the completion side is not included).
n_token_text = [len(tokenizer(prompt_text)['input_ids']) for prompt_text in df['text']]
df['n_token_text'] = n_token_text

total_prompt_tokens = df['n_token_text'].sum()
mean_prompt_tokens = df['n_token_text'].mean()
print(f"""N Token: {total_prompt_tokens}
Rerata: {mean_prompt_tokens}
Biaya Text Completion dengan Davinci:
N Token * 0.00002 = ${round(0.00002 * total_prompt_tokens, 3)} *belum termasuk response
""")

N Token: 197578
Rerata: 358.58076225045374
Biaya Text Completion dengan Davinci:
N Token * 0.00002 = $3.952 *belum termasuk response



# Create Synthetic Data

In [9]:
def generate_text(text):
    """Request a completion for ``text`` from the ``text-davinci-003`` model.

    The API key is read from the ``OPENAI_API_KEY`` environment variable; it
    must never be committed to the notebook. The key that used to be
    hardcoded here has been exposed in source and should be revoked.

    Parameters
    ----------
    text : str
        Prompt to send to the model.

    Returns
    -------
    The response object returned by ``openai.Completion.create``.

    Raises
    ------
    RuntimeError
        If ``OPENAI_API_KEY`` is unset or empty.
    """
    import os

    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError(
            "OPENAI_API_KEY environment variable is not set; "
            "export it before calling generate_text()."
        )
    openai.api_key = api_key

    # temperature=0 keeps the generated paragraph as deterministic as the API allows.
    response = openai.Completion.create(
        model="text-davinci-003",
        prompt=f"{text}",
        temperature=0,
        max_tokens=700,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    return response

In [10]:
# df['response_davinci'] = None

In [11]:
# for i in range(440, 441):
#     while True:
#         try:
#             r = generate_text(df['text'][i])
#             df['response_davinci'][i] = r.to_dict()
#             df.to_csv('data_full.csv')
#             print(f"Berhasil {i+1}/{len(texts)}", end='\r')
#             break
#         except Exception as e:
#             if 'Error communicating with OpenAI' in e or 'ServiceUnavailableError' in e:
#                 pass
#             else:
#                 print(e)

In [12]:
# Reload the frame saved by the (commented-out) generation loop above, which
# wrote it with df.to_csv('data_full.csv'); 'Unnamed: 0' is the index column
# that to_csv added. This rebinds `df`, and the cell fails with
# FileNotFoundError when data_full.csv is absent from the working directory.
df = pd.read_csv('data_full.csv').drop('Unnamed: 0', axis=1)
df.tail()

FileNotFoundError: [Errno 2] No such file or directory: 'data_full.csv'

# Cost Estimation

### Cost for Creating Synthetic Data

In [None]:
# Pull the token usage reported in each stored API response and price the
# whole synthetic-data run at 0.00002 USD per token.
usage_records = [json.loads(df['response_davinci'][row])['usage'] for row in range(len(df))]
n_token_completion = [usage['completion_tokens'] for usage in usage_records]
n_total_token = [usage['total_tokens'] for usage in usage_records]

df['n_token_completion'] = n_token_completion
df['n_total_token'] = n_total_token

grand_total = df['n_total_token'].sum()
print(f"""Total token: {grand_total}
Cost: $0.00002 * {grand_total} = ${round(grand_total*0.00002, 3)}""")

Total token: 348285
Cost: $0.00002 * 348285 = $6.966


### Cost for fine tuning

In [None]:
# Build the fine-tuning prompts from the generation prompts:
#   1. flatten newlines and collapse the 9-space indentation runs to one space;
#   2. [5:-584] drops the leading "nama " and, presumably, the fixed instruction
#      paragraph that ends every prompt -- TODO confirm 584 still matches if the
#      template in generate_prompt ever changes;
#   3. append the ' ->' separator that marks the end of the prompt.
prompt = [
    df['text'][row].replace('\n', ' ').replace('         ', ' ')[5:-584] + ' ->'
    for row in range(len(df))
]

In [None]:
# Fine-tuning completions: the first choice's text of every stored response,
# stripped of surrounding whitespace and terminated with the ' END' marker.
rd = df['response_davinci']
response = [
    json.loads(rd[row])['choices'][0]['text'].strip() + ' END'
    for row in range(len(df))
]

In [None]:
def n_token(list_):
    """Return the total token count of all strings in ``list_``.

    Each item is encoded with the notebook-global ``tokenizer`` and the
    lengths of the resulting ``input_ids`` are summed. An empty input
    yields 0.
    """
    return sum(len(tokenizer(item)['input_ids']) for item in list_)

In [None]:
# Tokenize each corpus exactly once; the original expression re-tokenized
# both lists four times each just to fill in the report.
n_prompt_tokens = n_token(prompt)
n_response_tokens = n_token(response)
n_finetune_tokens = n_prompt_tokens + n_response_tokens

# Fine-tuning cost estimate: 0.000003 USD per token, trained for 2 epochs.
print(f"""Token Prompt: {n_prompt_tokens}
Token response: {n_response_tokens}
Total Token: {n_finetune_tokens}
N Epochs: 2
Cost: $0.000003 * {n_finetune_tokens} * 2 = ${round(0.000003 * n_finetune_tokens * 2, 3)}""")

Token Prompt: 70407
Token response: 159402
Total Token: 229809
N Epochs: 2
Cost: $0.000003 * 229809 * 2 = $1.379


# Data Preparation

In [None]:
# Attach the fine-tuning pairs to the frame and keep just those two columns
# as the training dataset; the last line previews the first rows.
df['prompt'] = prompt
df['completion'] = response

dataset = df[['prompt', 'completion']]
dataset.head()

Unnamed: 0,prompt,completion
0,Waterspot Remover - Obat Anti Jamur Penghilang...,Waterspot Remover adalah obat anti jamur yang ...
1,"NPURE NONI PROBIOTICS ""BALANCE ME"" FINE TONER ...","NPURE NONI PROBIOTICS ""BALANCE ME"" FINE TONER ..."
2,Cough & Flu Essential Oil Blend Batuk Pilek W ...,Cough & Flu Essential Oil Blend Batuk Pilek W ...
3,Palmolive Naturals Milk & Orchid Sabun Mandi 1...,Palmolive Naturals Milk & Orchid Sabun Mandi 1...
4,Slimsure Burn Fat Capsule (Menghancurkan dan M...,Slimsure Burn Fat Capsule adalah produk yang d...


In [None]:
# Persist the prompt/completion pairs.
# NOTE(review): with pandas' default index=True the row index is written as an
# extra unnamed first column -- confirm the fine-tuning tooling tolerates it.
dataset.to_csv('dataset.csv')