# Memahami Data Pantun Yang Baru kita Kikis (Scraping) dari laman sesawang
Berpandukan laman sesawang sejuta pantun, saya telah pun mengikis sehingga 400 m/s pantun dengan jumlah melebih 5000 koleksi pantun. Koleksi pantun ini perlu disusun dalama bentuk dataset yang membolehkan kita melakukan fine-tuning.

In [1]:
import csv

# Define the input and output file paths
input_file = 'pantun_dataset2.txt'
output_file = 'structured_pantun.csv'

# Open the input file and parse its content
with open(input_file, 'r', encoding='utf-8') as f:
    pantuns = f.read().strip().split('\n\n')  # Split by double newline to get each pantun stanza

# Create the CSV file to save the structured data
with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['Pantun_ID', 'Line_1', 'Line_2', 'Line_3', 'Line_4']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # Iterate over each pantun and write to CSV
    for i, pantun in enumerate(pantuns):
        lines = pantun.split('\n')
        if len(lines) == 4:  # Ensure it's a complete pantun with 4 lines
            writer.writerow({
                'Pantun_ID': f'Pantun_{i + 1}',
                'Line_1': lines[0],
                'Line_2': lines[1],
                'Line_3': lines[2],
                'Line_4': lines[3]
            })

print("Pantun structuring completed. Data saved to structured_pantun.csv.")

Pantun structuring completed. Data saved to structured_pantun.csv.


In [2]:
import pandas as pd
import numpy as np

df= pd.read_csv('structured_pantun.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2428 entries, 0 to 2427
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Pantun_ID  2428 non-null   object
 1   Line_1     2428 non-null   object
 2   Line_2     2428 non-null   object
 3   Line_3     2428 non-null   object
 4   Line_4     2428 non-null   object
dtypes: object(5)
memory usage: 95.0+ KB


In [3]:
df.tail(5)

Unnamed: 0,Pantun_ID,Line_1,Line_2,Line_3,Line_4
2423,Pantun_16893,"Hujan dimuara air nya deras,",Payong sekaki kongsi sama;,"Radio klasik sentiasa di hias,",Dengan lagu baru dan lama.
2424,Pantun_16894,"Hujan di muara airnya deras,",Payong sekaki kongsi sama;,"Banjar datang harta benda terjejas,",Demi keselamatan jagalah bersama.
2425,Pantun_16895,Hujan dimuara airnya deras.,Payung sekaki berkongsi sama;,"Musim ketengkujuh perlu peka dan cerdas,",Keselamatan keluarga tanggungjawab bersama.
2426,Pantun_16896,"Hujung dimuara airnya deras,",Payung sekali berkongsi sama;,"Bergotong royong janganlah malas,",Saat banjir nyawa diutama.
2427,Pantun_16897,"Terang bulan bagai diukir,",Tupai mengusik bunga durian;,"Sebarang perbuatan habiskan fikir,",Tidak menyesal hari kemudian.


In [29]:
import csv
import json

# Define the input and output file paths
input_file = 'pantun_dataset2.txt'
output_csv_file = 'structured_pantun.csv'
output_jsonl_file = 'pantun_finetune_dataset.jsonl'

# Open the input file and parse its content
with open(input_file, 'r', encoding='utf-8') as f:
    pantuns = f.read().strip().split('\n\n')  # Split by double newline to get each pantun stanza

# Create the CSV file to save the structured data
with open(output_csv_file, 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['Pantun_ID', 'Line_1', 'Line_2', 'Line_3', 'Line_4']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # Iterate over each pantun and write to CSV
    for i, pantun in enumerate(pantuns):
        lines = pantun.split('\n')
        if len(lines) == 4:  # Ensure it's a complete pantun with 4 lines
            writer.writerow({
                'Pantun_ID': f'Pantun_{i + 1}',
                'Line_1': lines[0],
                'Line_2': lines[1],
                'Line_3': lines[2],
                'Line_4': lines[3]
            })

print("Pantun structuring completed. Data saved to structured_pantun.csv.")

# Create the JSONL file for fine-tuning
with open(output_jsonl_file, 'w', encoding='utf-8') as jsonlfile:
    with open(output_csv_file, 'r', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            # Create a prompt-completion pair
            prompt = f"{row['Line_1']}\n{row['Line_2']}"
            completion = f"\n{row['Line_3']}\n{row['Line_4']}"
            
            # Write the pair in JSONL format
            json.dump({"prompt": prompt, "completion": completion}, jsonlfile)
            jsonlfile.write('\n')

print("Pantun fine-tuning dataset saved to pantun_finetune_dataset.jsonl.")

Pantun structuring completed. Data saved to structured_pantun.csv.
Pantun fine-tuning dataset saved to pantun_finetune_dataset.jsonl.


In [5]:
# Code to view the JSONL file in Jupyter Notebook
import pandas as pd

# Load the JSONL file into a DataFrame for easy viewing
jsonl_file = 'pantun_finetune_dataset.jsonl'

with open(jsonl_file, 'r', encoding='utf-8') as f:
    lines = [json.loads(line) for line in f]

# Create a DataFrame from the JSONL lines
df2 = pd.DataFrame(lines)

# Display the first few rows of the DataFrame
df2.head()

Unnamed: 0,prompt,completion
0,"Burung dara, burung nuri,\nBernyanyi bersama d...","\nTak kira berapa yg diberi,\nBiar keikhlasan ..."
1,"Tangan memberi dengan ikhlas,\nTangan menerima...","\nSenyum ku beri dia membalas,\nJalan kedepan ..."
2,"Tersandung kaki tersilap langkah,\nLutut berda...","\nSucikan hati dengan bersedekah,\nAgar hidup ..."
3,"Pokok turi tinggi segalah,\nDi hujung taman te...","\nIkhlas memberi kerana ALLAH,\nSubur iman ber..."
4,"Majlis meriah ramai kenalan,\nAntara disaji pu...","\nSedekah jariah bukan tontonan,\nBiarlah menj..."


In [6]:
!pip install tiktoken



In [7]:
import json
import pandas as pd
from dotenv import load_dotenv
import os
import tiktoken

# Load environment variables from .env file
load_dotenv()

# Load the JSONL file into a DataFrame for easy viewing
jsonl_file = 'pantun_finetune_dataset.jsonl'

with open(jsonl_file, 'r', encoding='utf-8') as f:
    lines = [json.loads(line) for line in f]

# Create a DataFrame from the JSONL lines
df = pd.DataFrame(lines)

# Display the first few rows of the DataFrame
df.head()

# Calculate the number of tokens for each prompt and completion
def num_tokens_from_string(string: str, encoding_name: str = "gpt-3.5-turbo") -> int:
    encoding = tiktoken.encoding_for_model(encoding_name)
    return len(encoding.encode(string))

# Add columns for the number of tokens in each prompt and completion
df['prompt_tokens'] = df['prompt'].apply(lambda x: num_tokens_from_string(x))
df['completion_tokens'] = df['completion'].apply(lambda x: num_tokens_from_string(x))
df['total_tokens'] = df['prompt_tokens'] + df['completion_tokens']

# Display the total number of tokens in the dataset
total_tokens = df['total_tokens'].sum()
print("Total number of tokens in the dataset:", total_tokens)

Total number of tokens in the dataset: 103394


In [8]:
!pip install openai==0.28



In [19]:
#mari buat pantun
import openai
from dotenv import load_dotenv
import os
import tiktoken
import time

# Load environment variables from a .env file (optional, if you use one)
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

# Function to use the fine-tuned model
# Function to use the fine-tuned model, ensuring only 4-line stanza pantun is returned
def generate_pantun(prompt, model="ft:davinci-002:personal:pantun-eksperimen3:AHhDFKx3", temperature=0.7, max_tokens=150):
    response = openai.Completion.create(
        model=model,
        prompt=prompt,
        temperature=temperature,
        max_tokens=max_tokens,
        stop="\n\n"
    )
    pantun = response.choices[0].text.strip()
    # Ensure the generated response contains exactly 4 lines (4 stanza pantun)
    lines = pantun.split("\n")
    return ''.join(lines[:4]) 

# Example usage of the fine-tuned model
prompt = "Pergi mengail di tepi tasik"
response = generate_pantun(prompt)
print("Generated pantun:", response)

Generated pantun: 


In [26]:
import csv
import re

# Define the input and output file paths
input_file = 'pantun_dataset3.txt'
output_file = 'structured_pantun2.csv'

# Open the input file and parse its content
with open(input_file, 'r', encoding='utf-8') as f:
    pantuns = f.read().strip().split('\n\n')  # Split by double newline to get each pantun stanza

# Create the CSV file to save the structured data
with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['Pantun_ID', 'Line_1', 'Line_2', 'Line_3', 'Line_4']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # Iterate over each pantun and write to CSV
    for i, pantun in enumerate(pantuns):
        # Split the pantun by punctuation marks followed by whitespace or newlines
        lines = re.split(r'[.,;!?]\s+|\n', pantun)
        lines = [line.strip() for line in lines if line.strip() != '']  # Remove empty lines
        
        # Group lines into sets of 4
        for j in range(0, len(lines), 4):
            if j + 4 <= len(lines):
                writer.writerow({
                    'Pantun_ID': f'Pantun_{i + 1}_{(j // 4) + 1}',
                    'Line_1': lines[j],
                    'Line_2': lines[j + 1],
                    'Line_3': lines[j + 2],
                    'Line_4': lines[j + 3]
                })

print("Pantun structuring completed. Data saved to structured_pantun2.csv.")

Pantun structuring completed. Data saved to structured_pantun2.csv.


In [27]:
import pandas as pd
import numpy as np

df3= pd.read_csv('structured_pantun2.csv')
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2998 entries, 0 to 2997
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Pantun_ID  2998 non-null   object
 1   Line_1     2998 non-null   object
 2   Line_2     2998 non-null   object
 3   Line_3     2998 non-null   object
 4   Line_4     2998 non-null   object
dtypes: object(5)
memory usage: 117.2+ KB


In [28]:
df3

Unnamed: 0,Pantun_ID,Line_1,Line_2,Line_3,Line_4
0,Pantun_2541_1,Lemang,lontong,rendang,"ketupat,"
1,Pantun_7181_1,Norma baru antaranya berwebinar,Penyertaan meluas ke luar negeri,Walaupun rentas negeri telah dibenar,Berhati-hatilah dan jagalah diri.
2,Pantun_7182_1,Rayuan datang demi rayuan,Jangan mudah undi diberi,Rumahtangga dibina dengan kesucian,Keberkatan hidup itulah yang dicari.
3,Pantun_7183_1,Masak gulai ikan tenggiri,Dicampur dengan santan kelapa,Usahlah digadai maruah diri,Laba duniawi yang tidak seberapa.
4,Pantun_7185_1,Kalau berkunjung ke negeri Perlis,Rasailah kari ikan nyuk-nyuk,eruskan berfikir,membaca menulis
...,...,...,...,...,...
2993,Pantun_19013_1,Ada 3 orang dalam sebuah kapal laut,seorang nabi,seorang maharaja dan seorang hamba abdi,Kapal laut itu pun tenggelam
2994,Pantun_19015_1,Biar mati anak,"jangan mati adat,Tak lapuk dek hujan",tak lekang dek panas.Bersatu kita teguh,bercerai kita roboh.Tak kenal maka tak cinta.
2995,Pantun_19017_1,"""Jika dunia ini persinggahan",mengapa tidak kita perbanyakkan bekalan untuk ...,"kerana kita cuma ada satu persinggahan"" ""Dalam...",Dalam kemiskinan ada harta kekayaan jiwa
2996,Pantun_19175_1,Beginilah semuanyaSakit bermula mati bersebabH...,maut,tanah berbaris,jodoh pertemuanSemuanya di tangan TuhanJadi se...


In [44]:
import csv
import json

# Define the input and output file paths
input_file = 'pantun_dataset2.txt'
output_csv_file = 'structured_pantun.csv'
output_jsonl_file = 'pantun_finetune_dataset3.jsonl'

# Open the input file and parse its content
with open(input_file, 'r', encoding='utf-8') as f:
    pantuns = f.read().strip().split('\n\n')  # Split by double newline to get each pantun stanza

# Create the CSV file to save the structured data
with open(output_csv_file, 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['Pantun_ID', 'Line_1', 'Line_2', 'Line_3', 'Line_4']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # Iterate over each pantun and write to CSV
    for i, pantun in enumerate(pantuns):
        lines = pantun.split('\n')
        if len(lines) == 4:  # Ensure it's a complete pantun with 4 lines
            writer.writerow({
                'Pantun_ID': f'Pantun_{i + 1}',
                'Line_1': lines[0],
                'Line_2': lines[1],
                'Line_3': lines[2],
                'Line_4': lines[3]
            })

print("Pantun structuring completed. Data saved to structured_pantun.csv.")

# Create the JSONL file for fine-tuning
with open(output_jsonl_file, 'w', encoding='utf-8') as jsonlfile:
    with open(output_csv_file, 'r', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            # Create the JSONL structure for GPT-4o-mini fine-tuning
            jsonl_data = {
                "messages": [
                    {"role": "system", "content": "A pantun typically consists of four lines where the end of the first line rhymes with the end of the third line, and the end of the second line rhymes with the end of the fourth line. The lines should have a rhythmic flow and convey a meaningful message."},
                    {"role": "user", "content": row['Line_1']},
                    {"role": "assistant", "content": f"{row['Line_2']}; {row['Line_3']}; {row['Line_4']}"}
                ]
            }
            # Write the data in JSONL format
            json.dump(jsonl_data, jsonlfile)
            jsonlfile.write('\n')

print("Pantun fine-tuning dataset saved to pantun_finetune_dataset3.jsonl.")

Pantun structuring completed. Data saved to structured_pantun.csv.
Pantun fine-tuning dataset saved to pantun_finetune_dataset3.jsonl.


In [56]:
#mari buat pantun
import openai
from dotenv import load_dotenv
import os
import tiktoken
import time

# Load environment variables from a .env file (optional, if you use one)
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

# Function to use the fine-tuned model, ensuring only 4-line stanza pantun is returned
def generate_pantun(prompt, model="ft:gpt-4o-2024-08-06:personal:pantun-eksperimen3:AI95NfUz", temperature=0.7, max_tokens=150):
    response = openai.ChatCompletion.create(
        model=model,
        messages=[
            {"role": "system", "content": "Pantunbot is a pantun generator able to continue a pantun given a pantun line. A pantun typically consists of four lines where the end of the first line rhymes with the end of the third line, and the end of the second line rhymes with the end of the fourth line. The lines should have a rhythmic flow and convey a meaningful message."},
            {"role": "user", "content": prompt}
        ],
        temperature=temperature,
        max_tokens=max_tokens
    )
    pantun = response['choices'][0]['message']['content'].strip()
    # Ensure the generated response contains exactly 4 lines (4 stanza pantun)
    lines = pantun.split("\n")
    return '\n'.join(lines[:4])

# Example usage of the fine-tuned model
prompt = "kurang pahala banyakkan dosa"
response = generate_pantun(prompt)
print("Generated pantun:", response)

Generated pantun: di bulan mulia banyakkan ibadah;; curiga jangan tanpa bicara,; aduannya palsu tiada sudah.
