In [1]:

import pandas as pd
from dotenv import load_dotenv
from pathlib import Path
import os
import openai
from tqdm import tqdm
import tiktoken
import warnings

# Load the variables defined in the local secrets file, then read the OpenAI
# key from the environment (None if it is not set) for the client built later.
load_dotenv("../secrets/openai.env")
api_key = os.environ.get("OPENAI_API_KEY")

In [2]:
# Directories for the raw inputs and for the artefacts this notebook writes
path_raw = Path("../data/raw")
path_processed = Path("../data/processed")

# Read the training set; the bare name on the last line renders it as output
df_train = pd.read_csv(path_raw.joinpath("train.csv"))
df_train

Unnamed: 0,Id,GameRulesetName,agent1,agent2,Properties,Format,Time,Discrete,Realtime,Turns,...,DoLudeme,Trigger,PlayoutsPerSecond,MovesPerSecond,EnglishRules,LudRules,num_wins_agent1,num_draws_agent1,num_losses_agent1,utility_agent1
0,0,00Y,MCTS-ProgressiveHistory-0.1-MAST-false,MCTS-ProgressiveHistory-0.6-Random200-false,1,1,1,1,0,1,...,0,1,298.07,18877.17,Goal: Connect all three edge colors with a sin...,"(game ""00'Y'"" (players 2) (equipment { (board ...",4,0,11,-0.466667
1,1,00Y,MCTS-ProgressiveHistory-0.1-MAST-false,MCTS-UCB1GRAVE-0.6-NST-true,1,1,1,1,0,1,...,0,1,298.07,18877.17,Goal: Connect all three edge colors with a sin...,"(game ""00'Y'"" (players 2) (equipment { (board ...",5,0,10,-0.333333
2,2,00Y,MCTS-ProgressiveHistory-0.1-MAST-true,MCTS-UCB1-0.1-NST-false,1,1,1,1,0,1,...,0,1,298.07,18877.17,Goal: Connect all three edge colors with a sin...,"(game ""00'Y'"" (players 2) (equipment { (board ...",7,0,8,-0.066667
3,3,00Y,MCTS-ProgressiveHistory-0.1-MAST-true,MCTS-UCB1-0.6-NST-false,1,1,1,1,0,1,...,0,1,298.07,18877.17,Goal: Connect all three edge colors with a sin...,"(game ""00'Y'"" (players 2) (equipment { (board ...",5,0,10,-0.333333
4,4,00Y,MCTS-ProgressiveHistory-0.1-MAST-true,MCTS-UCB1GRAVE-1.41421356237-NST-false,1,1,1,1,0,1,...,0,1,298.07,18877.17,Goal: Connect all three edge colors with a sin...,"(game ""00'Y'"" (players 2) (equipment { (board ...",5,0,10,-0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
233229,233229,Zuz_Mel_7x7,MCTS-UCB1Tuned-1.41421356237-NST-false,MCTS-ProgressiveHistory-1.41421356237-Random20...,1,1,1,1,0,1,...,0,0,157.52,157174.58,7x7 board. 24 pieces per player. Pieces begin ...,"(game ""Zuz Mel (7x7)"" (players 2) (equipment {...",2,0,13,-0.733333
233230,233230,Zuz_Mel_7x7,MCTS-UCB1Tuned-1.41421356237-Random200-false,MCTS-UCB1-0.6-MAST-false,1,1,1,1,0,1,...,0,0,157.52,157174.58,7x7 board. 24 pieces per player. Pieces begin ...,"(game ""Zuz Mel (7x7)"" (players 2) (equipment {...",9,1,5,0.266667
233231,233231,Zuz_Mel_7x7,MCTS-UCB1Tuned-1.41421356237-Random200-false,MCTS-UCB1GRAVE-1.41421356237-NST-false,1,1,1,1,0,1,...,0,0,157.52,157174.58,7x7 board. 24 pieces per player. Pieces begin ...,"(game ""Zuz Mel (7x7)"" (players 2) (equipment {...",11,3,1,0.666667
233232,233232,Zuz_Mel_7x7,MCTS-UCB1Tuned-1.41421356237-Random200-false,MCTS-UCB1GRAVE-1.41421356237-NST-true,1,1,1,1,0,1,...,0,0,157.52,157174.58,7x7 board. 24 pieces per player. Pieces begin ...,"(game ""Zuz Mel (7x7)"" (players 2) (equipment {...",24,2,4,0.666667


In [3]:
# Split the column labels by dtype: 64-bit ints/floats vs. object columns
numerical_cols = df_train.select_dtypes(include=["int64", "float64"]).columns
categorical_cols = df_train.select_dtypes(include="object").columns

In [4]:
# Preview only the object-typed columns
df_train.loc[:, categorical_cols]

Unnamed: 0,GameRulesetName,agent1,agent2,EnglishRules,LudRules
0,00Y,MCTS-ProgressiveHistory-0.1-MAST-false,MCTS-ProgressiveHistory-0.6-Random200-false,Goal: Connect all three edge colors with a sin...,"(game ""00'Y'"" (players 2) (equipment { (board ..."
1,00Y,MCTS-ProgressiveHistory-0.1-MAST-false,MCTS-UCB1GRAVE-0.6-NST-true,Goal: Connect all three edge colors with a sin...,"(game ""00'Y'"" (players 2) (equipment { (board ..."
2,00Y,MCTS-ProgressiveHistory-0.1-MAST-true,MCTS-UCB1-0.1-NST-false,Goal: Connect all three edge colors with a sin...,"(game ""00'Y'"" (players 2) (equipment { (board ..."
3,00Y,MCTS-ProgressiveHistory-0.1-MAST-true,MCTS-UCB1-0.6-NST-false,Goal: Connect all three edge colors with a sin...,"(game ""00'Y'"" (players 2) (equipment { (board ..."
4,00Y,MCTS-ProgressiveHistory-0.1-MAST-true,MCTS-UCB1GRAVE-1.41421356237-NST-false,Goal: Connect all three edge colors with a sin...,"(game ""00'Y'"" (players 2) (equipment { (board ..."
...,...,...,...,...,...
233229,Zuz_Mel_7x7,MCTS-UCB1Tuned-1.41421356237-NST-false,MCTS-ProgressiveHistory-1.41421356237-Random20...,7x7 board. 24 pieces per player. Pieces begin ...,"(game ""Zuz Mel (7x7)"" (players 2) (equipment {..."
233230,Zuz_Mel_7x7,MCTS-UCB1Tuned-1.41421356237-Random200-false,MCTS-UCB1-0.6-MAST-false,7x7 board. 24 pieces per player. Pieces begin ...,"(game ""Zuz Mel (7x7)"" (players 2) (equipment {..."
233231,Zuz_Mel_7x7,MCTS-UCB1Tuned-1.41421356237-Random200-false,MCTS-UCB1GRAVE-1.41421356237-NST-false,7x7 board. 24 pieces per player. Pieces begin ...,"(game ""Zuz Mel (7x7)"" (players 2) (equipment {..."
233232,Zuz_Mel_7x7,MCTS-UCB1Tuned-1.41421356237-Random200-false,MCTS-UCB1GRAVE-1.41421356237-NST-true,7x7 board. 24 pieces per player. Pieces begin ...,"(game ""Zuz Mel (7x7)"" (players 2) (equipment {..."


In [5]:
# Build one labelled text per row out of the ruleset name and both rule
# descriptions; each piece carries its own " || "-separated label.
name_part = "GameRulesetName: " + df_train['GameRulesetName'].astype(str)
english_part = " || EnglishRules: " + df_train['EnglishRules'].astype(str)
lud_part = " || LudRules: " + df_train['LudRules'].astype(str)
df_train['combined_rules'] = name_part + english_part + lud_part

# Sanity check: show the first few combined strings
print(df_train['combined_rules'].head())


0    GameRulesetName: 00Y || EnglishRules: Goal: Co...
1    GameRulesetName: 00Y || EnglishRules: Goal: Co...
2    GameRulesetName: 00Y || EnglishRules: Goal: Co...
3    GameRulesetName: 00Y || EnglishRules: Goal: Co...
4    GameRulesetName: 00Y || EnglishRules: Goal: Co...
Name: combined_rules, dtype: object


In [6]:
# Number of distinct combined rule texts (each is rewritten once below)
df_train["combined_rules"].nunique()

1377

***
### Rewrite rules to English

In [7]:
# Shared OpenAI client used by the rewrite and embedding helpers below
client = openai.OpenAI(
    api_key=api_key,
)
client

<openai.OpenAI at 0x3206a1c90>

In [8]:
def num_tokens_from_string(string: str, encoding_name: str = "cl100k_base") -> int:
    """Count how many tokens `string` encodes to.

    Args:
        string: Text to tokenize.
        encoding_name: Name of the tiktoken encoding to look up.

    Returns:
        Length of the token sequence produced by the encoding.
    """
    tokens = tiktoken.get_encoding(encoding_name).encode(string)
    return len(tokens)

def rewrite_rules(text, model="gpt-4o-2024-08-06", max_tokens=8100):
    """Ask the chat model to restate a game's rules in plain English.

    Args:
        text: Rule description to rewrite; it is embedded verbatim in the prompt.
        model: Chat-completions model name passed to the API.
        max_tokens: Upper bound on the number of tokens the model may generate.

    Returns:
        The model's reply with surrounding whitespace stripped. An empty string
        is returned (with a warning) when the API reply carries no text.

    Warns:
        UserWarning: if the reply has no content, or if generation stopped
            because `max_tokens` was reached (the rewrite is then incomplete).
    """
    prompt = f"""The following text contains game rules described in Ludii's Game Description Language (GDL). 
    Rewrite these rules in clear, concise, and unambiguous English. Maintain all important information and be as complete as possible. 
    Remove any redundancies or technical GDL syntax, but ensure that no crucial details are lost. 
    The output should preserve all essential rules and game mechanics:

    {text}

    Rewritten rules (be thorough and unambiguous):"""

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system",
             "content": "You are a helpful assistant that rewrites game rules from Ludii's Game Description Language (GDL) into clear, complete, and unambiguous English."
            },
            {"role":
             "user", "content": prompt
            }
        ],
        max_tokens=max_tokens,
        temperature=0.0  # Set to 0 for maximum determinism
    )
    choice = response.choices[0]

    # A reply cut off by the token cap would otherwise be stored as if complete.
    if choice.finish_reason == "length":
        warnings.warn(
            f"Rewrite was truncated after reaching max_tokens={max_tokens}; "
            "the returned rules are incomplete."
        )

    # `content` can be None (e.g. a refusal); calling .strip() on it would
    # raise AttributeError and abort the whole rewriting loop.
    content = choice.message.content
    if content is None:
        warnings.warn("Model returned no content for the rewrite request.")
        return ""
    return content.strip()

# Token threshold above which a rewritten text triggers a warning.
# NOTE(review): presumably the input limit of the embedding model used later in
# this notebook - confirm. Texts over the limit are only reported, not truncated.
MAX_REWRITTEN_TOKENS = 8191

# Each distinct combined_rules text is sent to the model exactly once.
unique_rules = df_train['combined_rules'].unique()

# Collect results in a dict first: assigning through a boolean mask inside the
# loop re-scans the whole frame for every unique rule.
rewritten_by_rule = {}
for rule in tqdm(unique_rules, desc="Rewriting rules"):
    rewritten = rewrite_rules(rule)

    # Count the rewritten tokens once and reuse the value in the message.
    rewritten_tokens = num_tokens_from_string(rewritten)
    if rewritten_tokens > MAX_REWRITTEN_TOKENS:
        warnings.warn(
            f"Rewritten rules exceed token limit. Original token count: {num_tokens_from_string(rule)}"
            f", Rewritten token count: {rewritten_tokens}")

    rewritten_by_rule[rule] = rewritten

# Broadcast every rewrite to all rows sharing the same combined_rules text.
df_train['rewritten_rules'] = df_train['combined_rules'].map(rewritten_by_rule)

# Display a sample of original and rewritten rules
print(df_train[['combined_rules', 'rewritten_rules']].head())

Rewriting rules: 100%|██████████| 1377/1377 [2:21:11<00:00,  6.15s/it] 

                                      combined_rules  \
0  GameRulesetName: 00Y || EnglishRules: Goal: Co...   
1  GameRulesetName: 00Y || EnglishRules: Goal: Co...   
2  GameRulesetName: 00Y || EnglishRules: Goal: Co...   
3  GameRulesetName: 00Y || EnglishRules: Goal: Co...   
4  GameRulesetName: 00Y || EnglishRules: Goal: Co...   

                                     rewritten_rules  
0  **Game Name: 00Y**\n\n**Objective:**\nThe goal...  
1  **Game Name: 00Y**\n\n**Objective:**\nThe goal...  
2  **Game Name: 00Y**\n\n**Objective:**\nThe goal...  
3  **Game Name: 00Y**\n\n**Objective:**\nThe goal...  
4  **Game Name: 00Y**\n\n**Objective:**\nThe goal...  





***
### create embeddings

In [14]:
def get_embedding(text, model="text-embedding-3-large", dimensions=512):
    """Request an embedding vector for a single text.

    Newlines in `text` are replaced by spaces before the request is sent.

    Args:
        text: String to embed.
        model: Embedding model name passed to the API.
        dimensions: Value forwarded as the API's `dimensions` argument.

    Returns:
        The `embedding` attribute of the first item in the API response.
    """
    single_line = " ".join(text.split("\n"))
    response = client.embeddings.create(
        input=[single_line],
        model=model,
        encoding_format="float",
        dimensions=dimensions,
    )
    first_item = response.data[0]
    return first_item.embedding

In [None]:
# NOTE: the draft that used to live in this cell was never executed (it has no
# execution count) and duplicated the embedding loop of the next cell, which
# rebuilds `unique_rewritten_rules`, `embedding_dict` and `df_embeddings` from
# scratch. Running both would send every rewritten rule to the embeddings API
# twice, so the embeddings are fetched only once, in the next cell.


In [15]:
# Number of dimensions requested from the embedding API. The same constant
# sizes the `embedding_<i>` columns below so the two cannot drift apart.
EMBEDDING_DIM = 512

# Create a dictionary to store embeddings for each rewritten rule
embedding_dict = {}

unique_rewritten_rules = df_train['rewritten_rules'].unique()

# One API call per distinct rewritten text; rows sharing a text share a vector.
for rule in tqdm(unique_rewritten_rules, desc="Creating embeddings"):
    embedding_dict[rule] = get_embedding(rule, dimensions=EMBEDDING_DIM)

# Create a new dataframe with Id, rewritten_rules, and individual embedding dimensions
df_embeddings = df_train[['Id', 'rewritten_rules']].copy()
embedding_df = pd.DataFrame(
    df_embeddings['rewritten_rules'].map(embedding_dict).tolist(),
    # Reuse the source index so the column-wise concat below lines rows up
    # even if df_train no longer has a default 0..n-1 index.
    index=df_embeddings.index,
    columns=[f'embedding_{i}' for i in range(1, EMBEDDING_DIM + 1)],
)
df_embeddings = pd.concat([df_embeddings, embedding_df], axis=1)

Creating embeddings: 100%|██████████| 1377/1377 [14:03<00:00,  1.63it/s]


In [19]:
# Create the output directory if needed; both writers fail when it is missing.
path_processed.mkdir(parents=True, exist_ok=True)

# Persist the embedding table as CSV and as Parquet, without the index column.
df_embeddings.to_csv(path_processed / 'embeddings.csv', index=False)
df_embeddings.to_parquet(path_processed / 'embeddings.parquet', index=False)


***