# Generate synthetic CEFR data

As a first step, we use LLMs to generate synthetic text samples in the [6 CEFR language levels from A1 to C2](https://www.coe.int/en/web/common-european-framework-reference-languages/level-descriptions). To generate a wide variety of samples, we use the Claude Haiku and Sonnet, GPT-4o and GPT-4o-mini, and Phi-3 and Gemma-2 models.

From [our text simplification project](https://github.com/machinelearningZH/simply-simplify-language) we know that LLMs can, to some degree, be steered towards the CEFR levels as well as towards Einfache Sprache or Leichte Sprache. We therefore assume that such text samples were part of their training data. We can use this to our advantage and create a reference dataset that maps our understandability scores to CEFR levels. We acknowledge that this is an educated guess and must not be considered ground truth.

**Imports**

In [2]:
import pandas as pd
import numpy as np
import os
import glob
import re
from tqdm.notebook import tqdm
from dotenv import load_dotenv
from openai import OpenAI
from anthropic import Anthropic

# Silence pandas' chained-assignment (SettingWithCopy) warning for this notebook.
pd.options.mode.chained_assignment = None
# Raise the display limits so that up to 500 rows / sequence items are shown.
pd.options.display.max_rows = 500
pd.options.display.max_seq_items = 500

**Constants and functions**

In [3]:
# Load variables from a .env file into the process environment.
load_dotenv()

# API keys are read from the environment; os.getenv yields None for an unset key.
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [4]:
from utils_prompts import (
    BASE_PROMPT_SITUATIONS,
    BASE_PROMPT_SWISS,
    BASE_PROMPT_TOPICS,
    MODIFIERS_SITUATIONS,
    MODIFIERS_SWISS,
    MODIFIERS_TOPICS,
)

# System prompt shared by all model calls. It is written in German: it tells the
# model that it is an expert in the German language and in the CEFR levels A1 to
# C2, and that it always writes in German.
# The literal previously opened with four quotes, which put a stray '"' at the
# start of the prompt sent to the models.
SYSTEM_MESSAGE = """Du bist ein Experte in der deutschen Sprache. Du kennst dich exzellent mit den CEFR-Sprachniveaus von A1 bis C2 aus. Du kannst Texte sehr gut in diesen verschiedenen Niveaus schreiben. Du schreibst immer auf Deutsch."""

Instantiate client to use with [LM Studio Server](https://lmstudio.ai/).

In [16]:
# OpenAI-compatible client pointed at the LM Studio server on localhost:1234.
# The api_key is a fixed placeholder string — presumably not validated by the
# local server (TODO confirm).
lmstudio_client = OpenAI(base_url="http://localhost:1234/v1", api_key="lm-studio")


def call_lmstudio(
    prompt, temperature=0.5, model_id="bartowski/Phi-3-medium-4k-instruct-GGUF"
):
    """Send a prompt to a model served by LM Studio and return the reply text.

    Args:
        prompt: User message to send. The shared SYSTEM_MESSAGE is sent as the
            system message.
        temperature: Sampling temperature passed to the chat completion call.
        model_id: Identifier of the model to request. Defaults to the Phi-3
            model that was previously hard-coded, so existing calls are
            unaffected; pass another id to generate with a different model
            without editing this function.

    Returns:
        The content of the first completion choice, or None if the call (or
        reading the reply) raised an exception. The error is printed so a
        single failure does not abort a long generation loop.
    """
    try:
        completion = lmstudio_client.chat.completions.create(
            model=model_id,
            messages=[
                {"role": "system", "content": SYSTEM_MESSAGE},
                {"role": "user", "content": prompt},
            ],
            temperature=temperature,
        )

        return completion.choices[0].message.content
    except Exception as e:
        # Deliberately best-effort: report the problem and signal failure with None.
        print(f"Error: {e}")
        return None

Instantiate clients for Anthropic and OpenAI APIs.

In [5]:
# Client for the Anthropic API, created with the key read from the environment.
anthropic_client = Anthropic(api_key=ANTHROPIC_API_KEY)

# Anthropic model identifiers used for sample generation.
HAIKU = "claude-3-haiku-20240307"
SONNET = "claude-3-5-sonnet-20240620"


def call_anthropic(
    prompt, model_id="claude-3-haiku-20240307", temperature=0.5, max_tokens=4096
):
    """Ask an Anthropic model to answer ``prompt`` and return the reply text.

    The shared SYSTEM_MESSAGE is passed as the system prompt and ``prompt`` as
    the single user turn. ``model_id``, ``temperature`` and ``max_tokens`` are
    forwarded unchanged to the messages API.

    Returns the text of the first content block of the reply. If anything
    raises along the way, the error is printed and None is returned instead.
    """
    try:
        create_message = anthropic_client.messages.create
        user_turn = {"role": "user", "content": prompt}
        reply = create_message(
            model=model_id,
            max_tokens=max_tokens,
            temperature=temperature,
            system=SYSTEM_MESSAGE,
            messages=[user_turn],
        )
        first_block = reply.content[0]
        return first_block.text
    except Exception as err:
        print(f"Error: {err}")
        return None


# Client for the OpenAI API. No key is passed explicitly — presumably the client
# picks up OPENAI_API_KEY from the environment (TODO confirm).
openai_client = OpenAI()

# OpenAI model identifiers used for sample generation.
GPT4O_MINI = "gpt-4o-mini"
GPT4O = "gpt-4o"


def call_openai(prompt, model_id=GPT4O_MINI, temperature=0.5, max_tokens=4096):
    """Ask an OpenAI chat model to answer ``prompt`` and return the reply text.

    The conversation consists of the shared SYSTEM_MESSAGE followed by
    ``prompt`` as the user message. ``model_id``, ``temperature`` and
    ``max_tokens`` are forwarded unchanged to the chat completions API.

    Returns the message content of the first choice. If anything raises along
    the way, the error is printed and None is returned instead.
    """
    try:
        create_completion = openai_client.chat.completions.create
        conversation = [
            {"role": "system", "content": SYSTEM_MESSAGE},
            {"role": "user", "content": prompt},
        ]
        response = create_completion(
            model=model_id,
            temperature=temperature,
            max_tokens=max_tokens,
            messages=conversation,
        )
        first_choice = response.choices[0]
        return first_choice.message.content
    except Exception as err:
        print(f"Error: {err}")
        return None

In [6]:
def parse_response(response):
    """Extract the A1, A2, B1, B2, C1, C2 text spans from the response.

    For every level the text between the opening tag (e.g. ``<A1>``) and the
    next ``<`` character is captured; only the first occurrence per level is
    used and surrounding whitespace is stripped.

    Args:
        response: Raw model reply. May be None (or any non-string) when the
            API call failed; this is treated like an unparseable reply instead
            of raising, so one failed call does not abort a generation loop.

    Returns:
        A one-row DataFrame with six columns (labelled 0-5) in the order
        A1, A2, B1, B2, C1, C2. If the response is not a string or any of the
        six tags is missing, all six cells are None.
    """
    levels = ("A1", "A2", "B1", "B2", "C1", "C2")
    empty_row = [None] * len(levels)

    # The API wrappers return None on error; re.findall would raise on that.
    if not isinstance(response, str):
        return pd.DataFrame(empty_row).T

    matches = [
        re.findall(rf"<{level}>(.*?)<", response, re.DOTALL) for level in levels
    ]
    # A sample is only usable when every level was found; otherwise blank the row.
    if all(matches):
        data = [found[0].strip() for found in matches]
    else:
        data = empty_row
    return pd.DataFrame(data).T

## Create synthetic data with LM Studio

In [None]:
# Request one set of six CEFR-level texts per situation modifier from the
# LM Studio model and keep the parsed one-row frame for each.
tmp_results = []
for modifier in tqdm(MODIFIERS_SITUATIONS):
    print(modifier)
    response = call_lmstudio(BASE_PROMPT_SITUATIONS.format(prompt=modifier))
    parsed = parse_response(response)
    parsed["modifier"] = modifier
    tmp_results.append(parsed)

# Stack the rows and label the columns.
data = pd.concat(tmp_results).reset_index(drop=True)
data.columns = ["A1", "A2", "B1", "B2", "C1", "C2", "modifier"]

# Discard rows with an empty or missing level as well as exact duplicates.
data = data.replace("", None).dropna().drop_duplicates().reset_index(drop=True)

In [None]:
# Persist the generated samples as a Parquet file.
# NOTE(review): the combine step further down reads from "_input/_synthetic-data/",
# while this writes to "_input/" — presumably the files are moved by hand; confirm.
data.to_parquet("_input/phi34kit_situations.parq")

## Create synthetic data with Claude models

In [None]:
# Claude model for this run: the API identifier and the short name stored in the data.
model = HAIKU
model_name = "haiku"

# Request one set of six CEFR-level texts per situation modifier.
tmp_results = []
for modifier in tqdm(MODIFIERS_SITUATIONS):
    print(modifier)
    response = call_anthropic(
        BASE_PROMPT_SITUATIONS.format(prompt=modifier), model_id=model
    )
    parsed = parse_response(response)
    parsed["modifier"] = modifier
    tmp_results.append(parsed)

# Stack the rows, label the columns and record which model produced them.
data = pd.concat(tmp_results).reset_index(drop=True)
data.columns = ["A1", "A2", "B1", "B2", "C1", "C2", "modifier"]
data = data.assign(model=model_name, model_id=model)

In [46]:
# Persist the generated samples as a Parquet file.
# NOTE(review): the combine step further down reads from "_input/_synthetic-data/",
# while this writes to "_input/" — presumably the files are moved by hand; confirm.
data.to_parquet("_input/haiku_situations.parq")

## Create synthetic data with GPT-4o models

In [None]:
# Request one set of six CEFR-level texts per situation modifier from GPT-4o-mini.
tmp_results = []
for modifier in tqdm(MODIFIERS_SITUATIONS):
    print(modifier)
    response = call_openai(
        BASE_PROMPT_SITUATIONS.format(prompt=modifier), model_id=GPT4O_MINI
    )
    parsed = parse_response(response)
    parsed["modifier"] = modifier
    tmp_results.append(parsed)

# Stack the rows, label the columns and record which model produced them.
data = pd.concat(tmp_results).reset_index(drop=True)
data.columns = ["A1", "A2", "B1", "B2", "C1", "C2", "modifier"]
data = data.assign(model="GPT-4o-mini", model_id=GPT4O_MINI)

# GPT-4o sometimes gets the tags wrong. Those rows end up empty or missing and
# are dropped here, together with exact duplicates.
data = data.replace("", None).dropna().drop_duplicates().reset_index(drop=True)

In [35]:
# Persist the generated samples as a Parquet file.
# NOTE(review): the combine step further down reads from "_input/_synthetic-data/",
# while this writes to "_input/" — presumably the files are moved by hand; confirm.
data.to_parquet("_input/gpt4o-mini_situations.parq")

# Combine synthetic data into the final dataset

In [7]:
# All synthetic sample files, in alphabetical order.
file_paths = sorted(glob.glob("_input/_synthetic-data/*.parq"))
file_paths

['_input/_synthetic-data/gemma2it9b_situations.parq',
 '_input/_synthetic-data/gemma2it9b_swiss.parq',
 '_input/_synthetic-data/gemma2it9b_topics.parq',
 '_input/_synthetic-data/gpt4o-mini_situations.parq',
 '_input/_synthetic-data/gpt4o-mini_swiss.parq',
 '_input/_synthetic-data/gpt4o-mini_topics.parq',
 '_input/_synthetic-data/gpt4o_situations.parq',
 '_input/_synthetic-data/gpt4o_swiss.parq',
 '_input/_synthetic-data/gpt4o_topics.parq',
 '_input/_synthetic-data/haiku_situations.parq',
 '_input/_synthetic-data/haiku_swiss.parq',
 '_input/_synthetic-data/haiku_topics.parq',
 '_input/_synthetic-data/phi34kit_situations.parq',
 '_input/_synthetic-data/phi34kit_swiss.parq',
 '_input/_synthetic-data/phi34kit_topics.parq',
 '_input/_synthetic-data/sonnet_situations.parq',
 '_input/_synthetic-data/sonnet_swiss.parq',
 '_input/_synthetic-data/sonnet_topics.parq']

Gemma

In [8]:
# Load the Gemma sample files and tag every row with the model and topic
# encoded in the file name ("<model>_<topic>.parq").
gemmas = [x for x in file_paths if "gemma" in x]
frames = []
for path in gemmas:
    # os.path.basename also copes with backslash-separated paths as returned by
    # glob on Windows, where splitting on "/" would mis-parse the file name.
    file_name = os.path.basename(path)
    model = file_name.split("_")[0]
    topic = file_name.split("_")[1].split(".")[0]
    data = pd.read_parquet(path)
    data["model"] = model
    data["topic"] = topic
    frames.append(data)

df_gemma = pd.concat(frames).reset_index(drop=True)

# Drop additional topics for now.
df_gemma = df_gemma[df_gemma.topic.isin(["situations", "swiss", "topics"])]
df_gemma.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   A1        120 non-null    object
 1   A2        120 non-null    object
 2   B1        120 non-null    object
 3   B2        120 non-null    object
 4   C1        120 non-null    object
 5   C2        120 non-null    object
 6   model     120 non-null    object
 7   topic     120 non-null    object
 8   modifier  50 non-null     object
dtypes: object(9)
memory usage: 8.6+ KB


Phi-3

In [9]:
# Load the Phi-3 sample files and tag every row with the model and topic
# encoded in the file name ("<model>_<topic>.parq").
phis = [x for x in file_paths if "phi" in x]
frames = []
for path in phis:
    # os.path.basename also copes with backslash-separated paths as returned by
    # glob on Windows, where splitting on "/" would mis-parse the file name.
    file_name = os.path.basename(path)
    model = file_name.split("_")[0]
    topic = file_name.split("_")[1].split(".")[0]
    data = pd.read_parquet(path)
    data["model"] = model
    data["topic"] = topic
    frames.append(data)

df_phi = pd.concat(frames).reset_index(drop=True)
df_phi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   A1        120 non-null    object
 1   A2        120 non-null    object
 2   B1        120 non-null    object
 3   B2        120 non-null    object
 4   C1        120 non-null    object
 5   C2        120 non-null    object
 6   modifier  120 non-null    object
 7   model     120 non-null    object
 8   topic     120 non-null    object
dtypes: object(9)
memory usage: 8.6+ KB


GPT-4o

In [10]:
# Load the GPT-4o sample files. The trailing underscore in the filter keeps the
# "gpt4o-mini" files out. Every row is tagged with the topic from the file name.
gpt4os = [x for x in file_paths if "gpt4o_" in x]
frames = []
for path in gpt4os:
    # os.path.basename also copes with backslash-separated paths as returned by
    # glob on Windows, where splitting on "/" would mis-parse the file name.
    file_name = os.path.basename(path)
    topic = file_name.split("_")[1].split(".")[0]
    data = pd.read_parquet(path)
    data["topic"] = topic
    frames.append(data)

df_gpt4o = pd.concat(frames).reset_index(drop=True)
df_gpt4o.drop_duplicates(inplace=True)
df_gpt4o.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   A1        120 non-null    object
 1   A2        120 non-null    object
 2   B1        120 non-null    object
 3   B2        120 non-null    object
 4   C1        120 non-null    object
 5   C2        120 non-null    object
 6   modifier  120 non-null    object
 7   model     120 non-null    object
 8   model_id  120 non-null    object
 9   topic     120 non-null    object
dtypes: object(10)
memory usage: 9.5+ KB


GPT-4o-mini

In [11]:
# Load the GPT-4o-mini sample files and tag every row with the topic encoded in
# the file name ("<model>_<topic>.parq").
gpt4minis = [x for x in file_paths if "gpt4o-mini" in x]
frames = []
for path in gpt4minis:
    # os.path.basename also copes with backslash-separated paths as returned by
    # glob on Windows, where splitting on "/" would mis-parse the file name.
    file_name = os.path.basename(path)
    topic = file_name.split("_")[1].split(".")[0]
    data = pd.read_parquet(path)
    data["topic"] = topic
    frames.append(data)

df_gpt4mini = pd.concat(frames).reset_index(drop=True)
df_gpt4mini.drop_duplicates(inplace=True)
df_gpt4mini.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   A1        120 non-null    object
 1   A2        120 non-null    object
 2   B1        120 non-null    object
 3   B2        120 non-null    object
 4   C1        120 non-null    object
 5   C2        120 non-null    object
 6   modifier  120 non-null    object
 7   model     120 non-null    object
 8   model_id  120 non-null    object
 9   topic     120 non-null    object
dtypes: object(10)
memory usage: 9.5+ KB


Haiku

In [12]:
# Load the Haiku sample files and tag every row with the topic encoded in the
# file name ("<model>_<topic>.parq").
haikus = [x for x in file_paths if "haiku" in x]
frames = []
for path in haikus:
    # os.path.basename also copes with backslash-separated paths as returned by
    # glob on Windows, where splitting on "/" would mis-parse the file name.
    file_name = os.path.basename(path)
    topic = file_name.split("_")[1].split(".")[0]
    data = pd.read_parquet(path)
    data["topic"] = topic
    frames.append(data)

df_haiku = pd.concat(frames).reset_index(drop=True)
df_haiku.drop_duplicates(inplace=True)
df_haiku.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   A1        120 non-null    object
 1   A2        120 non-null    object
 2   B1        120 non-null    object
 3   B2        120 non-null    object
 4   C1        120 non-null    object
 5   C2        120 non-null    object
 6   modifier  120 non-null    object
 7   model     120 non-null    object
 8   model_id  120 non-null    object
 9   topic     120 non-null    object
dtypes: object(10)
memory usage: 9.5+ KB


Sonnet

In [13]:
# Load the Sonnet sample files and tag every row with the topic encoded in the
# file name ("<model>_<topic>.parq").
sonnets = [x for x in file_paths if "sonnet" in x]
frames = []
for path in sonnets:
    # os.path.basename also copes with backslash-separated paths as returned by
    # glob on Windows, where splitting on "/" would mis-parse the file name.
    file_name = os.path.basename(path)
    topic = file_name.split("_")[1].split(".")[0]
    data = pd.read_parquet(path)
    data["topic"] = topic
    frames.append(data)

df_sonnet = pd.concat(frames).reset_index(drop=True)
df_sonnet.drop_duplicates(inplace=True)
df_sonnet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   A1        120 non-null    object
 1   A2        120 non-null    object
 2   B1        120 non-null    object
 3   B2        120 non-null    object
 4   C1        120 non-null    object
 5   C2        120 non-null    object
 6   modifier  120 non-null    object
 7   model     120 non-null    object
 8   model_id  120 non-null    object
 9   topic     120 non-null    object
dtypes: object(10)
memory usage: 9.5+ KB


In [14]:
# Stack all per-model frames into one dataset with a fresh 0..n-1 index.
df = pd.concat(
    [df_gemma, df_gpt4o, df_gpt4mini, df_haiku, df_sonnet, df_phi],
    ignore_index=True,
)
# Keep only the six level texts plus model and topic, then save and summarise.
df = df.drop(columns=["modifier", "model_id"])
df.to_parquet("_input/cefr_synthetic.parq")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 720 entries, 0 to 719
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   A1      720 non-null    object
 1   A2      720 non-null    object
 2   B1      720 non-null    object
 3   B2      720 non-null    object
 4   C1      720 non-null    object
 5   C2      720 non-null    object
 6   model   720 non-null    object
 7   topic   720 non-null    object
dtypes: object(8)
memory usage: 45.1+ KB


In [15]:
# Number of sample rows contributed by each model.
df.model.value_counts()

model
gemma2it9b     120
GPT-4o         120
GPT-4o-mini    120
Haiku          120
Sonnet         120
phi34kit       120
Name: count, dtype: int64