In [None]:
import pandas as pd
import sqlite3
from sklearn.model_selection import train_test_split

In [None]:
def read_table(db_path, table_name):
    """Load every row of one SQLite table into a pandas DataFrame.

    Args:
        db_path: Path of the SQLite database file to open.
        table_name: Name of the table to read. It is interpolated into the
            SQL text unescaped (identifiers cannot be bound as query
            parameters), so it must come from a trusted source.

    Returns:
        A ``pandas.DataFrame`` holding the result of ``SELECT *`` on the table.

    Raises:
        Any error raised by ``pd.read_sql_query`` (e.g. for a table that does
        not exist) propagates to the caller; the connection is closed first.
    """
    conn = sqlite3.connect(db_path)
    try:
        return pd.read_sql_query(f"SELECT * FROM {table_name}", conn)
    finally:
        # Close even when the query fails so the database handle never leaks.
        conn.close()

In [None]:
def group_texts_by_heading(df):
    """Collect the article texts of each heading into a per-level list.

    Args:
        df: DataFrame with the columns ``heading``, ``level`` and
            ``article_text``. ``level`` is used as a 1-based position
            (1, 2 or 3) into the three-slot list.

    Returns:
        dict mapping each heading to a list of three strings, where slot
        ``level - 1`` holds the text of that level. Slots for which no row
        was seen stay as the empty string. If several rows share the same
        heading and level, the last one wins.
    """
    texts_by_heading = {}

    # Iterate over the columns of the frame that was passed in; the previous
    # version read the notebook-global `news` and ignored its argument.
    for heading, level, text in zip(df["heading"], df["level"], df["article_text"]):
        slots = texts_by_heading.setdefault(heading, ["", "", ""])
        slots[level - 1] = text

    return texts_by_heading

In [None]:
def filter_texts(dct):
    """Return the values of ``dct`` that contain no empty string.

    Args:
        dct: Mapping whose values are collections of texts (one per level).

    Returns:
        list of those values, in the mapping's iteration order, keeping only
        the ones where every level has a non-empty text. The original value
        objects are returned, not copies.
    """
    # Keep only texts adapted for all 3 levels.
    return [level_texts for level_texts in dct.values() if "" not in level_texts]

In [None]:
def get_train_dev_test(X, test_size=0.2, dev_size=0.25, random_state=42):
    """Split ``X`` into train, dev and test parts using two successive splits.

    Args:
        X: Collection of samples accepted by ``train_test_split``.
        test_size: ``test_size`` passed to the first split, which separates
            the test part from the rest of ``X``.
        dev_size: ``test_size`` passed to the second split, which separates
            the dev part from what remained after the first split. It is
            therefore relative to the remainder, not to ``X``: with the
            defaults, 0.25 of the remaining 0.8 is 0.2 of ``X``.
        random_state: Seed passed to both splits so the result is reproducible.

    Returns:
        Tuple ``(X_train, X_dev, X_test)``.
    """
    X_rest, X_test = train_test_split(
        X, test_size=test_size, random_state=random_state
    )
    X_train, X_dev = train_test_split(
        X_rest, test_size=dev_size, random_state=random_state
    )
    return X_train, X_dev, X_test

In [None]:
def write_datasets_to_files(X_train, X_dev, X_test, output_dir="."):
    """Write the three splits as tab-separated UTF-8 text files.

    Creates (or overwrites) ``train.txt``, ``validation.txt`` and
    ``test.txt``. Each file starts with the header line
    ``level 1<TAB>level 2<TAB>level 3`` followed by one line per sample.
    No newline is written after the last sample.

    Args:
        X_train: Iterable of rows for ``train.txt``; each row is an iterable
            of strings (one text per level).
        X_dev: Rows for ``validation.txt``.
        X_test: Rows for ``test.txt``.
        output_dir: Directory in which the files are created. Defaults to the
            current working directory.
    """
    import os

    datasets = {"train.txt": X_train, "validation.txt": X_dev, "test.txt": X_test}
    header = ("level 1", "level 2", "level 3")
    # A tab or line break inside a text would shift columns or split the row,
    # so each one is replaced by a single space. Clean texts are unchanged.
    separators_to_space = {ord("\t"): " ", ord("\n"): " ", ord("\r"): " "}

    for filename, data in datasets.items():
        with open(os.path.join(output_dir, filename), "w", encoding="utf-8") as f:
            f.write("\t".join(header) + "\n")
            f.write(
                "\n".join(
                    "\t".join(text.translate(separators_to_space) for text in row)
                    for row in data
                )
            )

In [None]:
# Load the whole `newsinlevels` table into a DataFrame.
# NOTE(review): the path looks like a Google Colab Drive mount — confirm, and
# adjust it when running the notebook in another environment.
news = read_table(
    db_path="/content/drive/MyDrive/english-proficiency/data/textsinlevels.db",
    table_name="newsinlevels"
    )

# Preview the first rows to check the loaded columns.
news.head()

Unnamed: 0,date,heading,article_text,level
0,15-01-2022 12:00,Test your English,We have a test for students of English. You ca...,1
1,14-01-2022 15:00,World’s best skater can go to the Olympics,Erin Jackson is an American athlete. She is th...,1
2,14-01-2022 07:00,Man with a pig heart,"David Bennet comes from Maryland, US. He is 57...",1
3,13-01-2022 15:00,New mirror in space,A rocket goes in space two weeks ago. It carri...,1
4,13-01-2022 07:00,Hainan towers,The Evergrande Group is a Chinese company. It ...,1


In [None]:
# Pipeline: group the article texts per heading, keep only the headings that
# have a text for all three levels, then split those into train/dev/test.
dct = group_texts_by_heading(news)
X = filter_texts(dct)
X_train, X_dev, X_test = get_train_dev_test(X)

In [None]:
# Every sample holds three texts (one per level), so the number of texts in a
# split is three times its number of samples.
print(
    f"Train: {len(X_train) * 3}\n"
    f"Validation: {len(X_dev) * 3}\n"
    f"Test: {len(X_test) * 3}"
)

Train: 5844
Validation: 1950
Test: 1950


In [None]:
# Inspect the first training sample: a list with the texts of one heading,
# ordered by level.
X_train[0]

['Prince Charles is the eldest son of Queen Elizabeth II. He is 71 years old. He is the future King of the UK. Prince Charles is ill. He has the coronavirus. His illness is not very strong. He does not have other health problems. His wife, Camilla, is healthy. Charles and Camilla are in Scotland now. They stay in quarantine . It is not clear when Charles becomes ill. He meets a lot of people. In the UK, there are 11,600 cases of coronavirus. Almost 600 people die. It is not safe for the Queen to stay in London. The Queen is 93 years old. She is healthy. However, she leaves London. She stays at Windsor Castle. It is 19 miles from London.',
 'Doctors diagnosed Prince Charles with the coronavirus. He has mild symptoms and apart from the coronavirus he is in good health. It is not clear when he became ill because he has attended many public events recently. Charles is 71 years old, and he is Queen Elizabeth II´s eldest son. He is the first in line to the British throne. His wife, Camilla, 

In [None]:
# Persist the three splits as tab-separated text files
# (train.txt, validation.txt, test.txt).
write_datasets_to_files(X_train, X_dev, X_test)