In [50]:
from pathlib import Path
import pandas as pd
import os
import re

# Main function definitions

In [51]:
def to_sketch_format(df):
    """Render every row of ``df`` as a one-line ``<doc ...>`` string.

    Each row becomes::

        <doc text_id="ID"  col="value" ... > "TEXT" </doc>

    ``ID`` is taken from the ``id`` column and ``TEXT`` from the ``text``
    column. Every other column, in frame order, is emitted as a
    ``col="value"`` attribute (this includes a ``sketch`` column left over
    from a previous call on the same frame).

    Before rendering, every column is converted to ``str`` and cleaned:
    runs of whitespace are collapsed to one space, then double quotes,
    single quotes, ``<``, ``>`` and backslashes are removed so that values
    cannot break the surrounding markup.

    Missing values are not special-cased: they appear as whatever text
    ``astype(str)`` produces for them, after cleaning.

    Side effect:
        ``df`` is modified in place; each column is replaced by its cleaned
        string form. Callers in this notebook read the cleaned columns
        afterwards, so this is kept deliberately.

    Args:
        df: DataFrame with at least ``id`` and ``text`` columns. Column
            labels must be strings because they are concatenated into the
            attribute list.

    Returns:
        pandas.Series of strings aligned with ``df.index``.
    """
    for col in df.columns:
        # Convert first, clean second: calling .str.replace on an object
        # column with non-string cells would turn those cells into NaN, and
        # non-object columns (e.g. nullable ints rendering as <NA>) would
        # otherwise skip cleaning altogether.
        values = df[col].astype(str)
        values = values.str.replace(r'\s+', ' ', regex=True)
        # One pass removes every character that is unsafe inside the tag,
        # including single backslashes.
        values = values.str.replace(r"""["'<>\\]""", '', regex=True)
        df[col] = values

    attributes = ""
    for col in df.columns:
        if col not in ('id', 'text'):
            attributes += " " + col + "=\"" + df[col] + "\""

    return "<doc text_id=\"" + df['id'] + "\" " + attributes + " > \"" + df["text"] + "\" </doc>"

def post_process(data, file):
    """Write ``data['sketch']`` to ``file``, one entry per line.

    The column is first dumped with ``to_csv`` (no index, no header). The
    CSV writer wraps any field containing a double quote in quotes and
    doubles the embedded ones, so the file is read back and every run of
    consecutive double quotes is collapsed to a single one before the text
    is written again.

    NOTE(review): the collapse leaves one wrapping quote at the start and
    end of each quoted line, and it would also shrink an empty attribute
    value (two adjacent quotes) to a single quote. Confirm this is what
    the downstream consumer expects.

    Args:
        data: DataFrame with a ``sketch`` column.
        file: Path of the output file; it is created or overwritten.
    """
    # Use UTF-8 for all three steps so the round trip does not depend on the
    # platform's default text encoding.
    data['sketch'].to_csv(file, index=False, header=False, encoding="utf-8")

    with open(file, "r", encoding="utf-8") as f:
        text = f.read()

    text = re.sub(r'\"+', "\"", text)

    with open(file, "w", encoding="utf-8") as f:
        f.write(text)

In [52]:
# Smoke test: a two-row sample frame (the second row has no department) is
# pushed through both helpers; the result is written to data.txt in the
# current working directory.
df = pd.DataFrame(
    dict(
        id=[1, 2],
        source=["example_source 1", "example_source 2"],
        department=['dep', None],
        date=["2023-10-01", "2024-10-01"],
        title=["example title 1", "example title 2"],
        text=["example text 1", "example text 2"],
    )
)

file = "data.txt"
df['sketch'] = to_sketch_format(df)
post_process(df, file)

# Korpus to sketch format

In [53]:

# Load every CSV in the corpus folder, keep a fixed set of columns, and write
# a per-file completeness report next to the data.
drive_path = Path("../Korpus")
data = []  # one DataFrame per CSV file, in sorted file-name order
report_file = drive_path / "report.txt"
total_rows = 0  # running row count across all CSV files
with open(report_file, "w", encoding="utf-8") as report:
    # os.listdir returns entries in arbitrary order; sorting keeps the report
    # and the order of frames in `data` reproducible between runs.
    for file in sorted(os.listdir(drive_path)):
        if not file.endswith(".csv"):
            continue
        report.write(f"---- {file} -----\n")
        df = pd.read_csv(drive_path / file)
        if 'department' not in df.columns:
            df['department'] = 'Not specified'
        # .copy() gives an independent frame, so the column assignments made
        # by later cells do not operate on a slice of the frame read above.
        df = df[['id', 'source', 'title', 'author', 'text', 'date', 'department', "eco_id"]].copy()

        report.write(f"Found: {len(df)}\n")
        total_rows += len(df)
        # Share of non-null cells per column, in percent.
        present_percentage = (1 - df.isnull().mean()) * 100
        report.write(f"Present percentage for {file}:\n")
        for col, pct in present_percentage.items():
            report.write(f"{col:15}: {pct:6.2f}%\n")
        data.append(df)
    report.write(f"\nTotal docs across all files: {total_rows}\n")


In [54]:
import re

def process_dfs(dfs):
    """Convert each frame to sketch format and write it to its own file.

    For every non-empty frame a ``sketch`` column is added via
    ``to_sketch_format`` and written with ``post_process`` to
    ``sketch_<source>.txt`` in the current working directory, where
    ``<source>`` is the ``source`` value of the frame's first row.

    Args:
        dfs: Iterable of DataFrames accepted by ``to_sketch_format`` that
            also have a ``source`` column. The frames are modified in place.

    Returns:
        Total number of rows written across all frames.
    """
    corpus = 0

    for df in dfs:
        if df.empty:
            # There is no first row to name the file after and nothing to write.
            continue
        df['sketch'] = to_sketch_format(df)
        # Positional access: the first row is not guaranteed to carry index
        # label 0 (e.g. after filtering), so df['source'][0] could raise.
        file = "sketch_" + str(df['source'].iloc[0]) + ".txt"
        corpus += len(df)
        post_process(df, file)
    return corpus

# Convert all loaded corpus frames and report how many rows were written.
total_corpus = process_dfs(data)
print(f"Total corpus size: {total_corpus}")

Total corpus size: 19986


In [55]:
# Bundle every sketch*.txt file in the working directory into the
# gzip-compressed archive sketch.tar.gz (verbose: lists each file added).
!tar -czvf sketch.tar.gz sketch*.txt

a sketch_DoRzeczy.txt
a sketch_Gazeta Polska Codziennie.txt
a sketch_Newsweek.txt
a sketch_POLITYKA.txt
a sketch_Rzeczpospolita.txt
a sketch_WPROST.txt
a sketch_Wyborcza.txt
a sketch_wPolityce.txt


In [56]:
# Delete the sketch*.txt files from the working directory.
# NOTE(review): this also matches any sketch*.txt file that did not come from
# this notebook run -- confirm the working directory holds nothing else.
! rm sketch*.txt

# "Global ociepl" and "zmian klimat" articles to sketch format

In [57]:
# List the files in the results directory that the following cells read from.
!ls ../UFA/files/results/

global ociepl_articles.csv       zmian klimat_articles.csv
global ociepl_s_60_w_158.csv     zmian klimat_plot_s_78_w_205.csv


In [None]:
# Read the two article result tables, then store eco_id as a nullable
# 32-bit integer column in each of them.
df_gc = pd.read_csv("../UFA/files/results/global ociepl_articles.csv")
df_zk = pd.read_csv("../UFA/files/results/zmian klimat_articles.csv")
for df in (df_zk, df_gc):
    eco_ids = df['eco_id']
    df['eco_id'] = eco_ids.astype('Int32')

Index(['text', 'id', 'date', 'source', 'collocations', 'eco_id'], dtype='object') Index(['text', 'id', 'date', 'source', 'collocations', 'eco_id'], dtype='object')


In [65]:
# Add the rendered sketch line to each article table.
for articles in (df_zk, df_gc):
    articles['sketch'] = to_sketch_format(articles)

In [66]:
# Write one sketch file per article table.
for articles, out_path in (
    (df_gc, "global_ocieplenie_sketch.txt"),
    (df_zk, "zmiana_klimat_sketch.txt"),
):
    post_process(articles, out_path)