In [1]:
import os
from pathlib import Path

# Root folder of the dataset. Set the PATENTMATCH_DATASET_ROOT environment
# variable to point at another location; when it is unset the original
# machine-specific path is used, so existing behaviour is unchanged.
patentmatch_dataset_root = Path(
    os.environ.get(
        'PATENTMATCH_DATASET_ROOT',
        r'C:\workspace_or_private\repos\runi-thesis-project\hidrive',
    )
)

# Sub-folder of the dataset root named 'patentmatch_test'.
test_data_path = patentmatch_dataset_root / 'patentmatch_test'

In [2]:
# Location of the tab-separated test split inside the test data folder.
tsv_file = test_data_path.joinpath('patentmatch_test.tsv')

In [3]:
import pandas as pd

# read_table uses a tab as its default separator, so the TSV file is parsed
# exactly as with read_csv(..., sep='\t').
df = pd.read_table(tsv_file)

In [4]:
# Number of entries in the text_b column (missing values are counted too).
df['text_b'].size

1251940

In [13]:
# Serialise the rows returned by df.head() as JSON Lines:
# one JSON object per row, one row per line.
jsonl_var = (
    df.head()
      .to_json(lines=True, orient='records')
)

In [9]:
# Persist the current df as CSV inside the test data folder, without the index.
# NOTE(review): the file name says "no_claims", but no step that removes
# anything from df is visible in this notebook; confirm df really is the
# reduced frame before running this cell.
reduced_csv_file = test_data_path.joinpath('patentmatch_test_no_claims.csv')
df.to_csv(path_or_buf=reduced_csv_file, index=False)

In [8]:
import pandas as pd
import tiktoken
import statistics

# Example: Load your data frame (assumed already loaded in df)
# df = pd.read_csv("some_file.csv")

# Create an encoding object (adjust as needed for your model)
enc = tiktoken.get_encoding("cl100k_base")

# --- 1) Compute token counts and char counts for each column ---
#
# Every value is passed through str() first, so non-string cells are measured
# by their string representation.
#
# encode_ordinary() is used instead of encode(): encode() raises ValueError as
# soon as a text contains the literal spelling of a special token (for example
# "<|endoftext|>"), which would abort the whole run on a single row.
# encode_ordinary() tokenises such text as plain text and gives the same result
# as encode() for everything else.

# 'text'
token_counts_text = [len(enc.encode_ordinary(str(x))) for x in df['text']]
char_counts_text = df['text'].astype(str).apply(len)

# 'text_b'
token_counts_text_b = [len(enc.encode_ordinary(str(x))) for x in df['text_b']]
char_counts_text_b = df['text_b'].astype(str).apply(len)

# Combined: text + text_b
# The two columns are pooled into one long sequence of per-cell counts
# (concatenation, not an element-wise sum per row).
token_counts_all = token_counts_text + token_counts_text_b
char_counts_all = list(char_counts_text) + list(char_counts_text_b)

# --- 2) Helper function to compute statistics ---

def compute_stats(values, stat_label=""):
    """
    Summarise a collection of numbers with sum, mean, std and median.

    values: any iterable of numeric values (list, Series, generator, ...).
            It is copied into a list once, because four statistics are
            computed from it and a one-shot iterator would otherwise be
            exhausted after the first one.
    stat_label: prefix put in front of every key of the returned dict.

    Returns a dict with the keys "<stat_label>sum", "<stat_label>mean",
    "<stat_label>std" and "<stat_label>median".
    Raises statistics.StatisticsError when values is empty.

    Note: using population std (pstdev).
          If you want sample std, use statistics.stdev().
    """
    numbers = list(values)
    return {
        f"{stat_label}sum": sum(numbers),
        f"{stat_label}mean": statistics.mean(numbers),
        f"{stat_label}std": statistics.pstdev(numbers),
        f"{stat_label}median": statistics.median(numbers)
    }

# --- 3) Compute the statistics for every group of counts ---

# Token-level statistics: text, text_b, then both pooled together.
text_token_stats, text_b_token_stats, all_token_stats = (
    compute_stats(counts, "token_")
    for counts in (token_counts_text, token_counts_text_b, token_counts_all)
)

# Character-level statistics, in the same order.
text_char_stats, text_b_char_stats, all_char_stats = (
    compute_stats(counts, "char_")
    for counts in (char_counts_text, char_counts_text_b, char_counts_all)
)

# --- 4) Assemble one record per row of the summary table ---
# Each record starts with its label, followed by the token_* and char_* keys.
stats_data = [
    {"column": label, **token_stats, **char_stats}
    for label, token_stats, char_stats in (
        ("text", text_token_stats, text_char_stats),
        ("text_b", text_b_token_stats, text_b_char_stats),
        ("text + text_b", all_token_stats, all_char_stats),
    )
]
stats_df = pd.DataFrame(stats_data)

# --- 5) Show the summary table ---
print(stats_df)


          column  token_sum  token_mean   token_std  token_median   char_sum  \
0           text   44852194  120.553562  103.813916          91.0  235169004   
1         text_b   43253910  116.257701   98.774692          95.0  215112539   
2  text + text_b   88106104  118.405631  101.348399          93.0  450281543   

    char_mean    char_std  char_median  
0  632.086386  525.870946        486.0  
1  578.178693  469.818857        475.0  
2  605.132539  499.361107        479.0  


In [2]:
import os

# Show MY_VAR as seen by this kernel process; prints 'NA' when it is not set.
print(os.environ.get('MY_VAR', 'NA'))

NEW_VAL
