In [1]:
import os, re, glob, itertools
import pyarrow as pa, pyarrow.parquet as pq

FOLDER = r"C:\Users\thaip\Desktop\eRisk\erisk2024" 
OUT_PARQUET = "erisk24_all_trec.parquet"

DOC_RE = re.compile(r"<DOC>(.*?)</DOC>", flags=re.DOTALL | re.IGNORECASE)
def get_tag(block, tag):
    m = re.search(rf"<\s*{tag}\s*>(.*?)</\s*{tag}\s*>", block,
                  flags=re.DOTALL | re.IGNORECASE)
    return m.group(1).strip() if m else None

def parse_file(path):
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        content = f.read()
    rows = []
    for blk in DOC_RE.findall(content):
        rows.append((
            get_tag(blk, "DOCNO"),
            get_tag(blk, "PRE"),
            get_tag(blk, "TEXT"),
            get_tag(blk, "POST"),
            os.path.basename(path),
        ))
    return rows

files = glob.glob(os.path.join(FOLDER, "**", "*.trec"), recursive=True)

schema = pa.schema([
    ("docno", pa.string()),
    ("pre", pa.string()),
    ("text", pa.string()),
    ("post", pa.string()),
    ("source_file", pa.string()),
])

writer = None
BATCH = 400  
for i in range(0, len(files), BATCH):
    chunk = files[i:i+BATCH]
    batch_rows = list(itertools.chain.from_iterable(parse_file(p) for p in chunk))
    if not batch_rows: 
        continue
    table = pa.Table.from_pylist(
        [dict(docno=a, pre=b, text=c, post=d, source_file=e) for (a,b,c,d,e) in batch_rows],
        schema=schema,
    )
    if writer is None:
        writer = pq.ParquetWriter(OUT_PARQUET, schema)
    writer.write_table(table)
if writer:
    writer.close()

print("Wrote:", OUT_PARQUET)




Wrote: erisk24_all_trec.parquet


In [2]:
import pandas as pd

df = pd.read_parquet("erisk24_all_trec.parquet")

print(len(df))         
print(df.columns)       
print(df.head())        

15542200
Index(['docno', 'pre', 'text', 'post', 'source_file'], dtype='object')
   docno                                                pre  \
0  0_0_0                                                      
1  0_0_1                                                      
2  0_0_2  I met someone who understands me better than I...   
3  0_0_3  Our friendship thus far is completely platonic...   
4  0_0_4  I tried to break up with my sweet SO, saying t...   

                                                text  \
0       I guess it depends on what cheating entails.   
1  I met someone who understands me better than I...   
2  Our friendship thus far is completely platonic...   
3  I tried to break up with my sweet SO, saying t...   
4  It's been a year but it's noticeable he's stil...   

                                                post source_file  
0  I met someone who understands me better than I...    s_0.trec  
1  Our friendship thus far is completely platonic...    s_0.trec  
2  

In [3]:
df.to_csv("erisk24_clean.csv", index=False, encoding="utf-8")


In [4]:
PARQUET_PATH = "erisk24_all_trec.parquet"  
import duckdb
import pandas as pd

con = duckdb.connect()
con.execute(f"""
CREATE OR REPLACE VIEW docs AS 
SELECT * FROM read_parquet('{PARQUET_PATH}');
""")

overview = con.execute("""
SELECT 
  COUNT(*) AS n_rows,
  SUM(CASE WHEN length(trim(post))>0 THEN 1 ELSE 0 END)*1.0/COUNT(*) AS ratio_post_nonempty,
  AVG(length(text))  AS avg_len_text,
  MEDIAN(length(text)) AS med_len_text
FROM docs
""").df()
display(overview)

peek = con.execute("""
SELECT docno, pre, text, post, source_file
FROM docs
LIMIT 10
""").df()
display(peek)

src_count = con.execute("""
SELECT source_file, COUNT(*) AS n
FROM docs
GROUP BY source_file
ORDER BY n DESC
LIMIT 20
""").df()
display(src_count)

nulls = con.execute("""
SELECT
  SUM(CASE WHEN text IS NULL OR length(trim(text))=0 THEN 1 ELSE 0 END) AS empty_text,
  SUM(CASE WHEN post IS NULL OR length(trim(post))=0 THEN 1 ELSE 0 END) AS empty_post
FROM docs
""").df()
display(nulls)

dups = con.execute("""
SELECT docno, COUNT(*) AS c
FROM docs
GROUP BY docno
HAVING COUNT(*)>1
ORDER BY c DESC
LIMIT 20
""").df()
display(dups)


Unnamed: 0,n_rows,ratio_post_nonempty,avg_len_text,med_len_text
0,15542200,0.944578,93.319207,81.0


Unnamed: 0,docno,pre,text,post,source_file
0,0_0_0,,I guess it depends on what cheating entails.,I met someone who understands me better than I...,s_0.trec
1,0_0_1,,I met someone who understands me better than I...,Our friendship thus far is completely platonic...,s_0.trec
2,0_0_2,I met someone who understands me better than I...,Our friendship thus far is completely platonic...,"I tried to break up with my sweet SO, saying t...",s_0.trec
3,0_0_3,Our friendship thus far is completely platonic...,"I tried to break up with my sweet SO, saying t...",It's been a year but it's noticeable he's stil...,s_0.trec
4,0_0_4,"I tried to break up with my sweet SO, saying t...",It's been a year but it's noticeable he's stil...,"Although most wouldn't consider it cheating, I...",s_0.trec
5,0_0_5,It's been a year but it's noticeable he's stil...,"Although most wouldn't consider it cheating, I...",I will not tell my SO because he would read fu...,s_0.trec
6,0_0_6,"Although most wouldn't consider it cheating, I...",I will not tell my SO because he would read fu...,"I'm trying to work on breaking it off, but my ...",s_0.trec
7,0_0_7,I will not tell my SO because he would read fu...,"I'm trying to work on breaking it off, but my ...",**tl;dr** I'm confused. ),s_0.trec
8,0_0_8,"I'm trying to work on breaking it off, but my ...",**tl;dr** I'm confused. ),,s_0.trec
9,1_0_0,,Just found out my boyfriend of 3 years has bee...,I am completely heartbroken.,s_1.trec


Unnamed: 0,source_file,n
0,s_162.trec,44465
1,s_282.trec,36080
2,s_222.trec,34456
3,s_445.trec,33505
4,s_387.trec,33217
5,s_523.trec,32447
6,s_252.trec,32333
7,s_510.trec,32286
8,s_478.trec,32038
9,s_209.trec,32018


Unnamed: 0,empty_text,empty_post
0,338.0,861379.0


Unnamed: 0,docno,c


In [5]:
peek

Unnamed: 0,docno,pre,text,post,source_file
0,0_0_0,,I guess it depends on what cheating entails.,I met someone who understands me better than I...,s_0.trec
1,0_0_1,,I met someone who understands me better than I...,Our friendship thus far is completely platonic...,s_0.trec
2,0_0_2,I met someone who understands me better than I...,Our friendship thus far is completely platonic...,"I tried to break up with my sweet SO, saying t...",s_0.trec
3,0_0_3,Our friendship thus far is completely platonic...,"I tried to break up with my sweet SO, saying t...",It's been a year but it's noticeable he's stil...,s_0.trec
4,0_0_4,"I tried to break up with my sweet SO, saying t...",It's been a year but it's noticeable he's stil...,"Although most wouldn't consider it cheating, I...",s_0.trec
5,0_0_5,It's been a year but it's noticeable he's stil...,"Although most wouldn't consider it cheating, I...",I will not tell my SO because he would read fu...,s_0.trec
6,0_0_6,"Although most wouldn't consider it cheating, I...",I will not tell my SO because he would read fu...,"I'm trying to work on breaking it off, but my ...",s_0.trec
7,0_0_7,I will not tell my SO because he would read fu...,"I'm trying to work on breaking it off, but my ...",**tl;dr** I'm confused. ),s_0.trec
8,0_0_8,"I'm trying to work on breaking it off, but my ...",**tl;dr** I'm confused. ),,s_0.trec
9,1_0_0,,Just found out my boyfriend of 3 years has bee...,I am completely heartbroken.,s_1.trec


In [9]:
df1 = pd.read_csv("majority_erisk_2024.csv")
df1.head()

Unnamed: 0,1\t0\t98174_0_14\t0
0,1\t0\t506920_0_136\t0
1,1\t0\t347157_0_22\t1
2,1\t0\t363323_4_6\t0
3,1\t0\t496450_0_24\t1
4,1\t0\t305077_0_32\t0


In [10]:
import pandas as pd

IN_TSV  = "majority_erisk_2024.csv"           
OUT_CSV = "majority_erisk_24_clean.csv"      

df = pd.read_csv(IN_TSV, sep="\t", header=None,
                 names=["query", "_drop", "doc_id", "relevant"])

df = df.drop(columns=["_drop"])

df["query"] = pd.to_numeric(df["query"], errors="coerce").astype("Int64")
df["doc_id"] = df["doc_id"].astype(str).str.strip()

df["relevant"] = (
    df["relevant"].astype(str).str.strip().str.lower()
      .map({"1":1, "0":0, "true":1, "false":0, "t":1, "y":1, "yes":1, "n":0, "no":0})
      .astype("Int8")
)

df.to_csv(OUT_CSV, index=False, encoding="utf-8")
print("Wrote:", OUT_CSV)
print(df.head())


Wrote: majority_erisk_24_clean.csv
   query        doc_id  relevant
0      1    98174_0_14         0
1      1  506920_0_136         0
2      1   347157_0_22         1
3      1    363323_4_6         0
4      1   496450_0_24         1


In [11]:
import duckdb

PARQUET_PATH = "erisk24_all_trec.parquet"       # parquet chứa DOCs
QRELS_CSV    = "majority_erisk_24_clean.csv"    # file CSV sạch
OUT_PARQUET  = "dataset2024.parquet"            # output sau join

con = duckdb.connect()

con.execute(f"""
CREATE OR REPLACE VIEW docs AS
SELECT * FROM read_parquet('{PARQUET_PATH}');
""")

con.execute(f"""
CREATE OR REPLACE VIEW qrels_raw AS
SELECT * FROM read_csv_auto('{QRELS_CSV}', HEADER=TRUE);
""")

con.execute("""
CREATE OR REPLACE VIEW qrels AS
SELECT
  CAST(query AS INTEGER) AS query,
  TRIM(CAST(doc_id AS VARCHAR)) AS docno,   -- đổi tên để join
  CASE 
    WHEN UPPER(CAST(relevant AS VARCHAR)) IN ('TRUE','T','1','YES','Y') THEN TRUE
    ELSE FALSE
  END AS relevant
FROM qrels_raw;
""")

check = con.execute("""
SELECT 
  (SELECT COUNT(*) FROM qrels) AS qrels_rows,
  (SELECT COUNT(*) FROM docs)  AS docs_rows,
  (SELECT COUNT(*) FROM qrels q JOIN docs d USING(docno)) AS matched_rows,
  (SELECT COUNT(*) FROM qrels q LEFT JOIN docs d USING(docno) WHERE d.docno IS NULL) AS qrels_not_found
""").df()
print(check)

con.execute(f"""
COPY (
  SELECT
    q.query,
    q.docno,
    q.relevant,
    d.pre,
    d.text,
    d.post,
    d.source_file
  FROM qrels q
  JOIN docs d USING (docno)
) TO '{OUT_PARQUET}' (FORMAT PARQUET, COMPRESSION ZSTD);
""")

print(OUT_PARQUET)


   qrels_rows  docs_rows  matched_rows  qrels_not_found
0       14823   15542200         14823                0
dataset2024.parquet


In [12]:
con = duckdb.connect()
real_path = "dataset2024.parquet"

In [13]:
con.execute(f"""
CREATE OR REPLACE VIEW docs AS 
SELECT * FROM read_parquet('{real_path}');
""")

<duckdb.duckdb.DuckDBPyConnection at 0x259a2ddeb30>

In [14]:
peek = con.execute("""
SELECT *
FROM docs
LIMIT 20
""").df()
display(peek)

Unnamed: 0,query,docno,relevant,pre,text,post,source_file
0,19,98002_0_18,False,I also wouldn't mind a few people to talk to a...,Get my mind off of things.,Thanks for reading.,s_99.trec
1,11,98004_0_2,True,"When I was being prepped for surgery, the nurs...",and I became agitated.,I knew they wanted a sterilized environment fo...,s_99.trec
2,17,98004_0_6,False,She started wheeling me down to surgery and ke...,"Feeling it?""","to which I repeatedly told her ""No"".",s_99.trec
3,18,98006_0_22,False,That's when it will really set in that the who...,That's when he will realize that he really doe...,That he is struggling to even eat 2000 cals in...,s_99.trec
4,18,98006_0_26,False,**TL;DR:** People are skinny because they don'...,If you are skinny it's because you don't eat e...,"Or you have parasites, which you don't.",s_99.trec
5,19,98006_1_43,False,The closest I could be while still making out ...,I could not see anything except two red dots l...,"Not the road, not the lines, just two red dots...",s_99.trec
6,19,98011_0_4,True,I already decided to do all i can and focus on...,But I can find no clear mind.,She is gone since 2 Weeks now and i still cry ...,s_99.trec
7,19,98011_0_15,False,I fucked up my tests.,Dont know how to focus now.,,s_99.trec
8,16,98013_0_0,True,,My sleep paralysis.,it seems to come at the most random nights.,s_99.trec
9,16,98013_0_6,False,All that existed was my eyes and the deafening...,When i came to there were three very intense f...,I shit you not i regressed to the childhood na...,s_99.trec


In [15]:
con.execute("SELECT COUNT(*) AS n_rows FROM docs").df()


Unnamed: 0,n_rows
0,14823


In [16]:
OUT_CSV = "dataset2024.csv"

con.execute(f"""
COPY (
  SELECT *
  FROM docs
) TO '{OUT_CSV}' (HEADER, DELIMITER ',', QUOTE '"', ESCAPE '"');
""")

<duckdb.duckdb.DuckDBPyConnection at 0x259a2ddeb30>

In [17]:
df_1 = pd.read_csv("dataset2024.csv")
df_1.head()

Unnamed: 0,query,docno,relevant,pre,text,post,source_file
0,19,98002_0_18,False,I also wouldn't mind a few people to talk to a...,Get my mind off of things.,Thanks for reading.,s_99.trec
1,11,98004_0_2,True,"When I was being prepped for surgery, the nurs...",and I became agitated.,I knew they wanted a sterilized environment fo...,s_99.trec
2,17,98004_0_6,False,She started wheeling me down to surgery and ke...,"Feeling it?""","to which I repeatedly told her ""No"".",s_99.trec
3,18,98006_0_22,False,That's when it will really set in that the who...,That's when he will realize that he really doe...,That he is struggling to even eat 2000 cals in...,s_99.trec
4,18,98006_0_26,False,**TL;DR:** People are skinny because they don'...,If you are skinny it's because you don't eat e...,"Or you have parasites, which you don't.",s_99.trec
