In [8]:
import pandas as pd
import glob
import os
import numpy as np

INPUT_DIR = './data'  
OUTPUT_DIR = './data'
NUM_NOTES = 50000

In [9]:
notes_file = os.path.join(INPUT_DIR, 'notes-00000.tsv')
df_notes = pd.read_csv(notes_file, sep='\t')

df_notes_subset = df_notes.head(NUM_NOTES).copy()

notes_out_path = os.path.join(OUTPUT_DIR, 'notes.tsv')
df_notes_subset.to_csv(notes_out_path, sep='\t', index=False)

valid_note_ids = set(df_notes_subset['noteId'])

  df_notes = pd.read_csv(notes_file, sep='\t')


In [None]:
# Processing Ratings

rating_files = sorted(glob.glob(os.path.join(INPUT_DIR, 'ratings-*.tsv')))

ratings_out_path = os.path.join(OUTPUT_DIR, 'ratings.tsv')
first_file = True
total_ratings = 0

for file_path in rating_files:
    filename = os.path.basename(file_path)
    print(f"Scanning {filename}...", end=" ")
    
    df_chunk = pd.read_csv(file_path, sep='\t')
    
    # Only keep ratings that belong to our 50k notes
    df_filtered = df_chunk[df_chunk['noteId'].isin(valid_note_ids)]
    
    count = len(df_filtered)
    print(f"Found {count} matching ratings.")
    
    if count > 0:
        # If it's the very first file, write the header. 
        # For subsequent files, skip the header so don't repeat it.
        mode = 'w' if first_file else 'a'
        header = first_file
        
        df_filtered.to_csv(ratings_out_path, sep='\t', mode=mode, header=header, index=False)
        
        total_ratings += count
        first_file = False

Scanning ratings-00000.tsv... Found 306642 matching ratings.
Scanning ratings-00001.tsv... Found 304007 matching ratings.
Scanning ratings-00002.tsv... Found 299212 matching ratings.
Scanning ratings-00003.tsv... Found 304328 matching ratings.
Scanning ratings-00004.tsv... Found 300978 matching ratings.
Scanning ratings-00005.tsv... Found 300147 matching ratings.
Scanning ratings-00006.tsv... Found 300294 matching ratings.
Scanning ratings-00007.tsv... Found 296118 matching ratings.
Scanning ratings-00008.tsv... Found 297601 matching ratings.
Scanning ratings-00009.tsv... Found 295242 matching ratings.
Scanning ratings-00010.tsv... Found 106290 matching ratings.
Scanning ratings-00011.tsv... Found 102946 matching ratings.
Scanning ratings-00012.tsv... Found 100956 matching ratings.
Scanning ratings-00013.tsv... Found 99264 matching ratings.
Scanning ratings-00014.tsv... Found 102503 matching ratings.
Scanning ratings-00015.tsv... Found 101207 matching ratings.
Scanning ratings-00016.ts

In [None]:
# Calculate Statistics

df_final_ratings = pd.read_csv(ratings_out_path, sep='\t')

ratings_per_note = df_final_ratings.groupby('noteId').size()
    
print(f"Total Notes in subset: {len(df_notes_subset)}")
print(f"Total Ratings collected: {len(df_final_ratings)}")
print(f"Median ratings per note: {ratings_per_note.median()}")
print(f"Mean ratings per note: {ratings_per_note.mean():.2f}")
    


Total Notes in subset: 50000
Total Ratings collected: 4010075
Median ratings per note: 26.0
Mean ratings per note: 90.12


In [23]:
import os
import sys
import subprocess

# --- CONFIGURATION ---
BASE_DIR = os.path.abspath(".")
REPO_DIR = os.path.join(BASE_DIR, "communitynotes")
SRC_DIR = os.path.join(REPO_DIR, "scoring", "src")

# Check your data folder name! (Use "./subset_data" if that's where you saved the files)
DATA_DIR = os.path.join(BASE_DIR, "data") 
NOTES_PATH = os.path.join(DATA_DIR, "notes.tsv")
RATINGS_PATH = os.path.join(DATA_DIR, "ratings.tsv")

OUTPUT_DIR = os.path.join(BASE_DIR, "data")
LOG_FILE = "run_logs.txt"

# --- RUN ALGORITHM ---
os.makedirs(OUTPUT_DIR, exist_ok=True)

# 1. Setup the environment so Python finds the 'scoring' library
my_env = os.environ.copy()
my_env["PYTHONPATH"] = SRC_DIR + os.pathsep + my_env.get("PYTHONPATH", "")

print("\n" + "="*50)
print(f"STARTING ALGORITHM")
print(f"Input: {DATA_DIR}")
print(f"Output: {OUTPUT_DIR}")
print("="*50)

# 2. Use subprocess to run as a module (-m)
# This fixes the "ImportError: attempted relative import"
command = [
    sys.executable, "-u", "-m", "scoring.run_scoring",
    "--notes", NOTES_PATH,
    "--ratings", RATINGS_PATH,
    "--outdir", OUTPUT_DIR
]

print(f"Executing: {' '.join(command)}")

# 3. Run and save logs
with open(LOG_FILE, "w") as f:
    process = subprocess.run(command, env=my_env, stdout=f, stderr=subprocess.STDOUT)

if process.returncode == 0:
    print("\nSUCCESS! Algorithm finished.")
    print(f"Logs saved to: {LOG_FILE}")
    
    # 4. Find Loss
    print("\n--- MATRIX FACTORIZATION LOSS ---")
    found = False
    with open(LOG_FILE, 'r') as log:
        for line in log:
            if "loss" in line.lower() or "cost" in line.lower():
                print(line.strip())
                found = True
    if not found:
        print("Loss not found automatically. Please check run_logs.txt.")
else:
    print(f"\nFAILURE. Exit code: {process.returncode}")
    print(f"Check {LOG_FILE} for error details.")


STARTING ALGORITHM
Input: c:\Users\yuanxili\code\494-algo-representations\data
Output: c:\Users\yuanxili\code\494-algo-representations\data
Executing: C:\Users\yuanxili\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -u -m scoring.run_scoring --notes c:\Users\yuanxili\code\494-algo-representations\data\notes.tsv --ratings c:\Users\yuanxili\code\494-algo-representations\data\ratings.tsv --outdir c:\Users\yuanxili\code\494-algo-representations\data

FAILURE. Exit code: 1
Check run_logs.txt for error details.
