In [6]:
import pandas as pd

def _is_separator_row(cells):
    """Return True if ``cells`` look like a Markdown table delimiter row.

    A delimiter cell consists only of hyphens with optional alignment
    colons, e.g. ``---``, ``:---`` or ``:---:``.
    """
    return bool(cells) and all(
        '-' in cell and set(cell) <= {'-', ':'} for cell in cells
    )


def markdown_to_csv(markdown_file_path, encoding='utf-8'):
    """Convert the pipe-delimited table in a Markdown file to a CSV file.

    Every line containing ``|`` is treated as a table row. The first such
    line supplies the column names; the Markdown delimiter row directly
    below the header (``|---|---|``) is dropped if present. Rows are expected
    to start and end with ``|``; the text outside the outer pipes is discarded.

    The CSV is written next to the input, with the input's extension
    replaced by ``.csv``, and a confirmation message is printed.

    Args:
        markdown_file_path: Path of the Markdown file to read.
        encoding: Text encoding used to read the Markdown file.

    Raises:
        ValueError: If the file contains no line with a ``|`` character.
    """
    # Local import so this block does not depend on a file-level `import os`.
    import os

    # Read the Markdown file with an explicit encoding so the result does not
    # depend on the platform's locale default.
    with open(markdown_file_path, 'r', encoding=encoding) as f:
        lines = f.readlines()

    # Extract the cells of each table row: drop the empty pieces produced by
    # the leading/trailing '|' and strip surrounding whitespace from each cell.
    rows = [
        [cell.strip() for cell in line.split('|')[1:-1]]
        for line in lines
        if '|' in line
    ]

    if not rows:
        raise ValueError(f"No Markdown table found in {markdown_file_path}")

    # Only the row right under the header can be the delimiter row; checking
    # just that position keeps genuine data rows such as "| - | - |" intact.
    if len(rows) > 1 and _is_separator_row(rows[1]):
        del rows[1]

    # Build the DataFrame: first row is the header, the rest are records.
    df = pd.DataFrame(rows[1:], columns=rows[0])

    # Derive the CSV name by swapping only the final extension of the file
    # name; splitext ignores dots that appear in directory names.
    csv_file_name = os.path.splitext(markdown_file_path)[0] + '.csv'

    # Write the CSV file.
    df.to_csv(csv_file_name, index=False)

    print(f"Converted {markdown_file_path} to {csv_file_name}")
# Convert the Markdown table in ICLR2019data.md to a CSV file with the same base name.
markdown_to_csv("ICLR2019data.md")

Converted ICLR2019data.md to ICLR2019data.csv


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def mark_best_papers(csv_file_path, best_papers_txt_file_path,
                     min_similarity=0.0, encoding='utf-8'):
    """Flag the CSV rows whose titles best match a list of best-paper titles.

    The CSV must have a ``Title`` column. Each non-blank line of the text file
    is taken as one best-paper title. All titles are embedded with TF-IDF and,
    for every best-paper title, the CSV row with the highest cosine similarity
    gets ``Best Paper = 1``; every other row gets ``0``. The CSV file is
    overwritten in place with the added/updated ``Best Paper`` column.

    Args:
        csv_file_path: Path of the CSV file to read and overwrite.
        best_papers_txt_file_path: Path of the text file with one best-paper
            title per line.
        min_similarity: A row is flagged only if its cosine similarity to the
            best-paper title is strictly greater than this value. The default
            of 0.0 prevents flagging a row when the title shares no terms
            with any CSV title.
        encoding: Text encoding used to read the best-paper title file.
    """
    # Load the CSV into a DataFrame.
    df = pd.read_csv(csv_file_path)

    # Load the best-paper titles, dropping empty and whitespace-only lines.
    with open(best_papers_txt_file_path, 'r', encoding=encoding) as f:
        best_papers = [line.strip() for line in f.read().splitlines() if line.strip()]

    # Start with every row unflagged.
    df['Best Paper'] = 0

    # With no rows or no titles there is nothing to match (and
    # cosine_similarity would reject an empty matrix), so skip straight to saving.
    if len(df) > 0 and best_papers:
        # TfidfVectorizer requires string documents; missing or numeric
        # titles read from the CSV are coerced instead of crashing.
        csv_titles = df['Title'].fillna('').astype(str).tolist()

        # Fit TF-IDF over both title sets so they share one vocabulary.
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(csv_titles + best_papers)
        csv_vectors = tfidf_matrix[:len(csv_titles)]
        best_vectors = tfidf_matrix[len(csv_titles):]

        # One row of similarities per best-paper title, one column per CSV row.
        similarities = cosine_similarity(best_vectors, csv_vectors)

        for best_title, scores in zip(best_papers, similarities):
            most_similar_index = int(scores.argmax())
            # argmax() returns 0 when every score is 0, which would wrongly
            # flag the first row; require a real match instead.
            if scores[most_similar_index] > min_similarity:
                # read_csv yields a default 0..n-1 index, so the positional
                # argmax result is also the row label.
                df.loc[most_similar_index, 'Best Paper'] = 1
            else:
                print(f"No matching title found for best paper: {best_title}")

    # Save the result back to the same CSV file.
    df.to_csv(csv_file_path, index=False)
# Add the "Best Paper" flag column to ICLR2019data.csv (rewritten in place) using the titles in bestPaperICLR2019.txt.
mark_best_papers("ICLR2019data.csv","bestPaperICLR2019.txt")