In [16]:
import os
from collections import defaultdict
import pandas as pd

In [18]:
# Folder locations of the VQA validation split (questions and answers)
question_dir = 'D:\\VQA\\validation\\Questions'
answers_dir = 'D:\\VQA\\validation\\Answers'

# Pandas display tweaks so printed frames are never truncated
for option_name, option_value in (
    ('display.max_rows', None),      # show every row
    ('display.max_columns', None),   # show every column
    ('display.max_colwidth', None),  # show the full content of each cell
    ('display.width', 1000),         # total display width in characters
):
    pd.set_option(option_name, option_value)


In [20]:
def count_questions(directory):
    """Count how many files in ``directory`` hold each distinct question.

    Every regular file directly inside ``directory`` is read as UTF-8 and its
    whole content (with surrounding whitespace stripped) is treated as one
    question. Sub-directories are skipped.

    Args:
        directory: Path of the folder that stores one question per file.

    Returns:
        A ``(questions_df, error_files_df)`` tuple of DataFrames.
        ``questions_df`` has the columns ``Question`` and ``Count``.
        ``error_files_df`` has the columns ``File`` and ``Error`` and lists the
        files that could not be decoded as UTF-8. Two empty, column-less
        DataFrames are returned when ``directory`` is not an existing directory.
    """
    question_counts = defaultdict(int)
    error_files = []  # (filename, error message) for files that failed to decode

    # Use isdir rather than exists: a path pointing at a regular file would
    # pass an existence check and then make os.listdir raise NotADirectoryError.
    if not os.path.isdir(directory):
        print(f"Directory does not exist: {directory}")
        return pd.DataFrame(), pd.DataFrame()

    # Walk over every entry directly inside the directory
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        # Only regular files are counted
        if not os.path.isfile(file_path):
            continue
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read().strip()  # the whole file is one question
        except UnicodeDecodeError as e:
            # Remember the file instead of aborting the whole scan
            error_files.append((filename, str(e)))
            continue
        question_counts[content] += 1

    # Convert the collected results to DataFrames
    questions_df = pd.DataFrame(list(question_counts.items()), columns=['Question', 'Count'])
    error_files_df = pd.DataFrame(error_files, columns=['File', 'Error'])

    return questions_df, error_files_df

# Run the count over the question folder and report what was found
results_df, errors_df = count_questions(question_dir)

# Question frequencies, most frequent first
if results_df.empty:
    print("No data found or directory is empty.")
else:
    print("Question Counts:")
    print(results_df.sort_values(by='Count', ascending=False))

# Files that could not be decoded as UTF-8
if errors_df.empty:
    print("No decoding errors found.")
else:
    print("\nError Files:")
    print(errors_df)

Question Counts:
                                                                                                                                    Question  Count
15                                       How many cars are waiting to turn left in the bottom lane, moving from the bottom towards the left?      6
44                                   Are there many cars driving on the road right now? Not including the cars waiting at the traffic light.      5
20                                                                                     How many pedestrians are crossing the zebra crossing?      5
8                                                         In what direction is the car moving on the road? Excluding those that are waiting.      3
41                                                                                   Are there many people and motorcycles on the crosswalk?      3
14                                                                     How many cars are going 

In [12]:
# Find which files contain a given question
def find_files_by_question(directory, target_question):
    """Return the names of the files whose content equals ``target_question``.

    Every regular file directly inside ``directory`` is read as UTF-8; its
    stripped content is compared for exact equality with ``target_question``.
    Files that cannot be decoded are reported on stdout and skipped.

    Args:
        directory: Path of the folder that stores one question per file.
        target_question: The exact question text to look for.

    Returns:
        A list of matching file names (not full paths), in ``os.listdir``
        order. The list is empty when ``directory`` is not an existing
        directory.
    """
    # NOTE: the argument is used as given; it must not be overwritten here,
    # otherwise every call searches for the same hard-coded question.
    matching_files = []

    # Use isdir rather than exists: a path pointing at a regular file would
    # pass an existence check and then make os.listdir raise NotADirectoryError.
    if not os.path.isdir(directory):
        print(f"Directory does not exist: {directory}")
        return matching_files

    # Walk over every entry directly inside the directory
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        # Only regular files are inspected
        if not os.path.isfile(file_path):
            continue
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read().strip()  # the whole file is one question
        except UnicodeDecodeError:
            print(f"Could not decode file {file_path} using UTF-8.")
            continue
        # Exact comparison between the file content and the target question
        if content == target_question:
            matching_files.append(filename)

    return matching_files

# Call the function and print the results.
# The question is assigned in this cell because no other visible cell defines
# `target_question`; without it the call fails with NameError on a fresh kernel.
target_question = "Where are pedestrians crossing the road?"

matching_files = find_files_by_question(question_dir, target_question)
if matching_files:
    print("Matching Files:")
    for file in matching_files:
        print(file)
else:
    print("No files match the provided question.")

Matching Files:
148709_sj8fas2e152d20211124air_420_1637216131_1637218737_129_obstacle.txt
148709_sj8fas2e152d20211124air_420_1637216131_1637218737_31_obstacle.txt
148709_sj8fas2e152d20211124air_420_1637216131_1637218737_94_obstacle.txt


In [10]:
# Replace a question
def find_and_replace_in_files(directory, target_question, new_content):
    """Overwrite every file whose content equals ``target_question``.

    Every regular file directly inside ``directory`` is read as UTF-8. When
    its stripped content equals ``target_question`` exactly, the file is
    rewritten so that it contains ``new_content`` only. Decoding and I/O
    errors are reported on stdout and the affected file is skipped.

    Args:
        directory: Path of the folder that stores one question per file.
        target_question: The exact question text to look for.
        new_content: Text written to each matching file.

    Returns:
        A ``(matching_files, modified_files)`` tuple of file-name lists.
        ``matching_files`` holds every file whose content matched;
        ``modified_files`` holds those that were successfully rewritten.
        Both are empty when ``directory`` is not an existing directory.
    """
    # NOTE: both arguments are used as given; they must not be overwritten
    # here, otherwise every call performs the same hard-coded replacement.
    matching_files = []
    modified_files = []

    # Use isdir rather than exists: a path pointing at a regular file would
    # pass an existence check and then make os.listdir raise NotADirectoryError.
    if not os.path.isdir(directory):
        print(f"Directory does not exist: {directory}")
        return matching_files, modified_files

    # Walk over every entry directly inside the directory
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        # Only regular files are inspected
        if not os.path.isfile(file_path):
            continue
        try:
            # Read first and close the handle before reopening for writing,
            # so the file is never open for reading and writing at once.
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read().strip()  # the whole file is one question

            # Exact comparison between the file content and the target question
            if content != target_question:
                continue
            matching_files.append(filename)

            # Replace the file content
            with open(file_path, 'w', encoding='utf-8') as file_to_write:
                file_to_write.write(new_content)
            # Recorded only after the write handle was closed without error
            modified_files.append(filename)
        except UnicodeDecodeError:
            print(f"Could not decode file {file_path} using UTF-8.")
        except IOError as e:
            print(f"Error while writing to file {file_path}: {e}")

    return matching_files, modified_files

# Call the function and print the results.
# The search text and its replacement are assigned in this cell because no
# other visible cell defines `new_content`, and `target_question` must hold the
# un-punctuated variant here; without them the call fails on a fresh kernel.
target_question = "Where are pedestrians crossing the road"
new_content = "Where are pedestrians crossing the road?"

matching_files, modified_files = find_and_replace_in_files(question_dir, target_question, new_content)
if matching_files:
    print("Matching Files:")
    for file in matching_files:
        print(file)
    print("\nModified Files:")
    for file in modified_files:
        print(file)
else:
    print("No files match the provided question.")

Matching Files:
148709_sj8fas2e152d20211124air_420_1637216131_1637218737_31_obstacle.txt

Modified Files:
148709_sj8fas2e152d20211124air_420_1637216131_1637218737_31_obstacle.txt
