In [91]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
warnings.filterwarnings('ignore')


In [136]:
class RemoveList:
    """Names of participants whose data files must be excluded.

    ``List`` is read by ``DataList.readData``: any file whose name contains
    one of these names is deleted from the data folder.

    Attributes:
        List: On the class, the default names. On an instance, a private copy
            of those defaults plus whatever was added through ``add``.
    """

    # Default names. Each instance copies this list in __init__, so add()
    # never mutates state shared between instances or across cell re-runs.
    List = ['申伊航', '李嘉宁']

    def __init__(self):
        self.List = list(type(self).List)

    def add(self, nameList):
        """Add names to this instance's exclusion list.

        Args:
            nameList: An iterable of names, or a single name given as ``str``.
                A bare string is treated as one name; iterating it would add
                every character as a separate "name".

        Names already present are skipped, so repeating a call does not grow
        the list.
        """
        if isinstance(nameList, str):
            nameList = [nameList]
        for name in nameList:
            if name not in self.List:
                self.List.append(name)


class EncodingList:
    """Text encodings to try when reading the raw CSV files.

    Attributes:
        List: Candidate encoding names, in the order they are attempted.
        encoding: The encoding currently selected; starts at the constructor
            argument and is reassigned by ``DataList.readData`` once a
            candidate decodes a file successfully.
    """

    List = [
        'gb18030', 'utf-8', 'ansi', 'GB2312', 'GBK',
        'utf-16', 'utf-32', 'utf-8-sig',
        'utf-16-le', 'utf-16-be', 'utf-32-le', 'utf-32-be',
    ]

    def __init__(self, encoding = 'gb18030'):
        """Remember ``encoding`` as the initially selected encoding."""
        self.encoding = encoding


class CalculateAccuracy:
    """Callable that computes the fraction of correct rows in a DataFrame.

    Two modes, chosen at construction time:

    * ``repeated_times_col`` given: a row is correct when that column is 0.
    * otherwise: a row is correct when ``target_col`` equals ``response_col``.

    Args:
        repeated_times_col: Column compared against 0. When non-empty it takes
            precedence over the other two columns.
        target_col: Column holding the expected value.
        response_col: Column holding the given value.
    """

    def __init__(self, repeated_times_col = '', target_col = '', response_col = ''):
        self.repeated_times_col = repeated_times_col
        self.target_col = target_col
        self.response_col = response_col

    def __call__(self, dataFrame):
        """Return the accuracy of ``dataFrame``.

        Args:
            dataFrame: The rows to score.

        Returns:
            ``correct / total`` as a float, or ``0`` when the frame has no rows.

        Raises:
            TypeError: ``dataFrame`` is not a ``pandas.DataFrame``.
            ValueError: A column required by the active mode is missing.
        """
        if not isinstance(dataFrame, pd.DataFrame):
            raise TypeError("输入必须是 pandas DataFrame 类型")

        if self.repeated_times_col:
            if self.repeated_times_col not in dataFrame.columns:
                raise ValueError(f"DataFrame 中必须包含 '{self.repeated_times_col}' 列")
            isCorrect = dataFrame[self.repeated_times_col] == 0
        else:
            if self.target_col not in dataFrame.columns or self.response_col not in dataFrame.columns :
                raise ValueError(f"DataFrame 中必须包含 '{self.target_col}' 和 '{self.response_col}' 列")
            isCorrect = dataFrame[self.target_col] == dataFrame[self.response_col]

        total = len(dataFrame)
        if total == 0:
            return 0
        # Vectorized count instead of iterrows(); sum() skips missing values,
        # so a row whose comparison is NA is simply counted as not correct.
        return int(isCorrect.sum()) / total

class DataList:
    """Collects per-participant CSV files from a folder into one DataFrame.

    ``readData`` relies on three module-level objects that must exist before
    it is called: ``removeList`` (its ``List`` holds names to exclude),
    ``encodingList`` (its ``List`` holds candidate encodings; its ``encoding``
    is updated) and ``calculateAccuracy`` (called with each file's DataFrame).

    Attributes:
        dataMerged: Concatenation of every accepted file's rows.
        numofMale / numofFemale: Number of accepted participants per gender.
        maxNum: Total participant quota; half is reserved for each gender.
        dataDir: Folder that is scanned, resolved against the working directory.
    """

    # Class-level defaults kept so attribute access on the class still works;
    # __init__ gives every instance its own values.
    dataMerged = pd.DataFrame()
    numofMale = 0
    numofFemale = 0

    def __init__(self, maxNum = 20, dataFolderName = ''):
        """Set the quota and the data folder.

        Raises:
            ValueError: ``maxNum`` is odd, so it cannot be split evenly.
        """
        if maxNum % 2 != 0:
            raise ValueError("maxNum must be even")
        self.maxNum = maxNum
        self.maxNumofMale = self.maxNum/2
        self.maxNumofFemale = self.maxNum/2
        self.dataDir = os.path.join(os.path.abspath('.'), dataFolderName)
        self.dataMerged = pd.DataFrame()
        self.numofMale = 0
        self.numofFemale = 0

    def _readCsv(self, filePath, knownEncoding = None):
        """Read one CSV; return ``(DataFrame, encoding that worked)``."""
        if knownEncoding is not None:
            try:
                return pd.read_csv(filePath, sep=',', encoding=knownEncoding), knownEncoding
            except UnicodeError as e:
                # This file differs from the earlier ones: fall back to probing.
                print(f"使用编码 {knownEncoding} 失败: {e}")

        lastError = None
        for encoding in encodingList.List:
            if encoding == knownEncoding:
                continue
            try:
                df = pd.read_csv(filePath, sep=',', encoding=encoding)
            except (UnicodeError, LookupError) as e:
                # LookupError: codec unknown on this platform ('ansi' is an
                # alias of the Windows-only 'mbcs' codec).
                print(f"使用编码 {encoding} 失败: {e}")
                lastError = e
                continue
            print(f"使用编码 {encoding} 成功")
            encodingList.encoding = encoding
            return df, encoding
        raise UnicodeError(f"无法使用任何已知编码读取 {filePath}") from lastError

    def readData(self, needed_num_row = 0, accuracy_threshold = None, gender_col = ''):
        """Scan ``dataDir``, annotate file names with accuracy and merge files.

        NOTE: this method modifies the data folder. Files whose name contains
        an entry of ``removeList.List`` are deleted, files are renamed to
        ``accuracy=<value>_<name>``, and (when a threshold is given) files that
        do not exceed it are deleted.

        Args:
            needed_num_row: Keep only the first N rows of each file. ``0``
                keeps all rows (slicing with 0 would leave an empty frame).
            accuracy_threshold: When truthy, enables the gender quota and
                merges only files whose accuracy is strictly greater than it.
                When falsy, files are only renamed and nothing is merged.
            gender_col: Column whose single value ('Male' / 'Female') selects
                the quota. Files with any other or mixed values bypass the
                per-gender quota.

        Raises:
            UnicodeError: No candidate encoding could decode a file.
        """
        knownEncoding = None
        # sorted(): os.listdir order is arbitrary, and with a quota the order
        # decides which participants are kept.
        for file in sorted(os.listdir(self.dataDir)):
            # Stop as soon as the total quota is filled.
            if accuracy_threshold and self.numofMale + self.numofFemale >= self.maxNum:
                break

            filePath = os.path.join(self.dataDir, file)
            if not os.path.isfile(filePath):
                # Sub-directories (e.g. .ipynb_checkpoints) are not data files.
                continue
            print(file)

            # Delete files belonging to an excluded participant.
            matchedName = next((name for name in removeList.List if name in file), None)
            if matchedName is not None:
                os.remove(filePath)
                print(f"{file} 包含 {matchedName}，已删除")
                continue

            df, knownEncoding = self._readCsv(filePath, knownEncoding)

            if needed_num_row:
                df = df.iloc[:needed_num_row]

            gender = None
            if accuracy_threshold:
                genders = df[gender_col].unique()
                if len(genders) == 1:
                    gender = genders[0]
                # Skip (without touching the file) once this gender is full.
                if gender == 'Male' and self.numofMale >= self.maxNumofMale:
                    continue
                if gender == 'Female' and self.numofFemale >= self.maxNumofFemale:
                    continue

            accuracy = calculateAccuracy(df)
            if 'accuracy' not in file:
                newFilename = f"accuracy={accuracy:.2f}_{file}"
                newFilePath = os.path.join(self.dataDir, newFilename)
                os.rename(filePath, newFilePath)
                # Track the new path: a later os.remove must hit the renamed file.
                file, filePath = newFilename, newFilePath

            if accuracy_threshold:
                if accuracy > accuracy_threshold:
                    self.dataMerged = pd.concat([self.dataMerged, df],axis=0)
                    # Count a participant only once their data is accepted, so
                    # rejected files do not consume quota.
                    if gender == 'Male':
                        self.numofMale += 1
                    elif gender == 'Female':
                        self.numofFemale += 1
                    print(f"已合并{file}")
                else:
                    os.remove(filePath)
                    print(f"{file} 准确率低于{accuracy_threshold}，已删除")
        print(f"已读取 {self.numofMale} 名男性数据和 {self.numofFemale} 名女性数据")


In [141]:
# Run parameters, gathered in one place so they are easy to review and tune.
RAW_DATA_FOLDER = "rawdata"
MERGED_OUTPUT_FILE = 'dataMerged.xlsx'
MAX_PARTICIPANTS = 30
ROWS_PER_PARTICIPANT = 192
ACCURACY_THRESHOLD = 0.86

# These three names are looked up as globals inside DataList.readData, so they
# must be bound before readData is called.
removeList = RemoveList()
encodingList = EncodingList()
calculateAccuracy = CalculateAccuracy(repeated_times_col='RepeatedTimes')

# Scan the raw-data folder, merge the accepted files and export the result.
dataList = DataList(dataFolderName=RAW_DATA_FOLDER, maxNum=MAX_PARTICIPANTS)
dataList.readData(
    needed_num_row=ROWS_PER_PARTICIPANT,
    accuracy_threshold=ACCURACY_THRESHOLD,
    gender_col='SubSex',
)
dataList.dataMerged.to_excel(MERGED_OUTPUT_FILE, index=False)


accuracy=0.86_Sub_3220100780_王雨轩_客体文件回溯实验_DATA.csv
使用编码 gb18030 失败: 'gb18030' codec can't decode byte 0x80 in position 283: illegal multibyte sequence
使用编码 utf-8 失败: 'utf-8' codec can't decode byte 0xcd in position 0: invalid continuation byte
使用编码 ansi 成功
已合并accuracy=0.86_Sub_3220100780_王雨轩_客体文件回溯实验_DATA.csv
accuracy=0.86_Sub_3220103411_周律_客体文件回溯实验_DATA.csv
已合并accuracy=0.86_Sub_3220103411_周律_客体文件回溯实验_DATA.csv
accuracy=0.87_Sub_3220101542_杨憬怡_客体文件回溯实验_DATA.csv
已合并accuracy=0.87_Sub_3220101542_杨憬怡_客体文件回溯实验_DATA.csv
accuracy=0.87_Sub_3220105917_闫昱瑶_客体文件回溯实验_DATA.csv
已合并accuracy=0.87_Sub_3220105917_闫昱瑶_客体文件回溯实验_DATA.csv
accuracy=0.88_Sub_3180103611_周家瑞_客体文件回溯实验_DATA.csv
已合并accuracy=0.88_Sub_3180103611_周家瑞_客体文件回溯实验_DATA.csv
accuracy=0.88_Sub_3210104642_刘一尘_客体文件回溯实验_DATA.csv
已合并accuracy=0.88_Sub_3210104642_刘一尘_客体文件回溯实验_DATA.csv
accuracy=0.88_Sub_3220100474_滕浩宇_客体文件回溯实验_DATA.csv
已合并accuracy=0.88_Sub_3220100474_滕浩宇_客体文件回溯实验_DATA.csv
accuracy=0.88_Sub_3220100633_张嘉凯_客体文件回溯实验_DATA.csv
已合并accurac