In [7]:
import os
import glob
import numpy as np
import jieba
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.svm import SVC
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import accuracy_score



In [8]:
def load_data(corpus_dir, K, token_mode='word', sample_size=1000):
    corpus = []
    labels = []
    txt_files = glob.glob(os.path.join(corpus_dir, "*.txt"))
    for file in txt_files:
        with open(file, 'r', encoding='gb18030', errors='ignore') as f:
            text = f.read().strip()
            if token_mode == 'word':
                tokens = list(jieba.cut(text))
            elif token_mode == 'char':
                tokens = list(text)
            else:
                raise ValueError("token_mode must be 'word' or 'char'")
            # Split into paragraphs of K tokens
            num_paragraphs = len(tokens) // K
            for i in range(num_paragraphs):
                paragraph = tokens[i*K : (i+1)*K]
                corpus.append(' '.join(paragraph))
                labels.append(os.path.basename(file).replace('.txt', ''))
    # Convert to numpy arrays
    corpus = np.array(corpus)
    labels = np.array(labels)
    # Random sample
    if len(corpus) < sample_size:
        raise ValueError(f"Not enough paragraphs. Only {len(corpus)} available.")
    indices = np.random.choice(len(corpus), size=sample_size, replace=False)
    return corpus[indices], labels[indices]

def evaluate(corpus, labels, T, token_mode='word', n_splits=10, test_size=100):
    cv = ShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=42)
    accuracies = []
    for train_idx, test_idx in cv.split(corpus):
        X_train, X_test = corpus[train_idx], corpus[test_idx]
        y_train, y_test = labels[train_idx], labels[test_idx]
        
        # Vectorization
        vectorizer = CountVectorizer()
        X_train_counts = vectorizer.fit_transform(X_train)
        X_test_counts = vectorizer.transform(X_test)
        
        # LDA
        lda = LatentDirichletAllocation(n_components=T, random_state=42)
        X_train_lda = lda.fit_transform(X_train_counts)
        X_test_lda = lda.transform(X_test_counts)
        
        # Classification
        clf = SVC(kernel='linear', random_state=42)
        clf.fit(X_train_lda, y_train)
        y_pred = clf.predict(X_test_lda)
        accuracies.append(accuracy_score(y_test, y_pred))
    return np.mean(accuracies)

def main():
    corpus_dir = "D:/课程/大四/大四下/自然语言处理/第二次作业/jyxstxtqj_downcc.com/"
    results = []
    
    # 实验参数配置（示例用较小参数组合）
    K_values = [20, 100, 500, 1000, 3000]  # 完整实验需包含所有指定K值
    T_values = [5, 10, 20, 50, 100]
    token_modes = ['word', 'char']
    
    for K in K_values:
        for token_mode in token_modes:
            try:
                print(f"\nProcessing K={K}, mode={token_mode}...")
                corpus, labels = load_data(corpus_dir, K=K, token_mode=token_mode)
                for T in T_values:
                    acc = evaluate(corpus, labels, T=T, token_mode=token_mode)
                    results.append({'K': K, 'Mode': token_mode, 'T': T, 'Accuracy': acc})
                    print(f"K={K}, Mode={token_mode}, T={T} => Accuracy: {acc:.4f}")
            except Exception as e:
                print(f"Error with K={K}, mode={token_mode}: {str(e)}")
    
    # 结果展示与分析
    df = pd.DataFrame(results)
    print("\nResults Summary:")
    print(df)
    
    # 结果讨论
    print("\nAnalysis:")
    print("1. 主题数量T的影响：随着T增加，准确率可能先升后降，最佳T值需平衡信息量与噪声。")
    print("2. 分词vs分字：分词通常携带更多语义信息，但分字对未登录词更鲁棒。")
    print("3. 段落长度K：短文本（K小）信息不足，长文本（K大）可能包含多主题，需适中长度。")



In [9]:
if __name__ == '__main__':
    main()


Processing K=20, mode=word...
K=20, Mode=word, T=5 => Accuracy: 0.1420
K=20, Mode=word, T=10 => Accuracy: 0.1400
K=20, Mode=word, T=20 => Accuracy: 0.1400
K=20, Mode=word, T=50 => Accuracy: 0.1620
K=20, Mode=word, T=100 => Accuracy: 0.1760

Processing K=20, mode=char...
Error with K=20, mode=char: empty vocabulary; perhaps the documents only contain stop words

Processing K=100, mode=word...
K=100, Mode=word, T=5 => Accuracy: 0.1700
K=100, Mode=word, T=10 => Accuracy: 0.1730
K=100, Mode=word, T=20 => Accuracy: 0.1860
K=100, Mode=word, T=50 => Accuracy: 0.2060
K=100, Mode=word, T=100 => Accuracy: 0.2250

Processing K=100, mode=char...
Error with K=100, mode=char: empty vocabulary; perhaps the documents only contain stop words

Processing K=500, mode=word...
K=500, Mode=word, T=5 => Accuracy: 0.2660
K=500, Mode=word, T=10 => Accuracy: 0.3080
K=500, Mode=word, T=20 => Accuracy: 0.3500
K=500, Mode=word, T=50 => Accuracy: 0.3640
K=500, Mode=word, T=100 => Accuracy: 0.3810

Processing K=500