# SVM

In [17]:
import os
import glob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import numpy as np


folder_1 = r"C:\Users\86178\Desktop\TF-IDF基线模型评估语料\1"
folder_2 = r"C:\Users\86178\Desktop\TF-IDF基线模型评估语料\2"


def load_documents_from_folder(folder_path):
    documents = []
    labels = []
    for txt_file in glob.glob(os.path.join(folder_path, "*.txt")):
        with open(txt_file, 'r', encoding='utf-8') as file:
          
            content = file.read().strip().replace('\n', ' ')  
            documents.append(content)
           
            label = os.path.basename(txt_file)[0]  
            labels.append(label)
    return documents, labels


docs_1, labels_1 = load_documents_from_folder(folder_1)
docs_2, labels_2 = load_documents_from_folder(folder_2)


documents = docs_1 + docs_2
labels = labels_1 + labels_2


print(f"Total number of documents: {len(documents)}")  


vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(documents)


print(f"Shape of the TF-IDF matrix: {X.shape}")  


X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.3, random_state=42)


svm_classifier = SVC(kernel='linear', random_state=42)


svm_classifier.fit(X_train, y_train)


y_pred = svm_classifier.predict(X_test)


report = classification_report(y_test, y_pred, output_dict=True)


for label, metrics in report.items():
    if label != 'accuracy':
        for metric, value in metrics.items():
            if isinstance(value, float):
                metrics[metric] = round(value, 3)


print("Classification Report (Rounded to 3 decimal places):")
for label, metrics in report.items():
    if label != 'accuracy':
        print(f"\nLabel: {label}")
        for metric, value in metrics.items():
            print(f"  {metric}: {value}")
    

new_document = ["Python is great for data science"]  
X_new = vectorizer.transform(new_document)
new_pred = svm_classifier.predict(X_new)
print("Predicted Category for the new document:", new_pred[0])


Total number of documents: 1333
Shape of the TF-IDF matrix: (1333, 17101)
Classification Report (Rounded to 3 decimal places):

Label: A
  precision: 0.8
  recall: 0.571
  f1-score: 0.667
  support: 21.0

Label: B
  precision: 0.82
  recall: 0.891
  f1-score: 0.854
  support: 46.0

Label: C
  precision: 0.877
  recall: 0.781
  f1-score: 0.826
  support: 73.0

Label: D
  precision: 0.8
  recall: 0.8
  f1-score: 0.8
  support: 85.0

Label: E
  precision: 0.469
  recall: 0.19
  f1-score: 0.27
  support: 79.0

Label: F
  precision: 0.562
  recall: 0.896
  f1-score: 0.691
  support: 96.0

Label: macro avg
  precision: 0.721
  recall: 0.688
  f1-score: 0.685
  support: 400.0

Label: weighted avg
  precision: 0.694
  recall: 0.698
  f1-score: 0.673
  support: 400.0
Predicted Category for the new document: F


# NB

In [18]:
import os
import glob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import numpy as np


folder_1 = r"C:\Users\86178\Desktop\TF-IDF基线模型评估语料\1"
folder_2 = r"C:\Users\86178\Desktop\TF-IDF基线模型评估语料\2"


def load_documents_from_folder(folder_path):
    documents = []
    labels = []
    for txt_file in glob.glob(os.path.join(folder_path, "*.txt")):
        with open(txt_file, 'r', encoding='utf-8') as file:
           
            content = file.read().strip().replace('\n', ' ')  
            documents.append(content)
            
            label = os.path.basename(txt_file)[0]  
            labels.append(label)
    return documents, labels


docs_1, labels_1 = load_documents_from_folder(folder_1)
docs_2, labels_2 = load_documents_from_folder(folder_2)


documents = docs_1 + docs_2
labels = labels_1 + labels_2


print(f"Total number of documents: {len(documents)}") 


vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(documents)


print(f"Shape of the TF-IDF matrix: {X.shape}") 


X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.3, random_state=42)


nb_classifier = MultinomialNB()


nb_classifier.fit(X_train, y_train)


y_pred = nb_classifier.predict(X_test)


report = classification_report(y_test, y_pred, output_dict=True)


for label, metrics in report.items():
    if label != 'accuracy':  
        for metric, value in metrics.items():
            if isinstance(value, float):
                metrics[metric] = round(value, 3)


print("Classification Report (Rounded to 3 decimal places):")
for label, metrics in report.items():
    if label != 'accuracy':
        print(f"\nLabel: {label}")
        for metric, value in metrics.items():
            print(f"  {metric}: {value}")


new_document = ["Python is great for data science"] 
X_new = vectorizer.transform(new_document)
new_pred = nb_classifier.predict(X_new)
print("Predicted Category for the new document:", new_pred[0])


Total number of documents: 1333
Shape of the TF-IDF matrix: (1333, 17101)
Classification Report (Rounded to 3 decimal places):

Label: A
  precision: 0.0
  recall: 0.0
  f1-score: 0.0
  support: 21.0

Label: B
  precision: 0.667
  recall: 0.217
  f1-score: 0.328
  support: 46.0

Label: C
  precision: 0.537
  recall: 0.699
  f1-score: 0.607
  support: 73.0

Label: D
  precision: 0.586
  recall: 0.918
  f1-score: 0.716
  support: 85.0

Label: E
  precision: 0.571
  recall: 0.051
  f1-score: 0.093
  support: 79.0

Label: F
  precision: 0.56
  recall: 0.875
  f1-score: 0.683
  support: 96.0

Label: macro avg
  precision: 0.487
  recall: 0.46
  f1-score: 0.404
  support: 400.0

Label: weighted avg
  precision: 0.547
  recall: 0.568
  f1-score: 0.483
  support: 400.0
Predicted Category for the new document: F


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# KNN

In [19]:
import os
import glob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
import numpy as np


folder_1 = r"C:\Users\86178\Desktop\TF-IDF基线模型评估语料\1"
folder_2 = r"C:\Users\86178\Desktop\TF-IDF基线模型评估语料\2"


def load_documents_from_folder(folder_path):
    documents = []
    labels = []
    for txt_file in glob.glob(os.path.join(folder_path, "*.txt")):
        with open(txt_file, 'r', encoding='utf-8') as file:
         
            content = file.read().strip().replace('\n', ' ')  
            documents.append(content)
           
            label = os.path.basename(txt_file)[0]  
            labels.append(label)
    return documents, labels


docs_1, labels_1 = load_documents_from_folder(folder_1)
docs_2, labels_2 = load_documents_from_folder(folder_2)


documents = docs_1 + docs_2
labels = labels_1 + labels_2


print(f"Total number of documents: {len(documents)}")  


vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(documents)


print(f"Shape of the TF-IDF matrix: {X.shape}")  


X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.3, random_state=42)


knn_classifier = KNeighborsClassifier(n_neighbors=5)


knn_classifier.fit(X_train, y_train)


y_pred = knn_classifier.predict(X_test)


report = classification_report(y_test, y_pred, output_dict=True)


for label, metrics in report.items():
    if label != 'accuracy': 
        for metric, value in metrics.items():
            if isinstance(value, float):
                metrics[metric] = round(value, 3)


print("Classification Report (Rounded to 3 decimal places):")
for label, metrics in report.items():
    if label != 'accuracy':
        print(f"\nLabel: {label}")
        for metric, value in metrics.items():
            print(f"  {metric}: {value}")


new_document = ["Python is great for data science"] 
X_new = vectorizer.transform(new_document)
new_pred = knn_classifier.predict(X_new)
print("Predicted Category for the new document:", new_pred[0])


Total number of documents: 1333
Shape of the TF-IDF matrix: (1333, 17101)
Classification Report (Rounded to 3 decimal places):

Label: A
  precision: 0.293
  recall: 0.81
  f1-score: 0.43
  support: 21.0

Label: B
  precision: 0.369
  recall: 0.522
  f1-score: 0.432
  support: 46.0

Label: C
  precision: 0.514
  recall: 0.521
  f1-score: 0.517
  support: 73.0

Label: D
  precision: 0.455
  recall: 0.647
  f1-score: 0.534
  support: 85.0

Label: E
  precision: 0.293
  recall: 0.152
  f1-score: 0.2
  support: 79.0

Label: F
  precision: 0.659
  recall: 0.281
  f1-score: 0.394
  support: 96.0

Label: macro avg
  precision: 0.43
  recall: 0.489
  f1-score: 0.418
  support: 400.0

Label: weighted avg
  precision: 0.464
  recall: 0.432
  f1-score: 0.414
  support: 400.0
Predicted Category for the new document: F


# LR

In [20]:
import os
import glob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import numpy as np


folder_1 = r"C:\Users\86178\Desktop\TF-IDF基线模型评估语料\1"
folder_2 = r"C:\Users\86178\Desktop\TF-IDF基线模型评估语料\2"


def load_documents_from_folder(folder_path):
    documents = []
    labels = []
    for txt_file in glob.glob(os.path.join(folder_path, "*.txt")):
        with open(txt_file, 'r', encoding='utf-8') as file:
            
            content = file.read().strip().replace('\n', ' ') 
            documents.append(content)
            
            label = os.path.basename(txt_file)[0]  
            labels.append(label)
    return documents, labels


docs_1, labels_1 = load_documents_from_folder(folder_1)
docs_2, labels_2 = load_documents_from_folder(folder_2)


documents = docs_1 + docs_2
labels = labels_1 + labels_2


print(f"Total number of documents: {len(documents)}") 


vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(documents)


print(f"Shape of the TF-IDF matrix: {X.shape}")  


X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.3, random_state=42)


log_reg_classifier = LogisticRegression(max_iter=1000, random_state=42)


log_reg_classifier.fit(X_train, y_train)


y_pred = log_reg_classifier.predict(X_test)


report = classification_report(y_test, y_pred, output_dict=True)


for label, metrics in report.items():
    if label != 'accuracy':  
        for metric, value in metrics.items():
            if isinstance(value, float):
                metrics[metric] = round(value, 3)


print("Classification Report (Rounded to 3 decimal places):")
for label, metrics in report.items():
    if label != 'accuracy':
        print(f"\nLabel: {label}")
        for metric, value in metrics.items():
            print(f"  {metric}: {value}")


new_document = ["Python is great for data science"]  
X_new = vectorizer.transform(new_document)
new_pred = log_reg_classifier.predict(X_new)
print("Predicted Category for the new document:", new_pred[0])


Total number of documents: 1333
Shape of the TF-IDF matrix: (1333, 17101)
Classification Report (Rounded to 3 decimal places):

Label: A
  precision: 1.0
  recall: 0.333
  f1-score: 0.5
  support: 21.0

Label: B
  precision: 0.861
  recall: 0.674
  f1-score: 0.756
  support: 46.0

Label: C
  precision: 0.794
  recall: 0.74
  f1-score: 0.766
  support: 73.0

Label: D
  precision: 0.79
  recall: 0.753
  f1-score: 0.771
  support: 85.0

Label: E
  precision: 0.5
  recall: 0.089
  f1-score: 0.151
  support: 79.0

Label: F
  precision: 0.474
  recall: 0.958
  f1-score: 0.634
  support: 96.0

Label: macro avg
  precision: 0.737
  recall: 0.591
  f1-score: 0.596
  support: 400.0

Label: weighted avg
  precision: 0.677
  recall: 0.637
  f1-score: 0.599
  support: 400.0
Predicted Category for the new document: F


# CART

In [21]:
import os
import glob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
import numpy as np


folder_1 = r"C:\Users\86178\Desktop\TF-IDF基线模型评估语料\1"
folder_2 = r"C:\Users\86178\Desktop\TF-IDF基线模型评估语料\2"


def load_documents_from_folder(folder_path):
    documents = []
    labels = []
    for txt_file in glob.glob(os.path.join(folder_path, "*.txt")):
        with open(txt_file, 'r', encoding='utf-8') as file:
            
            content = file.read().strip().replace('\n', ' ')  
            documents.append(content)
           
            label = os.path.basename(txt_file)[0] 
            labels.append(label)
    return documents, labels


docs_1, labels_1 = load_documents_from_folder(folder_1)
docs_2, labels_2 = load_documents_from_folder(folder_2)


documents = docs_1 + docs_2
labels = labels_1 + labels_2


print(f"Total number of documents: {len(documents)}")  


vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(documents)


print(f"Shape of the TF-IDF matrix: {X.shape}") 


X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.3, random_state=42)


decision_tree_classifier = DecisionTreeClassifier(random_state=42)


decision_tree_classifier.fit(X_train, y_train)


y_pred = decision_tree_classifier.predict(X_test)


report = classification_report(y_test, y_pred, output_dict=True)


for label, metrics in report.items():
    if label != 'accuracy':  
        for metric, value in metrics.items():
            if isinstance(value, float):
                metrics[metric] = round(value, 3)


print("Classification Report (Rounded to 3 decimal places):")
for label, metrics in report.items():
    if label != 'accuracy':
        print(f"\nLabel: {label}")
        for metric, value in metrics.items():
            print(f"  {metric}: {value}")


new_document = ["Python is great for data science"]  
X_new = vectorizer.transform(new_document)
new_pred = decision_tree_classifier.predict(X_new)
print("Predicted Category for the new document:", new_pred[0])


Total number of documents: 1333
Shape of the TF-IDF matrix: (1333, 17101)
Classification Report (Rounded to 3 decimal places):

Label: A
  precision: 0.3
  recall: 0.286
  f1-score: 0.293
  support: 21.0

Label: B
  precision: 0.524
  recall: 0.478
  f1-score: 0.5
  support: 46.0

Label: C
  precision: 0.444
  recall: 0.329
  f1-score: 0.378
  support: 73.0

Label: D
  precision: 0.486
  recall: 0.6
  f1-score: 0.537
  support: 85.0

Label: E
  precision: 0.508
  recall: 0.418
  f1-score: 0.458
  support: 79.0

Label: F
  precision: 0.526
  recall: 0.625
  f1-score: 0.571
  support: 96.0

Label: macro avg
  precision: 0.465
  recall: 0.456
  f1-score: 0.456
  support: 400.0

Label: weighted avg
  precision: 0.487
  recall: 0.49
  f1-score: 0.484
  support: 400.0
Predicted Category for the new document: F


# RF

In [22]:
import os
import glob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import numpy as np


folder_1 = r"C:\Users\86178\Desktop\TF-IDF基线模型评估语料\1"
folder_2 = r"C:\Users\86178\Desktop\TF-IDF基线模型评估语料\2"


def load_documents_from_folder(folder_path):
    documents = []
    labels = []
    for txt_file in glob.glob(os.path.join(folder_path, "*.txt")):
        with open(txt_file, 'r', encoding='utf-8') as file:
         
            content = file.read().strip().replace('\n', ' ')  
            documents.append(content)
           
            label = os.path.basename(txt_file)[0] 
            labels.append(label)
    return documents, labels


docs_1, labels_1 = load_documents_from_folder(folder_1)
docs_2, labels_2 = load_documents_from_folder(folder_2)


documents = docs_1 + docs_2
labels = labels_1 + labels_2


label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)


vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(documents)


X_train, X_test, y_train, y_test = train_test_split(X, labels_encoded, test_size=0.3, random_state=42)


rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)


rf_classifier.fit(X_train, y_train)


y_pred = rf_classifier.predict(X_test)


report = classification_report(y_test, y_pred, output_dict=True)


for label, metrics in report.items():
    if label != 'accuracy':  
        for metric, value in metrics.items():
            if isinstance(value, float):
                metrics[metric] = round(value, 3)


print("Classification Report (Rounded to 3 decimal places):")
for label, metrics in report.items():
    if label != 'accuracy':
        print(f"\nLabel: {label}")
        for metric, value in metrics.items():
            print(f"  {metric}: {value}")


new_document = ["Python is great for data science"] 
X_new = vectorizer.transform(new_document)
new_pred = rf_classifier.predict(X_new)
print("Predicted Category for the new document:", label_encoder.inverse_transform(new_pred)[0])


Classification Report (Rounded to 3 decimal places):

Label: 0
  precision: 0.288
  recall: 0.81
  f1-score: 0.425
  support: 21.0

Label: 1
  precision: 0.542
  recall: 0.696
  f1-score: 0.61
  support: 46.0

Label: 2
  precision: 0.72
  recall: 0.493
  f1-score: 0.585
  support: 73.0

Label: 3
  precision: 0.733
  recall: 0.776
  f1-score: 0.754
  support: 85.0

Label: 4
  precision: 0.75
  recall: 0.19
  f1-score: 0.303
  support: 79.0

Label: 5
  precision: 0.566
  recall: 0.719
  f1-score: 0.633
  support: 96.0

Label: macro avg
  precision: 0.6
  recall: 0.614
  f1-score: 0.552
  support: 400.0

Label: weighted avg
  precision: 0.649
  recall: 0.588
  f1-score: 0.571
  support: 400.0
Predicted Category for the new document: A
