In [1]:
import os
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [2]:
# 读取训练数据
def load_train_data(folder_path):
    categories = ['经济', '军事', '科技', '社会', '体育', '文化', '政治']
    data = []
    labels = []
    for category in categories:
        file_path = os.path.join(folder_path, f'{category}train.csv')
        if os.path.exists(file_path):
            df = pd.read_csv(file_path, on_bad_lines='skip')  # 跳过有问题的行
            df['Content'] = df['Content'].fillna('')
            data.extend(df['Content'].tolist())
            labels.extend([category] * len(df))
    return data, labels


In [4]:
# 读取预测数据
def load_predict_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = [line.strip() for line in f.readlines()]
    return data

In [5]:
# 训练模型
def train_model(data, labels):
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(data)
    y = labels
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    model = MultinomialNB()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    print("Validation Accuracy:", accuracy_score(y_val, y_pred))
    print(classification_report(y_val, y_pred))
    return model, vectorizer

In [6]:
# 预测
def predict(model, vectorizer, predict_data):
    X_predict = vectorizer.transform(predict_data)
    y_predict = model.predict(X_predict)
    return y_predict

In [17]:
# 主函数
def main():
    folder_path = r'E:\研一\计算传播\给学生1\第一步分类'  # 替换为实际文件夹路径
    predict_file_path = r'E:\研一\计算传播\给学生1\第一步分类\predict.txt'  # 替换为predict.txt的实际路径

    # 加载数据
    train_data, train_labels = load_train_data(folder_path)
    predict_data = load_predict_data(predict_file_path)

    # 训练模型
    model, vectorizer = train_model(train_data, train_labels)

    # 进行预测
    predictions = predict(model, vectorizer, predict_data)
    for text, label in zip(predict_data, predictions):
        print(f"Text: {text}\nPredicted Label: {label}\n")

if __name__ == "__main__":
    main()

Validation Accuracy: 0.8842430312134869
              precision    recall  f1-score   support

          体育       0.96      0.94      0.95      1504
          军事       0.38      0.01      0.01       661
          政治       0.87      0.92      0.90      1249
          文化       0.85      0.98      0.91      5099
          社会       0.95      0.87      0.91       833
          科技       0.91      0.86      0.88      1429
          经济       0.88      0.91      0.89      1207

    accuracy                           0.88     11982
   macro avg       0.83      0.78      0.78     11982
weighted avg       0.86      0.88      0.86     11982

Text: Area,KeyWords,Name,Verified,Language,Content
Predicted Label: 社会

Text: 经济,high-speed rail china,Cata Paul,t,en, China's High Speed Rail Evolution Courtesy of @Civixplorer
Predicted Label: 经济

Text: 经济,express delivery china,Mem #CaffeCinos,f,en,This guy use to be the delivery driver for China express on William barefoot
Predicted Label: 经济

Text: 经济,high

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

In [19]:
# 训练逻辑回归模型
def train_logistic_regression_model(data, labels):
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(data)
    y = labels
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # 使用逻辑回归模型
    model = LogisticRegression(multi_class='auto', max_iter=1000)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    print("Validation Accuracy:", accuracy_score(y_val, y_pred))
    print(classification_report(y_val, y_pred))
    return model, vectorizer

# 预测
def predict(model, vectorizer, predict_data):
    X_predict = vectorizer.transform(predict_data)
    y_predict = model.predict(X_predict)
    return y_predict

# 主函数
def main():
    folder_path = r'E:\研一\计算传播\给学生1\第一步分类'  # 替换为实际文件夹路径
    predict_file_path = r'E:\研一\计算传播\给学生1\第一步分类\predict.txt'  # 替换为predict.txt的实际路径

    # 加载数据
    train_data, train_labels = load_train_data(folder_path)
    predict_data = load_predict_data(predict_file_path)

    # 训练逻辑回归模型
    model, vectorizer = train_logistic_regression_model(train_data, train_labels)

    # 进行预测
    predictions = predict(model, vectorizer, predict_data)
    for text, label in zip(predict_data, predictions):
        print(f"Text: {text}\nPredicted Label: {label}\n")

if __name__ == "__main__":
    main()

Validation Accuracy: 0.9112001335336338
              precision    recall  f1-score   support

          体育       0.99      0.95      0.97      1504
          军事       0.81      0.04      0.08       661
          政治       0.97      0.96      0.96      1249
          文化       0.85      0.99      0.92      5099
          社会       0.97      0.94      0.96       833
          科技       0.95      0.89      0.92      1429
          经济       0.97      0.96      0.96      1207

    accuracy                           0.91     11982
   macro avg       0.93      0.82      0.82     11982
weighted avg       0.91      0.91      0.89     11982

Text: Area,KeyWords,Name,Verified,Language,Content
Predicted Label: 文化

Text: 经济,high-speed rail china,Cata Paul,t,en, China's High Speed Rail Evolution Courtesy of @Civixplorer
Predicted Label: 经济

Text: 经济,express delivery china,Mem #CaffeCinos,f,en,This guy use to be the delivery driver for China express on William barefoot
Predicted Label: 经济

Text: 经济,high


Text: 文化,traditional Chinese medical science,Chill!oe,f,en,Being barefoot helps to massage the acupoints on ur foot bottom according to the traditional Chinese medical science I know.
Predicted Label: 文化

Text: 文化,traditional Chinese medical science,Breo WoWo,f,en,A view of Hand Acupunctures. WoWo Smart Hand Massager combines technology with traditional Chinese medical science.
Predicted Label: 文化

Text: 文化,traditional Chinese medical science,Try Performance,f,en,NATURAL Performance is based on the theories of traditional Chinese health and medical science, extracting the... http://fb.me/4bKbFyalk
Predicted Label: 文化

Text: 文化,traditional Chinese medical science,Shaila Karim,f,en,How does acupuncture work? - How Acupuncture Works In Traditional Chinese Medical science, the body possess energy... http://is.gd/HSuKEV
Predicted Label: 文化

Text: 文化,traditional Chinese medical science,TubeNews en,f,en,Peruvians marvel at traditional Chinese medical science NEWS &gt;&gt; http://ift.tt/1fBCd