In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
# TfidfVectorizer is defined in sklearn.feature_extraction.text; there is no
# sklearn.feature_selection.text module, so that import fails on a fresh kernel.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Baseline experiment: predict the sender of an email from its body text using
# TF-IDF features and a random forest, restricted to the 100 most frequent senders.

# Read the data
file_path = '/home/n11504439/cab420/420/emails_converted_to_csv_output.csv'
data = pd.read_csv(file_path)

# Rename the 'from_' column (if the CSV has one) to 'from', the name used below.
# Rebinding the result avoids mutating the loaded frame in place.
data = data.rename(columns={'from_': 'from'})

# Get the top 100 senders by email count
top_senders = data['from'].value_counts().head(100).index

# Filter the data to include only emails from these top 100 senders
filtered_data = data[data['from'].isin(top_senders)]

# Stratified 70% / 15% / 15% split: hold out 30% first, then halve the hold-out
# into validation and test. Stratifying on the sender keeps class proportions
# comparable across the three sets; random_state makes the split repeatable.
train_data, temp_data = train_test_split(
    filtered_data, test_size=0.3, stratify=filtered_data['from'], random_state=42
)
val_data, test_data = train_test_split(
    temp_data, test_size=0.5, stratify=temp_data['from'], random_state=42
)

# Prepare feature and target variables.
# Empty bodies are read from CSV as NaN, which TfidfVectorizer rejects, so they
# are replaced with empty strings (a no-op when no body is missing).
X_train = train_data['body'].fillna('')
y_train = train_data['from']
X_val = val_data['body'].fillna('')
y_val = val_data['from']
X_test = test_data['body'].fillna('')
y_test = test_data['from']

# Vectorize the email content using TF-IDF.
# The vocabulary and IDF weights are learned from the training split only and
# then applied unchanged to the validation and test splits.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)
X_test_tfidf = vectorizer.transform(X_test)

# Train a Random Forest model with balanced class weights
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
rf_model.fit(X_train_tfidf, y_train)

# Evaluate the model on the training set
train_predictions = rf_model.predict(X_train_tfidf)
train_accuracy = accuracy_score(y_train, train_predictions)

# Evaluate the model on the validation set
val_predictions = rf_model.predict(X_val_tfidf)
val_accuracy = accuracy_score(y_val, val_predictions)

# Evaluate the model on the test set
test_predictions = rf_model.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_test, test_predictions)

# Output training, validation, and test set accuracies
print("Training Accuracy: ", train_accuracy)
print("Validation Accuracy: ", val_accuracy)
print("Test Accuracy: ", test_accuracy)


Training Accuracy:  0.9844978165938865
Validation Accuracy:  0.6576668364747835
Test Accuracy:  0.6583503054989817


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Tuned experiment: same sender-classification task as a TF-IDF + random forest
# model, but with a larger vocabulary and hyperparameters chosen by grid search.

# Load the email CSV
file_path = '/home/n11504439/cab420/420/emails_converted_to_csv_output.csv'
data = pd.read_csv(file_path)

# Rename the 'from_' column (if the CSV has one) to 'from', the name used below
data = data.rename(columns={'from_': 'from'})

# Keep only emails written by the 100 most frequent senders
top_senders = data['from'].value_counts().head(100).index
filtered_data = data[data['from'].isin(top_senders)]

# Stratified split into train / validation / test (70% / 15% / 15%):
# 30% is held out first and then divided evenly.
train_data, temp_data = train_test_split(
    filtered_data,
    test_size=0.3,
    stratify=filtered_data['from'],
    random_state=42,
)
val_data, test_data = train_test_split(
    temp_data,
    test_size=0.5,
    stratify=temp_data['from'],
    random_state=42,
)

# Features are the email bodies, targets are the senders
X_train, y_train = train_data['body'], train_data['from']
X_val, y_val = val_data['body'], val_data['from']
X_test, y_test = test_data['body'], test_data['from']

# TF-IDF vectorisation with an enlarged vocabulary (max_features raised to 10000).
# NOTE(review): the vectorizer is fitted on the whole training split before
# cross-validation, so every CV fold shares the same vocabulary/IDF statistics.
vectorizer = TfidfVectorizer(max_features=10000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf, X_test_tfidf = (
    vectorizer.transform(texts) for texts in (X_val, X_test)
)

# Base random forest; the grid below supplies the remaining hyperparameters
rf_model = RandomForestClassifier(random_state=42, class_weight='balanced')

# Hyperparameter grid: 3 x 3 x 3 x 3 = 81 candidate settings.
# NOTE(review): max_depth is capped at 30 here, so a forest with unlimited
# depth (max_depth=None) is not among the candidates.
param_grid = dict(
    n_estimators=[100, 200, 300],   # number of trees
    max_depth=[10, 20, 30],         # maximum tree depth
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive search with 3-fold cross-validation on the training split
grid_search = GridSearchCV(rf_model, param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train_tfidf, y_train)

# Best model found by the search
best_rf_model = grid_search.best_estimator_

# Predict on each split with the best model
train_predictions, val_predictions, test_predictions = (
    best_rf_model.predict(features)
    for features in (X_train_tfidf, X_val_tfidf, X_test_tfidf)
)

# Accuracy on each split
train_accuracy, val_accuracy, test_accuracy = (
    accuracy_score(y_true, y_pred)
    for y_true, y_pred in (
        (y_train, train_predictions),
        (y_val, val_predictions),
        (y_test, test_predictions),
    )
)

# Report training, validation, and test accuracy
print("Training Accuracy: ", train_accuracy)
print("Validation Accuracy: ", val_accuracy)
print("Test Accuracy: ", test_accuracy)


Fitting 3 folds for each of 81 candidates, totalling 243 fits
Training Accuracy:  0.8362445414847162
Validation Accuracy:  0.6067244014263882
Test Accuracy:  0.6115071283095723
