## 数据读取


In [23]:
# 导入必要的库
import pandas as pd
import os
from PIL import Image
import numpy as np
from tqdm import tqdm

In [24]:
# 设置数据路径
BASE_PATH = "memotion_dataset_7k"
IMAGE_PATH = os.path.join(BASE_PATH, "images")
LABELS_PATH = os.path.join(BASE_PATH, "labels.csv")

# 读取标签数据
df_labels = pd.read_csv(LABELS_PATH,index_col=0)

# 读取图像文件
def load_image(image_name):
    try:
        img_path = os.path.join(IMAGE_PATH, image_name)
        img = Image.open(img_path)
        return img
    except Exception as e:
        print(f"Error loading image {image_name}: {str(e)}")
        return None

# 创建图像数据字典
images_dict = {}
for img_name in tqdm(df_labels['image_name'], desc="Loading images"):
    img = load_image(img_name)
    if img is not None:
        images_dict[img_name] = img

print(f"Successfully loaded {len(images_dict)} images")
print(f"Labels shape: {df_labels.shape}")
# 显示标签数据的前几行
print("\nLabels preview:")
print(df_labels.head())


Loading images: 100%|██████████| 6992/6992 [00:01<00:00, 4435.35it/s]

Successfully loaded 6992 images
Labels shape: (6992, 8)

Labels preview:
     image_name                                           text_ocr  \
0   image_1.jpg  LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK...   
1  image_2.jpeg  The best of #10 YearChallenge! Completed in le...   
2   image_3.JPG  Sam Thorne @Strippin ( Follow Follow Saw every...   
3   image_4.png              10 Year Challenge - Sweet Dee Edition   
4   image_5.png  10 YEAR CHALLENGE WITH NO FILTER 47 Hilarious ...   

                                      text_corrected      humour  \
0  LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK...   hilarious   
1  The best of #10 YearChallenge! Completed in le...   not_funny   
2  Sam Thorne @Strippin ( Follow Follow Saw every...  very_funny   
3              10 Year Challenge - Sweet Dee Edition  very_funny   
4  10 YEAR CHALLENGE WITH NO FILTER 47 Hilarious ...   hilarious   

           sarcasm       offensive      motivational overall_sentiment  
0          general   not




### 文本预处理步骤
1. 添加必要的 NLTK 库
2. 下载必要的资源
3. 创建文本预处理函数
4. 应用预处理并进行分类

In [25]:
# 导入额外的预处理库
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import re


In [26]:
print(nltk.data.path) 

['C:\\Users\\86177/nltk_data', 'd:\\Anaconda\\envs\\Hatememes\\nltk_data', 'd:\\Anaconda\\envs\\Hatememes\\share\\nltk_data', 'd:\\Anaconda\\envs\\Hatememes\\lib\\nltk_data', 'C:\\Users\\86177\\AppData\\Roaming\\nltk_data', 'C:\\nltk_data', 'E:\\nltk_data']


In [27]:
# 下载必要的NLTK资源
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to D:\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to D:\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to D:\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to D:\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [28]:
# 下载和验证 NLTK 资源
import nltk

def download_nltk_resources():
    """下载必要的 NLTK 资源并验证"""
    resources = ['punkt', 'stopwords', 'wordnet', 'omw-1.4']
    
    for resource in resources:
        try:
            nltk.data.find(f'tokenizers/{resource}')
            print(f'√ {resource} 已存在')
        except LookupError:
            print(f'下载 {resource} ...')
            nltk.download(resource, download_dir='D:\\nltk_data')
            print(f'√ {resource} 下载完成')

# 执行下载和验证
print("检查和下载 NLTK 资源...")
download_nltk_resources()
print("\n所有必要资源已准备就绪")

检查和下载 NLTK 资源...
下载 punkt ...
√ punkt 下载完成
下载 stopwords ...
√ stopwords 下载完成
下载 wordnet ...
√ wordnet 下载完成
下载 omw-1.4 ...
√ omw-1.4 下载完成

所有必要资源已准备就绪


[nltk_data] Downloading package punkt to D:\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to D:\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to D:\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to D:\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [29]:
# 创建预处理函数
def preprocess_text(text):
    # 转换为小写
    text = text.lower()
    
    # 移除 URL
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # 移除标点符号
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # 分词
    tokens = word_tokenize(text)
    
    # 去除停用词
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # 词形还原
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # 重新组合成文本
    return ' '.join(tokens)

## 仅使用文本进行分类

我将帮你创建一个使用简单机器学习模型进行情感分类的代码。我们将使用 scikit-learn 库来实现这个分类任务。这里我们主要关注 `offensive` 这个情感分类标签。

这段代码实现了以下功能：

1. 使用 TF-IDF 将文本转换为特征向量
2. 对情感标签进行编码
3. 将数据集分为训练集和测试集
4. 使用随机森林分类器进行训练
5. 评估模型性能并输出分类报告
6. 展示最重要的特征词
7. 可视化特征词的权重

In [30]:
# 导入必要的库
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings("ignore")

In [31]:
# 数据预处理
# 使用text_corrected作为特征
X = df_labels['text_corrected'].fillna('').apply(preprocess_text)  # 填充缺失值
y = df_labels['overall_sentiment']

# 标签编码
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# 文本特征提取
vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer.fit_transform(X)

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y_encoded, test_size=0.2, random_state=42
)

# 训练随机森林分类器
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

# 预测和评估
y_pred = rf_classifier.predict(X_test)

# 打印分类报告
print("分类报告：")
print(classification_report(y_test, y_pred, 
                          target_names=le.classes_))

# 查看特征重要性
feature_importance = pd.DataFrame({
    'feature': vectorizer.get_feature_names_out(),
    'importance': rf_classifier.feature_importances_
})
print("\n最重要的10个特征：")
print(feature_importance.nlargest(10, 'importance'))

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\86177/nltk_data'
    - 'd:\\Anaconda\\envs\\Hatememes\\nltk_data'
    - 'd:\\Anaconda\\envs\\Hatememes\\share\\nltk_data'
    - 'd:\\Anaconda\\envs\\Hatememes\\lib\\nltk_data'
    - 'C:\\Users\\86177\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [32]:
y.value_counts()

NameError: name 'y' is not defined