# 作业problem1

In [2]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

dataset = pd.read_csv("../Week01/dataset.csv", sep="\t", header=None)
texts = dataset[0].tolist()
string_labels = dataset[1].tolist()

label_to_index = {label: i for i, label in enumerate(set(string_labels))}
numerical_labels = [label_to_index[label] for label in string_labels]

char_to_index = {'<pad>': 0}
for text in texts:
    for char in text:
        if char not in char_to_index:
            char_to_index[char] = len(char_to_index)

index_to_char = {i: char for char, i in char_to_index.items()}
vocab_size = len(char_to_index)

# max length 最大输入的文本长度
max_len = 40

In [21]:
# 自定义数据集 - 》 为每个任务定义单独的数据集的读取方式，这个任务的输入和输出
# 统一的写法，底层pytorch 深度学习 / 大模型
class CharLSTMDataset(Dataset):
    # 初始化
    def __init__(self, texts, labels, char_to_index, max_len):
        self.texts = texts # 文本输入
        self.labels = torch.tensor(labels, dtype=torch.long) # 文本对应的标签
        self.char_to_index = char_to_index # 字符到索引的映射关系
        self.max_len = max_len # 文本最大输入长度

    # 返回数据集样本个数
    def __len__(self):
        return len(self.texts)

    # 获取当个样本
    def __getitem__(self, idx):
        text = self.texts[idx]
        # pad and crop
        indices = [self.char_to_index.get(char, 0) for char in text[:self.max_len]]
        indices += [0] * (self.max_len - len(indices))
        return torch.tensor(indices, dtype=torch.long), self.labels[idx]

# a = CharLSTMDataset()
# len(a) -> a.__len__
# a[0] -> a.__getitem__


# --- NEW LSTM Model Class ---


class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(LSTMClassifier, self).__init__()

        # 词表大小 转换后维度的维度
        self.embedding = nn.Embedding(vocab_size, embedding_dim) # 随机编码的过程， 可训练的
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)  # 循环层
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        # batch size * seq length -》 batch size * seq length * embedding_dim
        embedded = self.embedding(x)

        # batch size * seq length * embedding_dim -》 batch size * seq length * hidden_dim
        lstm_out, (hidden_state, cell_state) = self.lstm(embedded)

        # batch size * output_dim
        out = self.fc(hidden_state.squeeze(0))
        return out
    
class RNNClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(RNNClassifier, self).__init__()

        # 词表大小 转换后维度的维度
        self.embedding = nn.Embedding(vocab_size, embedding_dim) # 随机编码的过程， 可训练的
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)  # 循环层
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        # batch size * seq length -》 batch size * seq length * embedding_dim
        embedded = self.embedding(x)

        # batch size * seq length * embedding_dim -》 batch size * seq length * hidden_dim
        lstm_out ,hidden_state = self.rnn(embedded)

        # batch size * output_dim
        out = self.fc(hidden_state.squeeze(0))
        return out
    
    
class GRUClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(GRUClassifier, self).__init__()

        # 词表大小 转换后维度的维度
        self.embedding = nn.Embedding(vocab_size, embedding_dim) # 随机编码的过程， 可训练的
        self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first=True)  # 循环层
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        # batch size * seq length -》 batch size * seq length * embedding_dim
        embedded = self.embedding(x)

        # batch size * seq length * embedding_dim -》 batch size * seq length * hidden_dim
        lstm_out ,hidden_state = self.gru(embedded)

        # batch size * output_dim
        out = self.fc(hidden_state.squeeze(0))
        return out

In [5]:
# --- Training and Prediction ---
lstm_dataset = CharLSTMDataset(texts, numerical_labels, char_to_index, max_len)
dataloader = DataLoader(lstm_dataset, batch_size=32, shuffle=True)

embedding_dim = 64
hidden_dim = 128
output_dim = len(label_to_index)

model = LSTMClassifier(vocab_size, embedding_dim, hidden_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 4
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for idx, (inputs, labels) in enumerate(dataloader):
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        if idx % 50 == 0:
            print(f"Batch 个数 {idx}, 当前Batch Loss: {loss.item()}")

    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {running_loss / len(dataloader):.4f}")

def classify_text_lstm(text, model, char_to_index, max_len, index_to_label):
    indices = [char_to_index.get(char, 0) for char in text[:max_len]]
    indices += [0] * (max_len - len(indices))
    input_tensor = torch.tensor(indices, dtype=torch.long).unsqueeze(0)

    model.eval()
    with torch.no_grad():
        output = model(input_tensor)

    _, predicted_index = torch.max(output, 1)
    predicted_index = predicted_index.item()
    predicted_label = index_to_label[predicted_index]

    return predicted_label

index_to_label = {i: label for label, i in label_to_index.items()}

new_text = "帮我导航到北京"
predicted_class = classify_text_lstm(new_text, model, char_to_index, max_len, index_to_label)
print(f"输入 '{new_text}' 预测为: '{predicted_class}'")

new_text_2 = "查询明天北京的天气"
predicted_class_2 = classify_text_lstm(new_text_2, model, char_to_index, max_len, index_to_label)
print(f"输入 '{new_text_2}' 预测为: '{predicted_class_2}'")

Batch 个数 0, 当前Batch Loss: 2.510615110397339
Batch 个数 50, 当前Batch Loss: 2.270514488220215
Batch 个数 100, 当前Batch Loss: 2.3900198936462402
Batch 个数 150, 当前Batch Loss: 2.275806188583374
Batch 个数 200, 当前Batch Loss: 2.38065242767334
Batch 个数 250, 当前Batch Loss: 2.436516046524048
Batch 个数 300, 当前Batch Loss: 2.412123441696167
Batch 个数 350, 当前Batch Loss: 2.4185950756073
Epoch [1/4], Loss: 2.3603
Batch 个数 0, 当前Batch Loss: 2.4016170501708984
Batch 个数 50, 当前Batch Loss: 2.3760714530944824
Batch 个数 100, 当前Batch Loss: 2.230572462081909
Batch 个数 150, 当前Batch Loss: 1.7764116525650024
Batch 个数 200, 当前Batch Loss: 1.7029638290405273
Batch 个数 250, 当前Batch Loss: 1.8981480598449707
Batch 个数 300, 当前Batch Loss: 1.4930039644241333
Batch 个数 350, 当前Batch Loss: 1.346110224723816
Epoch [2/4], Loss: 1.8994
Batch 个数 0, 当前Batch Loss: 1.4896399974822998
Batch 个数 50, 当前Batch Loss: 1.2709661722183228
Batch 个数 100, 当前Batch Loss: 1.1809909343719482
Batch 个数 150, 当前Batch Loss: 1.3678020238876343
Batch 个数 200, 当前Batch Loss: 1

In [16]:
# --- RNN ---
lstm_dataset = CharLSTMDataset(texts, numerical_labels, char_to_index, max_len)
dataloader = DataLoader(lstm_dataset, batch_size=32, shuffle=True)

embedding_dim = 64
hidden_dim = 128
output_dim = len(label_to_index)

model = RNNClassifier(vocab_size, embedding_dim, hidden_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 4
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for idx, (inputs, labels) in enumerate(dataloader):
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        if idx % 50 == 0:
            print(f"Batch 个数 {idx}, 当前Batch Loss: {loss.item()}")

    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {running_loss / len(dataloader):.4f}")

def classify_text_lstm(text, model, char_to_index, max_len, index_to_label):
    indices = [char_to_index.get(char, 0) for char in text[:max_len]]
    indices += [0] * (max_len - len(indices))
    input_tensor = torch.tensor(indices, dtype=torch.long).unsqueeze(0)

    model.eval()
    with torch.no_grad():
        output = model(input_tensor)

    _, predicted_index = torch.max(output, 1)
    predicted_index = predicted_index.item()
    predicted_label = index_to_label[predicted_index]

    return predicted_label

index_to_label = {i: label for label, i in label_to_index.items()}

new_text = "帮我导航到北京"
predicted_class = classify_text_lstm(new_text, model, char_to_index, max_len, index_to_label)
print(f"输入 '{new_text}' 预测为: '{predicted_class}'")

new_text_2 = "查询明天北京的天气"
predicted_class_2 = classify_text_lstm(new_text_2, model, char_to_index, max_len, index_to_label)
print(f"输入 '{new_text_2}' 预测为: '{predicted_class_2}'")

Batch 个数 0, 当前Batch Loss: 2.43194580078125
Batch 个数 50, 当前Batch Loss: 2.4570977687835693
Batch 个数 100, 当前Batch Loss: 2.3653411865234375
Batch 个数 150, 当前Batch Loss: 2.313234806060791
Batch 个数 200, 当前Batch Loss: 2.3763866424560547
Batch 个数 250, 当前Batch Loss: 2.311861276626587
Batch 个数 300, 当前Batch Loss: 2.386493444442749
Batch 个数 350, 当前Batch Loss: 2.338043689727783
Epoch [1/4], Loss: 2.3681
Batch 个数 0, 当前Batch Loss: 2.3902225494384766
Batch 个数 50, 当前Batch Loss: 2.270555019378662
Batch 个数 100, 当前Batch Loss: 2.314812660217285
Batch 个数 150, 当前Batch Loss: 2.281960964202881
Batch 个数 200, 当前Batch Loss: 2.2433958053588867
Batch 个数 250, 当前Batch Loss: 2.36765456199646
Batch 个数 300, 当前Batch Loss: 2.425328493118286
Batch 个数 350, 当前Batch Loss: 2.4008092880249023
Epoch [2/4], Loss: 2.3616
Batch 个数 0, 当前Batch Loss: 2.285299062728882
Batch 个数 50, 当前Batch Loss: 2.5932559967041016
Batch 个数 100, 当前Batch Loss: 2.489384174346924
Batch 个数 150, 当前Batch Loss: 2.328425168991089
Batch 个数 200, 当前Batch Loss: 2.40

In [22]:
# --- RNN ---
lstm_dataset = CharLSTMDataset(texts, numerical_labels, char_to_index, max_len)
dataloader = DataLoader(lstm_dataset, batch_size=32, shuffle=True)

embedding_dim = 64
hidden_dim = 128
output_dim = len(label_to_index)

model = GRUClassifier(vocab_size, embedding_dim, hidden_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 4
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for idx, (inputs, labels) in enumerate(dataloader):
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        if idx % 50 == 0:
            print(f"Batch 个数 {idx}, 当前Batch Loss: {loss.item()}")

    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {running_loss / len(dataloader):.4f}")

def classify_text_lstm(text, model, char_to_index, max_len, index_to_label):
    indices = [char_to_index.get(char, 0) for char in text[:max_len]]
    indices += [0] * (max_len - len(indices))
    input_tensor = torch.tensor(indices, dtype=torch.long).unsqueeze(0)

    model.eval()
    with torch.no_grad():
        output = model(input_tensor)

    _, predicted_index = torch.max(output, 1)
    predicted_index = predicted_index.item()
    predicted_label = index_to_label[predicted_index]

    return predicted_label

index_to_label = {i: label for label, i in label_to_index.items()}

new_text = "帮我导航到北京"
predicted_class = classify_text_lstm(new_text, model, char_to_index, max_len, index_to_label)
print(f"输入 '{new_text}' 预测为: '{predicted_class}'")

new_text_2 = "查询明天北京的天气"
predicted_class_2 = classify_text_lstm(new_text_2, model, char_to_index, max_len, index_to_label)
print(f"输入 '{new_text_2}' 预测为: '{predicted_class_2}'")

Batch 个数 0, 当前Batch Loss: 2.458599090576172
Batch 个数 50, 当前Batch Loss: 2.518076181411743
Batch 个数 100, 当前Batch Loss: 2.2380869388580322
Batch 个数 150, 当前Batch Loss: 1.2760915756225586
Batch 个数 200, 当前Batch Loss: 1.3909823894500732
Batch 个数 250, 当前Batch Loss: 0.9436416029930115
Batch 个数 300, 当前Batch Loss: 0.7190680503845215
Batch 个数 350, 当前Batch Loss: 0.7907894849777222
Epoch [1/4], Loss: 1.3507
Batch 个数 0, 当前Batch Loss: 0.39854857325553894
Batch 个数 50, 当前Batch Loss: 0.3317549228668213
Batch 个数 100, 当前Batch Loss: 0.300327867269516
Batch 个数 150, 当前Batch Loss: 0.20964555442333221
Batch 个数 200, 当前Batch Loss: 0.2956169843673706
Batch 个数 250, 当前Batch Loss: 0.4843326210975647
Batch 个数 300, 当前Batch Loss: 0.6049566864967346
Batch 个数 350, 当前Batch Loss: 0.7831342816352844
Epoch [2/4], Loss: 0.4439
Batch 个数 0, 当前Batch Loss: 0.3295494616031647
Batch 个数 50, 当前Batch Loss: 0.40670162439346313
Batch 个数 100, 当前Batch Loss: 0.20665298402309418
Batch 个数 150, 当前Batch Loss: 0.1732630878686905
Batch 个数 200, 当前