In [1]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import OneHotEncoder

In [2]:
import warnings
# Install a blanket "ignore" filter: no category or module is given, so every
# warning raised after this cell runs is suppressed (library deprecation
# warnings included). Remove or narrow this when debugging.
warnings.filterwarnings("ignore")

In [3]:
# Seed PyTorch's global random number generator with a fixed value.
torch.manual_seed(42)
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
def load_txt_data(folder_path):
    """Read every ``.txt`` file located directly inside ``folder_path``.

    Files are visited in sorted filename order. ``os.listdir`` makes no
    promise about ordering, so without the sort the returned list (and
    everything derived from its order) could differ between machines or
    runs even with a fixed random seed.

    Args:
        folder_path: Directory to scan. Sub-directories are not descended into.

    Returns:
        list[str]: The UTF-8 decoded content of each ``.txt`` file, ordered
        by filename. Empty when the directory holds no ``.txt`` files.
    """
    texts = []
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.txt'):
            continue
        file_path = os.path.join(folder_path, filename)
        # A directory may also be named "*.txt"; opening it would raise, so skip it.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as file:
            texts.append(file.read())
    return texts

In [5]:
def load_csv_data(csv_file_path):
    """Load a CSV file and return its ``content`` column as a Python list.

    Args:
        csv_file_path: Path of the CSV file; it must contain a ``content`` column.

    Returns:
        list: The values of the ``content`` column, in file order.
    """
    content_column = pd.read_csv(csv_file_path)['content']
    return content_column.tolist()

In [6]:
class TextDataset(Dataset):
    """Dataset that one-hot encodes each whole text as a single category.

    The texts are fed to ``OneHotEncoder`` as one column, so every distinct
    string becomes its own category and each sample is a float vector whose
    length equals the number of distinct texts.

    NOTE(review): this encodes *which* text a sample is, not the words inside
    it -- confirm that this is the intended representation.

    Args:
        texts: Sequence of strings to encode.
    """

    def __init__(self, texts):
        self.texts = texts
        # scikit-learn 1.2 renamed ``sparse`` to ``sparse_output`` and 1.4
        # removed the old keyword; try the new name first and fall back so the
        # class works on both old and new releases.
        try:
            self.encoder = OneHotEncoder(sparse_output=False)
        except TypeError:
            self.encoder = OneHotEncoder(sparse=False)
        # Reshape to (n_samples, 1): the encoder expects a 2-D array with one
        # column per feature.
        self.encoded_texts = self.encoder.fit_transform(np.array(texts).reshape(-1, 1))

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the one-hot row for sample ``idx`` as a float32 tensor."""
        return torch.tensor(self.encoded_texts[idx], dtype=torch.float32)

In [7]:
# Directory of plain-text stories; each .txt file inside it is read as one text.
folder_path = '/kaggle/input/sherlock-holmes-collection/collection of sherlock holmes/sherlock' 
txt_stories = load_txt_data(folder_path)

# CSV file whose 'content' column supplies additional texts.
csv_file_path = '/kaggle/input/sherlock-holmes-collection/collection of sherlock holmes/holmes.csv'   
csv_stories = load_csv_data(csv_file_path)

In [8]:
# Single corpus: the .txt texts first, followed by the CSV texts.
all_stories = txt_stories + csv_stories

In [9]:
# Encode the combined corpus, then serve it in reshuffled mini-batches of 2 samples.
dataset = TextDataset(all_stories)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

In [10]:
class Generator(nn.Module):
    """Small MLP that maps a noise vector to a generated sample.

    Architecture: Linear(input_dim -> 256) -> LeakyReLU(0.2) ->
    Linear(256 -> output_dim) -> Tanh, so every output value lies in [-1, 1].

    Args:
        input_dim: Size of the input (noise) vector.
        output_dim: Size of the generated vector.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        hidden_units = 256
        layers = [
            nn.Linear(input_dim, hidden_units),
            nn.LeakyReLU(0.2),
            nn.Linear(hidden_units, output_dim),
            nn.Tanh(),
        ]
        # The attribute stays named `fc` so state-dict keys remain `fc.<idx>.*`.
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` of shape (batch, input_dim) through the network."""
        generated = self.fc(x)
        return generated

In [11]:
class Discriminator(nn.Module):
    """Small MLP that scores a sample with a single raw logit.

    Architecture: Linear(input_dim -> 256) -> LeakyReLU(0.2) -> Dropout(0.3) ->
    Linear(256 -> 1). No sigmoid is applied; the output is an unbounded logit.

    Args:
        input_dim: Size of each input sample vector.
    """

    def __init__(self, input_dim):
        super().__init__()
        hidden_units = 256
        layers = [
            nn.Linear(input_dim, hidden_units),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.3),
            nn.Linear(hidden_units, 1),
        ]
        # The attribute stays named `fc` so state-dict keys remain `fc.<idx>.*`.
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        """Return one logit per row of ``x`` (shape (batch, 1))."""
        logits = self.fc(x)
        return logits

In [12]:
# Binary cross-entropy computed on raw logits (the sigmoid is applied inside the
# loss), which matches the Discriminator ending in a plain Linear layer.
criterion = nn.BCEWithLogitsLoss()

In [13]:
# Size of the noise vector fed to the generator.
input_dim = 10
# Length of one encoded sample; used as the generator's output size and the
# discriminator's input size.
output_dim = len(dataset[0])

In [14]:
# Build both networks and move their parameters to the selected device.
generator = Generator(input_dim, output_dim).to(device)
discriminator = Discriminator(output_dim).to(device)

In [15]:
# One Adam optimizer per network, each updating only its own parameters,
# both with learning rate 2e-4.
optimizer_g = optim.Adam(generator.parameters(), lr=0.0002)
optimizer_d = optim.Adam(discriminator.parameters(), lr=0.0002) 

In [16]:
# Number of full passes over the dataloader performed by the training loop.
num_epochs = 5000

In [17]:
# Adversarial training: for every batch, first update the discriminator on
# real and generated samples, then update the generator against it.
for epoch in range(num_epochs):
    for real_data in dataloader:
        real_data = real_data.to(device)
        # The last batch may be smaller than the DataLoader's batch size, so
        # read the actual size from the tensor.
        batch_size = real_data.size(0)

        # ---- Discriminator step ----
        optimizer_d.zero_grad()
        noise = torch.randn(batch_size, input_dim, device=device)
        fake_data = generator(noise)

        # Targets: real samples use a smoothed label of 0.9 instead of 1.0,
        # generated samples use 0.
        label_real = torch.full((batch_size, 1), 0.9, device=device) 
        label_fake = torch.zeros(batch_size, 1, device=device)

        output_real = discriminator(real_data)
        loss_real = criterion(output_real, label_real)
        
        # detach() cuts the graph so this loss sends no gradient into the generator.
        output_fake = discriminator(fake_data.detach())
        loss_fake = criterion(output_fake, label_fake)

        loss_d = loss_real + loss_fake
        loss_d.backward()
        optimizer_d.step()

        # ---- Generator step ----
        optimizer_g.zero_grad()
        # Score the same fakes again, this time without detach, so gradients
        # flow back through the (just updated) discriminator into the generator.
        output = discriminator(fake_data)
        # The generator is trained toward the "real" target.
        # NOTE(review): this reuses the smoothed 0.9 label rather than 1.0 -- confirm intended.
        loss_g = criterion(output, label_real) 
        # This backward pass also accumulates gradients in the discriminator's
        # parameters; they are cleared by optimizer_d.zero_grad() on the next batch.
        loss_g.backward()
        optimizer_g.step()

    # Log every 100th epoch. loss_d / loss_g hold the values from the last
    # batch of the epoch only, not an epoch average.
    if epoch % 100 == 0:
        print(f'Epoch [{epoch}/{num_epochs}], Loss D: {loss_d.item():.4f}, Loss G: {loss_g.item():.4f}')


Epoch [0/5000], Loss D: 1.3667, Loss G: 0.7412
Epoch [100/5000], Loss D: 1.3865, Loss G: 0.7726
Epoch [200/5000], Loss D: 1.3497, Loss G: 0.7604
Epoch [300/5000], Loss D: 1.2655, Loss G: 0.8206
Epoch [400/5000], Loss D: 1.0815, Loss G: 0.9601
Epoch [500/5000], Loss D: 0.9068, Loss G: 1.2248
Epoch [600/5000], Loss D: 0.7803, Loss G: 1.4657
Epoch [700/5000], Loss D: 0.4472, Loss G: 1.9692
Epoch [800/5000], Loss D: 0.4562, Loss G: 1.8092
Epoch [900/5000], Loss D: 0.4309, Loss G: 3.0152
Epoch [1000/5000], Loss D: 0.4465, Loss G: 2.3650
Epoch [1100/5000], Loss D: 0.4319, Loss G: 2.5423
Epoch [1200/5000], Loss D: 0.4179, Loss G: 3.3025
Epoch [1300/5000], Loss D: 0.5177, Loss G: 2.8169
Epoch [1400/5000], Loss D: 0.3747, Loss G: 3.1298
Epoch [1500/5000], Loss D: 0.4305, Loss G: 2.9701
Epoch [1600/5000], Loss D: 0.6292, Loss G: 2.5682
Epoch [1700/5000], Loss D: 0.4754, Loss G: 2.5351
Epoch [1800/5000], Loss D: 0.4218, Loss G: 2.6680
Epoch [1900/5000], Loss D: 0.4189, Loss G: 3.0865
Epoch [2000/