In [1]:
!pip install transformers



In [2]:
pip install transformers[torch]



In [3]:
!pip install -U PyPDF2
!pip install python-docx



In [4]:
import pandas as pd
import numpy as np
import re
from PyPDF2 import PdfReader
import os
import docx

In [5]:
# Functions to read different file types
def read_pdf(file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    The file at ``file_path`` is opened in binary mode and parsed with
    ``PdfReader``. Each page's ``extract_text()`` result is appended directly
    to the previous one; no separator is inserted between pages.
    """
    with open(file_path, "rb") as pdf_file:
        reader = PdfReader(pdf_file)
        # Extract while the file handle is still open.
        page_texts = [page.extract_text() for page in reader.pages]
    return "".join(page_texts)

def read_word(file_path):
    """Return the text of a Word document with one paragraph per line.

    The document is loaded with ``docx.Document``; each entry of its
    ``paragraphs`` contributes its ``text`` followed by a newline, so the
    result ends with a newline whenever the document has at least one
    paragraph.
    """
    document = docx.Document(file_path)
    return "".join(f"{para.text}\n" for para in document.paragraphs)

def read_txt(file_path, encoding="utf-8"):
    """Return the full contents of a text file as a string.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        The decoded file contents.

    Raises:
        UnicodeDecodeError: If the file is not valid in ``encoding``.
    """
    with open(file_path, "r", encoding=encoding) as file:
        return file.read()

def read_documents_from_directory(directory):
    """Concatenate the text of every supported document directly inside a directory.

    Files ending in ``.pdf``, ``.docx`` or ``.txt`` (matched case-insensitively)
    are read with ``read_pdf``, ``read_word`` and ``read_txt`` respectively.
    Everything else, including sub-directories, is skipped. The directory is
    not searched recursively.

    Args:
        directory: Path of the directory to scan.

    Returns:
        The text of all supported files, concatenated in sorted filename order
        with no separator added between files. An empty string if nothing
        matched.
    """
    combined_text = ""
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # resulting corpus identical from run to run.
    for filename in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, filename)
        # A sub-directory named e.g. "archive.txt" must not be opened as a file.
        if not os.path.isfile(file_path):
            continue
        lowered = filename.lower()  # accept ".PDF", ".Docx", ...
        if lowered.endswith(".pdf"):
            combined_text += read_pdf(file_path)
        elif lowered.endswith(".docx"):
            combined_text += read_word(file_path)
        elif lowered.endswith(".txt"):
            combined_text += read_txt(file_path)
    return combined_text


In [9]:
# Build the training corpus from every supported document in the data directory
train_directory = '/content/data/'
text_data = read_documents_from_directory(train_directory)
# Squash each run of consecutive newlines into a single newline ...
text_data = re.sub(r'\n+', '\n', text_data)
# ... then trim leading and trailing whitespace
text_data = text_data.strip()

In [11]:
# Save the training data as a single UTF-8 text file.
# It is written to /content/train.txt, the path the training configuration
# reads from, rather than into /content/data/. Writing it inside the scanned
# directory would make a re-run of the directory scan pick up the generated
# file as one more source document.
with open("/content/train.txt", "w", encoding="utf-8") as f:
    f.write(text_data)

In [12]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments

In [13]:
def load_dataset(file_path, tokenizer, block_size=128):
    """Build a ``TextDataset`` over a text file.

    ``file_path``, ``tokenizer`` and ``block_size`` are forwarded unchanged to
    the ``TextDataset`` constructor, and the resulting dataset is returned.
    """
    return TextDataset(
        file_path=file_path,
        tokenizer=tokenizer,
        block_size=block_size,
    )

In [14]:
def load_data_collator(tokenizer, mlm=False):
    """Build a ``DataCollatorForLanguageModeling`` for the given tokenizer.

    ``mlm`` is forwarded to the collator unchanged and defaults to ``False``.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)

In [15]:
def train(train_file_path, model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps):
    """Fine-tune a pretrained GPT-2 language model on a plain-text file.

    Args:
        train_file_path: Path of the text file passed to ``load_dataset``.
        model_name: Name or path given to ``from_pretrained`` for both the
            tokenizer and the model.
        output_dir: Directory that receives the tokenizer files, the initial
            model weights, and the trainer's output.
        overwrite_output_dir: Forwarded to ``TrainingArguments``.
        per_device_train_batch_size: Forwarded to ``TrainingArguments``.
        num_train_epochs: Forwarded to ``TrainingArguments``.
        save_steps: Forwarded to ``TrainingArguments`` as the checkpoint
            interval. Previously this argument was accepted but ignored, so
            the library default interval was used instead.

    Returns:
        None. The results are the files written under ``output_dir``.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)

    # The Trainer below is not given the tokenizer, so it is saved explicitly
    # here; inference later loads the tokenizer from this same directory.
    tokenizer.save_pretrained(output_dir)

    model = GPT2LMHeadModel.from_pretrained(model_name)

    # Writes the not-yet-fine-tuned weights; trainer.save_model() at the end
    # saves the model again after training.
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        save_steps=save_steps,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model()

In [20]:
# Training configuration, passed by name to the train(...) call.
model_name = 'gpt2'
train_file_path = "/content/train.txt"
output_dir = '/content/Models/'
overwrite_output_dir = False
num_train_epochs = 50.0
per_device_train_batch_size = 8
save_steps = 50000

In [21]:
# Run fine-tuning with the configuration values defined in the notebook
train(
    model_name=model_name,
    train_file_path=train_file_path,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    save_steps=save_steps,
)



Step,Training Loss


Inference

In [18]:
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, GPT2TokenizerFast, GPT2Tokenizer

In [19]:
def load_model(model_path):
    """Load a ``GPT2LMHeadModel`` from ``model_path`` via ``from_pretrained``."""
    return GPT2LMHeadModel.from_pretrained(model_path)


def load_tokenizer(tokenizer_path):
    """Load a ``GPT2Tokenizer`` from ``tokenizer_path`` via ``from_pretrained``."""
    return GPT2Tokenizer.from_pretrained(tokenizer_path)

def generate_text(model_path, sequence, max_length):
    """Sample a continuation of ``sequence`` and print it.

    The model and the tokenizer are both loaded from ``model_path`` on every
    call. The prompt is encoded to PyTorch tensors and passed to
    ``model.generate`` with sampling enabled (``top_k=50``, ``top_p=0.95``),
    ``max_length`` as the length limit, and the model's EOS token id as the
    pad token id. The first generated sequence is decoded with special tokens
    skipped and printed.

    Returns:
        None. The generated text is only printed.
    """
    lm = load_model(model_path)
    tok = load_tokenizer(model_path)

    prompt_ids = tok.encode(f'{sequence}', return_tensors='pt')
    sampling_options = dict(
        do_sample=True,
        max_length=max_length,
        pad_token_id=lm.config.eos_token_id,
        top_k=50,
        top_p=0.95,
    )
    generated = lm.generate(prompt_ids, **sampling_options)

    first_sequence = generated[0]
    print(tok.decode(first_sequence, skip_special_tokens=True))

The model used below was fine-tuned on 100 question-and-answer pairs based on the original text; training took only a few seconds (50 epochs). It gives very meaningful results.

In [41]:
# Prompt: role of genetics in heart health
sequence2 = "[Q] Provide information about the role of genetics in heart health"
max_len = 40
model2_path = "/content/Models/"
generate_text(model_path=model2_path, sequence=sequence2, max_length=max_len)

[Q] Provide information about the role of genetics in heart health?
[A] Genetics can influence the risk of heart disease; individuals with a family history may have a higher predisposition.



In [38]:
# Prompt: risk factors for cardiovascular diseases
sequence2 = "[Q] Explain the risk factors associated with cardiovascular diseases?"
max_len = 40
model2_path = "/content/Models/"
generate_text(model_path=model2_path, sequence=sequence2, max_length=max_len)

[Q] Explain the risk factors associated with cardiovascular diseases?
[A] Factors such as genetics, diet, exercise, and stress can contribute to cardiovascular diseases.
[


In [40]:
# Prompt: symptoms of a heart attack
sequence2 = "[Q] What are the symptoms of a heart attack?"
max_len = 40
model2_path = "/content/Models/"
generate_text(model_path=model2_path, sequence=sequence2, max_length=max_len)

[Q] What are the symptoms of a heart attack?
[A] Symptoms of a heart attack may include chest pain, fatigue, swelling in the legs, or difficulty breathing.
[


In [58]:
# Prompt: gum disease and heart health
sequence2 = "[Q] What is the connection between gum disease and heart health?"
max_len = 40
model2_path = "/content/Models/"
generate_text(model_path=model2_path, sequence=sequence2, max_length=max_len)

[Q] What is the connection between gum disease and heart health?
[A] Gum disease has been linked to an increased risk of heart disease; maintaining good oral hygiene may contribute to cardiovascular health.



In [60]:
# Prompt: pacemakers and heart rhythm disorders
sequence2 = "[Q] How is a pacemaker used in treating heart rhythm disorders?"
max_len = 50
model2_path = "/content/Models/"
generate_text(model_path=model2_path, sequence=sequence2, max_length=max_len)

[Q] How is a pacemaker used in treating heart rhythm disorders?
[A] A pacemaker is a device implanted in the chest to regulate heart rhythm by emitting electrical impulses to control heartbeat when the natural electrical system is faulty.
[


In [27]:
# Prompt: diabetes and heart conditions
sequence2 = "[Q] How does diabetes contribute to the development of heart conditions?"
max_len = 50
model2_path = "/content/Models/"
generate_text(model_path=model2_path, sequence=sequence2, max_length=max_len)

[Q] How does diabetes contribute to the development of heart conditions?
[A] Diabetes can damage blood vessels, increase the risk of heart disease, and increase the risk of cardiovascular events; managing blood sugar levels is important for heart health.



In [28]:
# Prompt: anticoagulant medications in cardiovascular treatment
sequence2 = "[Q] Describe the role of anticoagulant medications in cardiovascular treatment?"
max_len = 50
model2_path = "/content/Models/"
generate_text(model_path=model2_path, sequence=sequence2, max_length=max_len)

[Q] Describe the role of anticoagulant medications in cardiovascular treatment?
[A] Anticoagulants help prevent blood clots, reducing the risk of stroke and other complications in individuals with certain heart conditions.
[


In [64]:
# Prompt: secondhand smoke and heart health
sequence2 = "[Q] What is the impact of secondhand smoke on heart health?"
max_len = 50
model2_path = "/content/Models/"
generate_text(model_path=model2_path, sequence=sequence2, max_length=max_len)

[Q] What is the impact of secondhand smoke on heart health?
[A] Exposure to secondhand smoke increases the risk of heart disease by affecting blood vessels and promoting the development of atherosclerosis.
[


In [69]:
# Prompt: long-term air pollution exposure and heart health
sequence2 = "[Q] How does long-term exposure to air pollution impact heart health?"
max_len = 50
model2_path = "/content/Models/"
generate_text(model_path=model2_path, sequence=sequence2, max_length=max_len)

[Q] How does long-term exposure to air pollution impact heart health?
[A] Long-term exposure to air pollution is associated with an increased risk of heart disease, as pollutants can have detrimental effects on the cardiovascular system.
