In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM, AdamW

# Define your custom dataset
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes raw text lines to fixed-length tensors.

    Each item is a dict with 1-D ``input_ids`` and ``attention_mask`` tensors of
    length ``max_length`` (padded or truncated), ready for default collation by
    a ``DataLoader``.

    Args:
        texts: Indexable sequence of strings, one training example per entry.
        model_name: Hub id or local path of the tokenizer to load when
            ``tokenizer`` is not supplied.
        max_length: Fixed sequence length every example is padded/truncated to.
        tokenizer: Optional already-loaded tokenizer; when given, nothing is
            downloaded.
        force_download: Forwarded to ``AutoTokenizer.from_pretrained``. Off by
            default so the local cache is reused instead of re-fetching the
            tokenizer files on every construction.
    """

    def __init__(self, texts, model_name="philippelaban/keep_it_simple",
                 max_length=128, tokenizer=None, force_download=False):
        self.texts = texts
        self.max_length = max_length
        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained(model_name, force_download=force_download)
        self.tokenizer = tokenizer
        # padding="max_length" raises if the tokenizer has no pad token, which is
        # common for GPT-2 style vocabularies; fall back to EOS in that case.
        if getattr(self.tokenizer, "pad_token", None) is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

    def __len__(self):
        """Return the number of text examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize ``self.texts[idx]`` and return its tensors without the batch axis."""
        text = self.texts[idx]
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            padding="max_length",
            max_length=self.max_length,
            truncation=True,
            return_tensors="pt"
        )
        # return_tensors="pt" yields shape (1, max_length). Drop only that leading
        # axis: a bare squeeze() would also collapse the sequence axis when
        # max_length == 1.
        input_ids = inputs["input_ids"].squeeze(0)
        attention_mask = inputs["attention_mask"].squeeze(0)
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask
        }

# Custom training loop
def train(model, dataset, optimizer, device, batch_size=8, epochs=1):
    """Fine-tune ``model`` on ``dataset`` with a plain causal-LM objective.

    Args:
        model: Module whose forward accepts ``input_ids``, ``attention_mask``
            and ``labels`` and returns an object exposing ``.loss``.
        dataset: Map-style dataset yielding dicts with ``input_ids`` and
            ``attention_mask`` tensors of equal length.
        optimizer: Optimizer already bound to ``model``'s parameters.
        device: Device the batches are moved to; the caller is responsible for
            moving the model there.
        batch_size: Examples per optimisation step.
        epochs: Number of full passes over ``dataset``.

    Returns:
        Mean loss over all optimizer steps taken, or ``None`` if no step ran
        (empty dataset or every batch skipped).
    """
    # DataLoader(shuffle=True) can reject an empty dataset; there is nothing to do anyway.
    if len(dataset) == 0:
        return None

    model.train()
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    total_loss = 0.0
    steps = 0
    for _ in range(epochs):
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            # Exclude padded positions from the loss: -100 is the ignore index
            # used by the cross-entropy loss, so the model is not trained to
            # predict padding.
            labels = input_ids.clone()
            labels[attention_mask == 0] = -100

            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            # A batch with no unmasked targets yields a NaN loss; skip it rather
            # than let it poison the weights.
            if not torch.isfinite(loss):
                continue

            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            steps += 1

    return total_loss / steps if steps else None

# Prepare your custom dataset
def _read_first_lines(path, limit=1000):
    """Return up to ``limit`` leading lines of ``path``, stripped, skipping blank ones."""
    with open(path, 'r', encoding='utf-8') as file:
        # zip() stops once range(limit) is exhausted, so only the first `limit`
        # lines are read instead of loading the whole corpus into memory.
        head = [line for _, line in zip(range(limit), file)]
    # Drop the trailing newline each raw line carries and discard empty lines,
    # which would otherwise become examples made of padding only.
    return [line.strip() for line in head if line.strip()]


# Prepare your custom dataset: the first 1000 lines of each corpus file
custom_texts = _read_first_lines(r"wiki.full.aner.ori.train.dst")
custom_texts += _read_first_lines(r"tune.8turkers.tok.turk.4")

dataset = CustomDataset(custom_texts)

# Instantiate the model and optimizer
model = AutoModelForCausalLM.from_pretrained("philippelaban/keep_it_simple")
optimizer = AdamW(model.parameters(), lr=1e-5)

# Set up GPU training if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Train the model
train(model, dataset, optimizer, device)

# Save the trained model
# Raw weights, kept for anything that already consumes this file.
torch.save(model.state_dict(), "trained_model.pt")
# A state_dict alone cannot be reloaded with from_pretrained(); also write the
# weights + config and the tokenizer files as a loadable model directory.
model.save_pretrained("trained_model")
dataset.tokenizer.save_pretrained("trained_model")

Downloading (…)okenizer_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)okenizer_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/108 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]



In [6]:
import pandas as pd

# Read the SimpleText task 3 training file into a DataFrame and leave it as the
# cell's last expression so the notebook renders a preview of it.
data = pd.read_json(path_or_buf='simpletext_task3_train (1) (1) (1) (1).json')
data

Unnamed: 0,query_id,query_text,doc_id,snt_id,source_snt
0,G11.1,drones,2892036907,G11.1_2892036907_1,"In the modern era of automation and robotics, ..."
1,G11.1,drones,2892036907,G11.1_2892036907_2,With the ever increasing number of unmanned ae...
2,G11.1,drones,2892036907,G11.1_2892036907_3,Due to guidelines set by the governments regar...
3,G11.1,drones,2892036907,G11.1_2892036907_4,In an attempt to achieve the above mentioned t...
4,G11.1,drones,2892036907,G11.1_2892036907_5,Derived from the classic image classification ...
...,...,...,...,...,...
643,M9,Mechanisms of Muscle Hypertrophy,41,M9_41_3,Bodybuilders generally train with moderate loa...
644,M9,Mechanisms of Muscle Hypertrophy,41,M9_41_4,"Powerlifters, on the other hand, routinely tra..."
645,M9,Mechanisms of Muscle Hypertrophy,41,M9_41_5,Although both groups are known to display impr...
646,M9,Mechanisms of Muscle Hypertrophy,41,M9_41_6,It has been shown that many factors mediate th...


In [8]:
# !git clone https://github.com/feralvam/easse.git
!cd easse
!pip install -e .

Obtaining file:///C:/python/projects/Upwork-Projects/Liva/Simplification


ERROR: file:///C:/python/projects/Upwork-Projects/Liva/Simplification does not appear to be a Python project: neither 'setup.py' nor 'pyproject.toml' found.


In [1]:
from easse.sari import corpus_sari

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the model
model = AutoModelForCausalLM.from_pretrained("path/to/trained_model_directory")
tokenizer = AutoTokenizer.from_pretrained("path/to/trained_model_directory")

# Run on GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # disable dropout for deterministic inference

# Example input text
input_text = "Input text goes here."

# Tokenize the input text.
# No padding here: right-padding the prompt to 128 tokens would fill the whole
# generation budget and make a decoder-only model continue from pad tokens.
inputs = tokenizer.encode_plus(
    input_text,
    add_special_tokens=True,
    max_length=128,
    truncation=True,
    return_tensors="pt"
)
# Keep the leading batch axis: generate() expects (batch, sequence) tensors.
input_ids = inputs["input_ids"].to(device)
attention_mask = inputs["attention_mask"].to(device)

# Some causal-LM tokenizers define no pad token; fall back to EOS so generate()
# has an explicit id to pad finished sequences with.
pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id

# Generate output from the model.
# max_new_tokens bounds only the continuation, independent of prompt length.
with torch.no_grad():
    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=128,
        pad_token_id=pad_token_id
    )

# Decode the generated output (prompt followed by the continuation)
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print the generated output
print("Generated text:", generated_text)