In [1]:
import torch
from transformers import MarianMTModel, MarianTokenizer, AutoModelForSeq2SeqLM, AutoTokenizer, AutoModel
from datasets import load_dataset
import evaluate
from context import summarize_text

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset_name = "Iker/Document-Translation-en-es"

summary_model_name = "google-t5/t5-small"
translation_model_name = "Helsinki-NLP/opus-mt-en-es"
revision = "main"

tokenizer_model_name = "distilbert-base-uncased"
model_name = "Helsinki-NLP/opus-mt-en-es"

device = 0 if torch.cuda.is_available() else -1

# Get Context

In [3]:
text_file = 'document.txt'

summarize_text(text_file, max_length=35)

'document.txt. document.txt. document.txt.'

## Base summariser

In [4]:
model = AutoModelForSeq2SeqLM.from_pretrained(summary_model_name)
tokenizer = AutoTokenizer.from_pretrained(summary_model_name)
text = """
The Hugging Face Transformers library provides state-of-the-art general-purpose architectures
for natural language understanding (NLU) and natural language generation (NLG). These architectures
include BERT, GPT, GPT-2, BART, and T5, which can be applied to text classification, summarization, translation, and more.
"""

inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=1024, truncation=True)
summary_ids = model.generate(inputs, max_length=100, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)

summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print(summary)

the Hugging Face Transformers library provides state-of-the-art architectures for natural language understanding (NLU) and natural language generation (NLG). these architectures include BERT, GPT, GPT-2, and BART.


In [5]:
text = 'Translate the following text from English to Spanish:  FIFA introduced anti-doping controls in 1966, but it wasn\'t until 1974 that the first case occurred in a World Cup, and since then, there have only been three other positive cases. The first one was Haitian player Ernst Jean-Joseph, during the 1974 World Cup in Germany. The red-haired mixed-race midfielder tested positive for ephedrine - he claimed he had taken some asthma pills - after the match against Poland, but he was not sanctioned by FIFA. The punishment was imposed by dictator Jean Claude Duvalier, who two days later demanded his return to the Caribbean country: the Haitian police forcibly removed him from the hotel in front of the international press, put him into a car, and took him to the airport, back to his country to explain himself. Four years later, Scottish player Willy Johnstone tested positive for fencamfamine - a fatigue recoverer - after the match against Peru during the 1978 World Cup in Argentina. The skillful winger didn\'t have to undergo the initial test, but Archie Gemmill, who was supposed to do it, was severely dehydrated. Johnstone tested positive and his excuses ("I couldn\'t have doped because I played the worst game of my international career," he said) were not enough for the Scottish federation, who sent him back to the United Kingdom. A Spaniard, Barcelona\'s Ramón María Calderé, was the protagonist of the third case in World Cup history, testing positive for ephedrine during the 1986 World Cup in Mexico. A syrup he took with medical prescription from FIFA was the cause, so once the player\'s innocence was proven, the sanction was reduced to a one-game suspension and a fine of 25,000 Swiss francs (around 9,000 current euros) for the Spanish Federation, for not administering it within the 72 hours before the match against Northern Ireland, as required. But if there is one anti-doping control that is well remembered, it is that of Diego Armando Maradona, who tested positive for five substances derived from ephedrine, after the match against Nigeria during the 1994 World Cup in the United States. Expelled from the World Cup and suspended for 15 months, that was the last game for the "Pelusa" with the Albiceleste.', 'Translate the following text from English to Spanish:  The Valencian Institute of Modern Art (IVAM) will exhibit works from today until February 23 by authors from its collection together with those of young Valencian artists, in a show called "Sustratos" (Substrates). Its title refers to the interest in linking the overlap of different artistic stages that take place in the city of Valencia, as explained in a statement by the museum, as well as to the place where this exhibition takes place. Its location is the Muralla room, from where "the IVAM links its own modernity with the tradition of its host city," as it preserves the ancient fortress.\n\nThe exhibition is multidisciplinary and consists of productions by 20 artists under 35 years old who will interact with works by artists who, due to their recognized trajectory, are already present in the IVAM collection. Among the established artists are Eduardo Arroyo, Carmen Calvo, Miquel Navarro, Alberto Corazón, Sanleón, and Teresa Cháfer. Among the emerging artists, we can mention Señor Cifrián, Andrea Gussi, Paola Ruiz Moltó, or Marcos Juncal. In total, 42 creators.\n\nThey claim that "Sustratos" carries out a "new relationship of works, gathered under the simple principle of matching some with others, in encounters that sometimes are obvious and sometimes surprising." "When I delve into the exhibition," points out the curator of the show, Nilo Casares, "I do it based on my circumstance as a resident in Valencia who is not from here, a status that allows me to pay attention to the plurality of origins that all artistic manifestations have, but also the artists themselves."\n\n"In Sustratos," the curator adds, "the challenge is to bring to the Muralla room the possible stories contained in Art History, the IVAM collection, and its development through the latest artistic productions linked to the Valencian Community, which without the IVAM would be different in the artistic field."', 'Translate the following text from English to Spanish:  There is still one matchday left for the end of the Second Division League, and except for a spot for promotion and the specific order for the playoffs, everything is decided. If a few days ago it was Unzue, coach of Numancia, who said he would change teams next season, today three other coaches have confirmed that they will not continue in the same benches next year. They are Javi López from Xerez, Onésimo Sánchez from Huesca, and Lucas Alcaráz from Córdoba, all for different reasons and with the season\'s goals accomplished. One of the most surprising cases is that of Javi López, whose team is the only one of the three still playing for something. Xerez has reached the final matchday with possibilities of getting into the fight for promotion and taking the seventh place from Valladolid, something that will happen if they beat Elche and Valladolid lose to Alcorcón. López stated in a press conference that he informed the club of his decision two months ago and has complained that since then the atmosphere has been "weird." \n\nOnésimo\'s case was also unexpected; he announced it after securing Huesca\'s safety by breaking the points record in the Second Division. "It\'s a very deliberate decision. I think a cycle has ended, and both parties should be happy with what we have achieved," commented the coach; "everything seemed to point towards another year, but in the end, I wasn\'t clear about it." In their final match, Huesca will visit a struggling Albacete.\n\nLucas Alcaraz has explained the reasons for his departure more thoroughly. The Córdoba coach has justified that the club, currently in insolvency proceedings, will soon change ownership, and his bond was with the current owner. "I think, from my side, the commitment to bring the ship safely to port has been fulfilled, and given the change in circumstances, I wanted to announce my goodbye," he stated to the media. Córdoba had an irregular season; halfway through, it seemed they would be fighting at the top, but they had to wait until the last matchdays to secure their stay in the league. On Saturday, they face Girona, who announced today that they are negotiating to renew their coach, Raúl Agné. You can follow EL PAÍS Sports on Facebook, Twitter, or subscribe to the Newsletter here.', 'Translate the following text from English to Spanish:  Radio Valencia Cadena SER has gathered in a debate the five spokespeople of the municipal groups to analyze how the first year of the new local government has been. The opposition has provided a negative assessment, accusing the tripartite of not knowing how to manage and of dividing the citizens. The local government, on the other hand, has emphasized the achievements obtained in this first year of government and has responded to the accusations of the opposition, specifically the PP, recalling the delicate political situation of its councillors. Representing the PP, Eusebio Monzó has participated, pointing out that some of the new government\'s achievements have been possible thanks to the inheritance left by the PP, with a surplus in the municipal accounts. Monzó has criticized the local government for supporting the dismantling of the single school district, for "ruining" the school vouchers, and for raising property taxes. He has reproached the local government for the partisan use of the balcony. Monzó, who has apologized for corruption, has demanded respect for the traditions and identity of the Valencians. Meanwhile, Fernando Giner from Ciudadanos has emphasized the missed opportunities in this year. He cites the strategic tourism plan, which will not be approved until after the summer, and the impatience growing in El Cabanyal. He also warns about the legal uncertainty caused by the reversal of tourist influx zones. Giner accuses the local government of not being united, wasting time on banners, prioritizing populism, and operating, he said, "blinded by Ribó\'s ideology." The local government presents a positive balance. Jordi Peris for València en Comú clarifies that there is indeed dialogue within the government, and their way of working is being imitated nationally, hence the term "Valencian-style government". Among the challenges are building more social housing and betting on innovation. He highlights the social shift in municipal policies, with social clauses in contracts to support citizens\' rights, and new forms of collaboration with neighbors. Peris acknowledges that changes are slow, but they are happening. Joan Calabuig from PSPV has defended the property tax increase because "those who have more were not the ones paying the most." He states that, despite the criticisms, the city is functioning and that suppliers are being paid within 14 days. Regarding the freedom of commercial hours and school choice demanded by the PP, Calabuig reminds that "it was the PP that benefited from CIEGSA" and that it arbitrarily created four tourist influx zones. Regarding future challenges, he says generating more employment should be the priority. On behalf of Compromís, Pere Fuset has stated that their priority has been the rescue of people and that Valencia is an anti-eviction city thanks to the local government. He criticized the fact that the city\'s image has been tarnished by corruption and that "the PP uses faith and festivities as elements of tension". Regarding future challenges, he calls for the underground tunnel and highlights mobility as a change towards a friendlier city. He requests the collaboration of all Valencians. The debate was moderated by the Chief Editor of Radio Valencia, Julián Giménez, and was recorded in the main studio of TAU, the Audiovisual Workshop of the Universitat de València.'

In [8]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Choose the specific Falcon-Instruct model. For example, "tiiuae/falcon-7b-instruct"
# model_name = "EleutherAI/gpt-neo-1.3B"
model_name = "google/flan-t5-small"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

inputs = tokenizer(text, return_tensors="pt", padding=True)
output = model.generate(**inputs, max_length=500)

generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)

Token indices sequence length is longer than the specified maximum sequence length for this model (773 > 512). Running this sequence through the model will result in indexing errors


La primera vez era el primer caso de la enfermedad de la enfermedad de la UEFA, pero no era a pasar en 1966, pero no hay tres casos positivas. El primero era el jugador de la enfermedad en el UEFA, durante el ao 1974, y en el pasado de 1974, y en el pasado de 1974, hay sólo tres casos positivas. El primero era el jugador de la enfermedad mixed-race enfermedad en el UEFA, durante el jugador de la UEFA, durante el jugador de la UEFA, durante el jugador de la UEFA, durante el jugador de la UEFA, durante el jugador de la UEFA, durante el jugador de la UEFA, durante el jugador de la UEFA, durante el jugador de la UEFA, durante el jugador de la UEFA, durante el jugador de la UEFA, durante el jugador de la UEFA, durante el jugador de la UEFA, durante el jugador de la UEFA, durante el jugador de la UEFA, durante el jugador de la UEFA, durante el jugador de la UEFA, durante el jugador de la UEFA, durante el jugador de la UEFA, durante el jugador de la UEFA, durante el ju


## Baseline translator

In [8]:
tokenizer = MarianTokenizer.from_pretrained(translation_model_name)
model = MarianMTModel.from_pretrained(translation_model_name)
english_text = "What color is the hrass? Answer in english"
input_ids = tokenizer.encode(english_text, return_tensors="pt")
translated_ids = model.generate(input_ids)

translated_text = tokenizer.decode(translated_ids[0], skip_special_tokens=True)

print("Translated Text (Spanish):", translated_text)

Translated Text (Spanish): ¿De qué color es la hrass? Respuesta en inglés


## Load dataset

In [9]:
dataset = load_dataset(dataset_name)
split_dataset = dataset["train"].train_test_split(test_size=0.1, seed=42)
train_dataset = split_dataset["train"].shuffle(seed=42).select(range(100))
test_dataset = split_dataset["test"].shuffle(seed=42).select(range(100))

## x/y split

In [10]:
train_x_data = train_dataset["en"]
train_y_data = train_dataset["es"]
test_x_data = test_dataset["en"]
test_y_data = test_dataset["es"]

In [11]:
len(train_x_data[0])

3100

# Load Pre-Trained Models

In [13]:
model = MarianMTModel.from_pretrained(model_name)
bleu = evaluate.load("sacrebleu")
tokenizer = AutoTokenizer.from_pretrained(tokenizer_model_name)

You are using a model of type t5 to instantiate a model of type marian. This is not supported for all configurations of models and can yield errors.


ValueError: The state dictionary of the model you are trying to load is corrupted. Are you sure it was properly saved?

## Load tokenizer

In [14]:
# I used the original text (english) to tokenize the data, seemed logical to me, but idk
def tokenize_function(examples):
    return tokenizer(examples["en"], padding="max_length", truncation=True)

train_x_data = train_dataset.map(tokenize_function, batched=True)
test_x_data = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

# Evaluate Model

In [15]:
# Okay this maybe works, maybe not, who knows, help me?
input_ids = torch.tensor(test_x_data["input_ids"]).to(model.device)
print('translating the tokens..')
translated_tokens = model.generate(input_ids=input_ids)
predicted_translations = [tokenizer.decode(t, skip_special_tokens=True) for t in translated_tokens]

references = [[ref] for ref in test_y_data] #The bleu score needs a list of lists
bleu_score = bleu.compute(predictions=predicted_translations, references=references)

print(f"BLEU score: {bleu_score['score']}")

translating the tokens..


KeyboardInterrupt: 