# Clone the full repo to copy additional Python files

This cell downloads all the additional Python files that are used in the notebook.

In [None]:
# Clone the replication package into a temporary `repo` directory, copy its
# contents into the current working directory, then delete the temporary clone.
# NOTE(review): `rsync -av repo/ .` has no exclude, so the clone's `.git`
# directory is copied into the working directory as well — confirm this is intended.
!git clone https://github.com/evaluating-effectiveness-cloud-nlp/replication_package.git repo
!rsync -av repo/ .
!rm -rf repo

# Installing dependencies with pip

Install the dependencies listed in the `requirements.txt` file downloaded by the previous cell.

In [None]:
# Install the packages listed in requirements.txt; the %pip magic installs
# into the environment of the running kernel.
%pip install -r requirements.txt

## Download the pre-trained ``glove.twitter`` word embedding model

This *word embedding* model is only used by the `WordEmbeddings` noise.
If you don't want to use it, just remove `noises.WordEmbeddings` from the noise list and don't run this cell.
> Note: The file has been placed in a personal repository just for ease of download; the original model is available as a `.zip` file at: https://github.com/stanfordnlp/GloVe

In [None]:
!python -m pip install ipywidgets
import urllib.request
from os.path import exists
import ipywidgets as widgets
from IPython.display import display
import os

progress = None
def show_progress(block_num, block_size, total_size):
    """Report hook for ``urllib.request.urlretrieve`` that drives a progress bar.

    Args:
        block_num: Number of blocks transferred so far (0 on the first call).
        block_size: Size of one block in bytes.
        total_size: Total file size in bytes; non-positive when the server
            does not report a size.

    Side effects:
        Creates and displays a ``FloatProgress`` widget stored in the
        module-level ``progress`` variable, and prints a text counter.
    """
    global progress
    size_known = total_size > 0

    downloaded = block_num * block_size
    if size_known:
        # The final block is usually partial, so the product can overshoot.
        downloaded = min(downloaded, total_size)

    # Start a fresh bar for every new download (the hook is first called with
    # block_num == 0) instead of reusing a bar sized for a previous file.
    if progress is None or block_num == 0:
        progress = widgets.FloatProgress(
            value=0,
            min=0,
            # Keep max >= min even when the size is unknown.
            max=total_size if size_known else 1,
            step=0.1,
            description='Downloading',
            bar_style='info',
            orientation='horizontal'
        )
        display(progress)

    print(downloaded, "/", total_size if size_known else "?", "\r", end="")

    # Without a known total there is no meaningful fraction to show.
    if size_known:
        progress.value = downloaded

# Download the GloVe Twitter embedding file unless it is already present.
model_path = "models/glove.twitter.27B.100d.txt"
word_embedding_url = "https://huggingface.co/anonymoususer/fault_injection_mlaas/resolve/main/glove.twitter.27B.100d.txt"

file_exists = exists(model_path)

if file_exists:
    print("file ", model_path, " already exists.")
else:
    # Derive the directory from model_path so the two cannot drift apart.
    model_dir = os.path.dirname(model_path)
    os.makedirs(model_dir, exist_ok=True)

    # Download to a temporary name and rename only on success; otherwise an
    # interrupted download would leave a truncated file that the `exists`
    # check above would accept as complete on the next run.
    partial_path = model_path + ".part"
    try:
        urllib.request.urlretrieve(word_embedding_url, partial_path, show_progress)
    except BaseException:
        # Also covers KeyboardInterrupt (kernel interrupt) during the download.
        if exists(partial_path):
            os.remove(partial_path)
        raise
    os.replace(partial_path, model_path)
    print("File downloaded!")

## Importing additional Python modules

This cell imports all noise algorithms, data manipulation code, and visualization modules.

In [None]:
from datetime import datetime
from typing import List
from noise_insertion.utils import save_data_to_file
from data_sampling.data_sampling import DataSampling
from noise_insertion.percent_insertion import noises
from noise_insertion import noise_insertion
from visualization import visualization
from progress import progress_manager
from metrics import metrics
import ipywidgets as widgets
# Instantiate DataSampling with its default constructor arguments.
data_sampling = DataSampling()


## Parameters

Choose the *sample size*, *types of noise* to be used and *noise levels*:

In [None]:
# Experiment parameters.
# NOTE(review): presumably the number of tweets drawn from the dataset — confirm
# against the sampling code.
sample_size = 99

# Noise generators to evaluate; each entry is called as noise(sentences, level)
# in the smoke-test cell below. Remove an entry to skip that noise type.
noise_list =[
    noises.Keyboard,
    noises.OCR,
    noises.RandomCharReplace,
    noises.CharSwap,
    noises.WordSwap,
    noises.WordSplit,
    noises.Antonym,
    noises.Synonym,
    noises.Spelling,
    noises.TfIdfWord,
    noises.WordEmbeddings,
    noises.ContextualWordEmbs,
]

# Noise levels to sweep, from 0.1 to 0.9 in steps of 0.1.
# NOTE(review): presumably the fraction of the text that is perturbed — confirm.
noise_level=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

## Running the experiment

In this step, we execute the pipeline defined in section *2.3. Evaluation Process* of our paper.

We follow these steps:

- **Data sampling**: Initially, we extract a sample from the dataset Twitter US Airline Sentiment aiming to create a balanced dataset containing the same number of instances classified as positive, negative, and neutral;
- **Oracle**: After creating the balanced dataset, we use the f-measure to evaluate the effectiveness of the Cloud NLP services by using this dataset;
- **Noise generation**: At this step, we use the tool nlpaug to produce different datasets containing sentences changed according to different levels of noise;
- **Noise Influence**: In this step, we evaluate the effectiveness of the Cloud NLP services by using the datasets with noise.

In [None]:
# Smoke test: apply every configured noise function to one sample sentence at
# noise level 0.1 and print the result. A single noise can also be tried on
# its own, e.g. noises.Keyboard(sentence, 0.1).
sentence = ["This is a sample text"]
for noise in noise_list:
    noisy_output = noise(sentence, 0.1)
    print(noisy_output)




In [None]:
import pandas as pd

# Load the tweets CSV from the working directory into a DataFrame.
dataset = pd.read_csv("Tweets_dataset.csv")

In [None]:
# Show the text of the row labelled 0 with a single label-based lookup.
dataset.loc[0, 'text']

### Text Classification
- model: https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest
- dataset: https://www.kaggle.com/datasets/crowdflower/twitter-airline-sentiment

In [None]:
#from data_sampling.data_sampling import DataSampling
from llms.twitter_roberta_base_sentiment import analize_sentence


In [None]:
# Quick check of the sentiment helper on one sentence; the bare call displays
# whatever it returns.
analize_sentence('This is a nice day')

In [None]:
from database.database import Database

In [None]:
def test():
    """Run ``SELECT now()`` through a new ``Database`` and return the result.

    Returns:
        Whatever ``Database.query`` returns for the statement.
    """
    # Plain (non-f) string: the statement has no interpolated values.
    query = """
        SELECT now()
    """

    database = Database()
    return database.query(query)


In [None]:
# Connectivity check: run the test query and print the 'now' field of the
# first element of the result.
date = test()

print(date[0]['now'])

In [None]:
# Classify one sentence, then print the returned sentence type together with
# the positive / negative / neutral values.
sentence = "I do know about this"
sentence_type, positive, negative, neutral = analize_sentence(sentence)

report = (
    ('Sentence type: ', sentence_type),
    ('Positive: ', positive),
    ('Negative: ', negative),
    ('Neutral: ', neutral),
)
for caption, score in report:
    print(caption, score)

### Text to Image
- model: https://huggingface.co/CompVis/stable-diffusion-v1-4
- dataset: https://huggingface.co/datasets/cifar10

In [None]:
!pip install transformers
!pip install ipywidgets
!pip install --upgrade diffusers[torch]
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

In [None]:
import llms.stable_diffusion as sd

In [None]:
# Generate one image from a text prompt.
# NOTE(review): presumably written to the 'images' directory under the name 'car' — confirm in llms.stable_diffusion.
sd.gen_image_by_prompt(dir='images', img_name='car', prompt='image of an car')

### Text Generation
- model: https://huggingface.co/openai-community/gpt2
- dataset: 

In [None]:
import llms.gpt2 as gpt

In [None]:
# Generate text from a prompt and print the 'generated_text' field of the
# first returned item.
# NOTE(review): presumably max_length caps the output length and qnt_sentences is the number of generations — confirm in llms.gpt2.
prompt = 'What Am I?'
texts = gpt.gen_text_by_prompt(text=prompt, max_length=144, qnt_sentences=1)
print(texts[0]['generated_text'])


In [None]:
# Display the full structure returned by the text generator.
texts

### Summarization
- model: https://huggingface.co/facebook/bart-large-cnn
- dataset: 

In [None]:
import llms.bart_large_cnn as summarizer

In [None]:
ARTICLE = """ New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York.
A year later, she got married again in Westchester County, but to a different man and without divorcing her first husband.
Only 18 days after that marriage, she got hitched yet again. Then, Barrientos declared "I do" five more times, sometimes only within two weeks of each other.
In 2010, she married once more, this time in the Bronx. In an application for a marriage license, she stated it was her "first and only" marriage.
Barrientos, now 39, is facing two criminal counts of "offering a false instrument for filing in the first degree," referring to her false statements on the
2010 marriage license application, according to court documents.
Prosecutors said the marriages were part of an immigration scam.
On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to her attorney, Christopher Wright, who declined to comment further.
After leaving court, Barrientos was arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New York subway through an emergency exit, said Detective
Annette Markowski, a police spokeswoman. In total, Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002.
All occurred either in Westchester County, Long Island, New Jersey or the Bronx. She is believed to still be married to four men, and at one time, she was married to eight men at once, prosecutors say.
Prosecutors said the immigration scam involved some of her husbands, who filed for permanent residence status shortly after the marriages.
Any divorces happened only after such filings were approved. It was unclear whether any of the men will be prosecuted.
The case was referred to the Bronx District Attorney\'s Office by Immigration and Customs Enforcement and the Department of Homeland Security\'s
Investigation Division. Seven of the men are from so-called "red-flagged" countries, including Egypt, Turkey, Georgia, Pakistan and Mali.
Her eighth husband, Rashid Rajput, was deported in 2006 to his native Pakistan after an investigation by the Joint Terrorism Task Force.
If convicted, Barrientos faces up to four years in prison.  Her next court appearance is scheduled for May 18.
"""
# Summarize the article above and print the result.
# NOTE(review): presumably min_length / max_length bound the summary length — confirm in llms.bart_large_cnn.
sumarized_text = summarizer.summarize_text(text=ARTICLE, max_length=60, min_length=30)
print(sumarized_text)


### Translation
- model: https://huggingface.co/unicamp-dl/translation-en-pt-t5
- dataset

In [None]:
import llms.translation as translation

In [None]:
# Translate one English sentence with the project's English-to-Portuguese helper.
translated_text = translation.translate_en_pt("I'd like to eat cake")

In [None]:
# Display the translation result.
translated_text

### Question Answering
- model: https://huggingface.co/deepset/roberta-base-squad2
- dataset: 

In [None]:
import llms.roberta_base_squad2 as qa

In [None]:
# Ask one question against a short context passage; the input is a dict with
# 'question' and 'context' keys.
QA_input = {
    'question': 'Why is model conversion important?',
    'context': 'The option to convert models between FARM and transformers gives freedom to the user and let people easily switch between frameworks.'
}
res = qa.get_answer(QA_input=QA_input)

In [None]:
# Display the question-answering result.
res

### Text to Speech
- model: https://huggingface.co/facebook/mms-tts-eng
- dataset: 

In [None]:
import llms.text_to_speach as tts 

In [None]:
# Synthesize speech for one sentence.
# NOTE(review): presumably written to the 'audios' directory as 'audio_test' — confirm in llms.text_to_speach.
text = "I hope you make some cookie"
tts.gen_audio_from_text(text=text, dir='audios', file_name='audio_test')

### Text to Audio
- model: https://huggingface.co/facebook/musicgen-medium
- dataset: 

In [1]:
from transformers import AutoProcessor, MusicgenForConditionalGeneration

# Load the processor and the generation model for the
# "facebook/musicgen-medium" checkpoint.
processor = AutoProcessor.from_pretrained("facebook/musicgen-medium")
model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-medium")


preprocessor_config.json:   0%|          | 0.00/275 [00:00<?, ?B/s]



In [2]:
# Encode two text prompts as one padded batch of PyTorch tensors.
inputs = processor(
    text=["80s pop track with bassy drums and synth", "90s rock song with loud guitars and heavy drums"],
    padding=True,
    return_tensors="pt",
)

# Generate audio for both prompts, limited to 256 new tokens.
audio_values = model.generate(**inputs, max_new_tokens=256)

In [3]:
from IPython.display import Audio

# Play the first generated clip in the notebook at the sampling rate taken
# from the model's audio encoder configuration.
sampling_rate = model.config.audio_encoder.sampling_rate
Audio(audio_values[0].numpy(), rate=sampling_rate)