In [1]:
!pip install python-dotenv
!pip install -U torch transformers
!pip install bitsandbytes
!pip install datasets
!apt install git-lfs
!pip install accelerate
!pip install nltk
!pip install evaluate
!pip install rouge_score

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.


In [2]:
#Get acccess token from Hugging Face, and agree to conditions of use for Gemma

from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Gemma Model

In [3]:
import torch
import os
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

#Load the model, its corresponding tokenizer, and quantization config

model_id = "google/gemma-2b-it"
tokenizer = AutoTokenizer.from_pretrained(model_id)
quantization_config = BitsAndBytesConfig(load_in_4bit=True,
                                         bnb_4bit_use_double_quant=True,
                                         bnb_4bit_compute_dtype=torch.bfloat16)

#Use quantization config if GPU is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"using {device}")

if device == 'cuda':
  model = AutoModelForCausalLM.from_pretrained(model_id,
                                             device_map="auto",
                                             quantization_config=quantization_config
                                             )
else:
  model = AutoModelForCausalLM.from_pretrained(model_id,
                                             device_map="auto"
                                             )
model.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

using cpu


config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear(in_features=16384, out_features=2048, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): GemmaRMSNorm()
        (post_attention_layernorm): GemmaRMSNorm()
      )
    )
    (norm): GemmaR

In [4]:
#This function is to prepare the prompt and do inference using the Gemma model

def inference(question: str, context: str):

    if context == None or context == "":
        prompt = f"""Give a detailed answer to the following question. Question: {question}"""
    else:
        prompt = f"""Using the information contained in the context, give a detailed answer to the question.
            Context: {context}.
            Question: {question}"""
    chat = [
        {"role": "user", "content": prompt}
    ]
    formatted_prompt = tokenizer.apply_chat_template(
        chat,
        tokenize=False,
        add_generation_prompt=True,
    )
    inputs = tokenizer.encode(
        formatted_prompt, add_special_tokens=False, return_tensors="pt"
    ).to(device)
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs,
            max_new_tokens=100,
            do_sample=False,
        )
    response = tokenizer.decode(outputs[0], skip_special_tokens=False)
    response = response[len(formatted_prompt) :]  # remove input prompt from reponse
    response = response.replace("<eos>", "")  # remove eos token
    return response

In [None]:
#Try the generation of Gemma with one question

question = "What is a good AI research idea?"
print(inference(question=question, context=""))

**A good AI research idea should meet the following criteria:**

**1. Relevance:**
* The research should address a significant problem or gap in the field.
* It should have potential real-world applications or impact.

**2. Feasibility:**
* The research question should be well-defined and have a clear path to achieving the desired outcome.
* The required data and resources should be accessible and cost-effective to acquire.

**3. Impact:**
* The research


# Evaluate the model on a RAG dataset

In [None]:
#In order to see how evaluate the generation results of the Gemma model, I get a RAG dataset from HuggingFace

from datasets import load_dataset

datasets = load_dataset("neural-bridge/rag-dataset-12000")

Downloading readme:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/23.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.79M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9600 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2400 [00:00<?, ? examples/s]

In [None]:
#This dataset has context, question, and answer, and I can estimate how good Gemma's answer is compared to the "ground truth" answer

datasets

DatasetDict({
    train: Dataset({
        features: ['context', 'question', 'answer'],
        num_rows: 9600
    })
    test: Dataset({
        features: ['context', 'question', 'answer'],
        num_rows: 2400
    })
})

In [None]:
#This function is to view some rows of the dataset

from datasets import ClassLabel, Sequence
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
        elif isinstance(typ, Sequence) and isinstance(typ.feature, ClassLabel):
            df[column] = df[column].transform(lambda x: [typ.feature.names[i] for i in x])
    display(HTML(df.to_html()))

In [None]:
show_random_elements(datasets["train"])

Unnamed: 0,context,question,answer
0,"Get Involved\nCome join EATS and be part of nurturing a generation of healthy eaters with rock-solid lifelong health and wellness habits. We have a variety of programs that are taking off in our schools and we need positive adult role models to help make these programs happen! Share your passion for healthy food, kids, and education by volunteering with EATS for one of the following programs:\nCooking Classes\nDo you want to help Park City students take ownership over their healthy food choices? Our highest need for volunteers is at our elementary and middle school cooking classes. Come assist our instructors in providing the most enriching, nutritional, education experience we can offer to Park City youth. As a volunteer at our cooking classes, you’ll get the chance to prepare and eat a nourishing recipe made from scratch with students. No prior cooking experience necessary - just bring yourself and a positive attitude! We’ll provide the aprons, food, and equipment.\nSchool Gardens\nDo you believe that students should understand where their food comes from? Studies show that children who grow their own food are more likely to eat fruits and vegetables as an adult. Gardening also helps students learn valuable social and interpersonal skills that translate in and out of the classroom. Our indoor hydroponic systems mean that our garden program runs year round! Come get your hands dirty by volunteering to work at one of our schools' gardens.\nEvents\nHeading to a community event in Park City? Chances are EATS will be there with a table or booth! Volunteer with EATS at community events like Hike4Hunger or film screenings to help spread the word about EATS’ work in the community.\nConsult with Us!\nEATS relies on a network of professionals in Park City and beyond to help us operate at our best. From nutritionists, professional chefs, event planners, and educators to those who work in the financial services sector. They serve on our board committees - Events, Curriculum, Marketing, Development - or provide general consul on our work. If you are interested in joining a committee or learning more, email meaghan@eatsparkcity.org .\nVolunteer with EATS",What are some of the programs that EATS offers where one can volunteer?,"Some of the programs that EATS offers where one can volunteer include Cooking Classes, School Gardens, Events, and Consultation."
1,"The Huffington Post, July 3, 2013\nTurtle Derbies: A Race Our Turtles (and Children) Can't Win\nBy Collette Adkins Giese\nEvery summer in small towns across America, children search woods and waters for turtles to compete in annual turtle races, longstanding traditions that often raise money for important causes.\nBut our nation's turtles are now dying off at alarming rates never-before seen, and it's time to end this tradition, which threatens to spread deadly diseases to both wild turtles and children.\nAfter the races are over, most people release their turtles into the wild, undoubtedly thinking that no harm has been done. But races can expose turtles to disease, which then can spread to wild populations when turtles are released.\nThreats like overexploitation and habitat loss have caused dramatic population declines in almost all U.S. turtle species, with many now either protected as endangered under the Endangered Species Act or under consideration for such protection. A new and emerging threat is a deadly wildlife disease called ranavirus that has caused turtle, frog and salamander die-offs in more than 25 states.\nRanavirus infection has been confirmed in Maryland in wild eastern box turtles, a popular entrant for turtle races in the Northeast. One confirmed case of the disease showed up last summer in Harford County, home of the Bel Air Annual Turtle Derby -- one of Maryland's largest turtle races.\nTurtle races can also spread Salmonella to people who handle turtles, including young children who can become severely ill or even die. Salmonella are naturally occurring bacteria in turtles, and infected turtles usually do not appear sick in any way.\nThat's why the Center for Biological Diversity has asked the sponsor of the annual turtle race in Bel Air, Maryland to stop using wild caught turtles. The Maryland Department of Natural Resources also issued a statement advising against using wild caught turtles in races.\nDespite the risks, the Bel Air Turtle Derby has explained that they will go ahead with their turtle race as planned on the Fourth of July. But hopefully this year will be the last time that wild turtles are used.\nTurtle races strain native turtle populations that are already under terrible pressure from habitat loss, road kill and other threats. In fact, a recent study found that nearly half of all turtle species are at risk of extinction. Some turtle species have lost more than 95 percent of their historic habitat.\nWhile many of the threats facing wild turtles are difficult problems to solve, the threat posed by turtle races has an easy fix: just stop using wild caught turtles.\nMany small towns across the country have replaced their turtle races with wildlife-friendly festivals that use creative substitutes for wild caught turtles, such as river races using rubber turtles or races where people pull toy turtles on strings. The John Hopkins Turtle Race in Baltimore does not use any wild caught turtles, instead racing turtles brought in from U.S. turtle farms, to which the turtles are returned.\nFor generations, that magical moment when many a child first understood our bond with all things wild was kindled by staring into the eyes of a small turtle as it crawled across their outstretched hand.\nThat bond now requires that we put our own traditions aside and do what's best for our turtles, before it's too late.\n© 2013 TheHuffingtonPost.com, Inc.\nThis article originally appeared here.",What are the potential negative impacts of turtle races on both turtles and humans?,"Turtle races can expose turtles to diseases, which can then spread to wild populations when the turtles are released. This can cause population declines in turtle species. Additionally, turtle races can spread Salmonella to people who handle turtles, including young children who can become severely ill or even die."
2,"An IOS app that calculates the correct exposure for a pinhole camera.\nIOS: pop-up-pinhole-exposure-calculator\nThe Pop-Up Pinhole Co. website\nAventur Wealth needed a pre-sales app to explain financial options to clients. The app is a web app, entirely created in HTML5 and is installed on iPad Pro and Microsoft Surface Pro platforms.\nAventur Wealth website\nCalculator that shows how much money can be saved by using low-energy bulbs.\nIOS: Low Energy Calculator\nLow Energy Supermarket website\nApplication on IOS to control a sophisticated lighting installation using Bluetooth.\ninklighting app\nInklighting website\nApplication on IOS and Android for children’s potty training.\nIOS: Potty Training App\nAndroid: Potty Training App\nPotty Training Academy website\nPre-sales iPad application, one of three, produced with extensive animation in multiple languages.\nPrivate application on iPad and Windows tablets for exhibition spaces.",What is the purpose of the app created by Aventur Wealth?,The app created by Aventur Wealth is a pre-sales app to explain financial options to clients.
3,"TriNation Escape and hushed their way through wildlife sanctuaries in the Wild Escape. Needless to say, our vehicles have a lot of stories to tell.\nThe Mahindra Adventure fleet comprises of various Mahindra 4WDs. But it is the Mahindra Scorpio VLX 4WD that forms the backbone of our fleet. When you arrive to drive, it is the Mahindra Scorpio that becomes your home for the next few days. The easiest way to recognise Mahindra Adventure vehicles is by the characteristic orange stripes of the Mahindra logo on the sides. And if you see one with headlights on, watch out because the expedition convoy is on the way.\nMahindra Thar CRDe 4x4\nThe Mahindra Thar combines the legendary off-road ability of the MM540 with the modern drivetrain of the Mahindra Scorpio. You will usually find them in the front and back of the convoy ensuring your safety and providing some vital automotive eye candy throughout the expedition. They’ve been customised appropriately for the kind of rough use they go through. We promise when you look at our Thars, you’ll want one. We also have specially prepared Mahindra Thars that you can drive at the Mahindra Adventure Off-Road Training Academy.\nMahindra Scorpio VLX 4WD\nThe Scorpio forms the backbone of our fleet and these are the ones you pilot on our expeditions. All our Scorpios are top-end VLX.\nMahindra Bolero SLX 4x4\nThe ubiquitous Mahindra Bolero is no stranger to India. You’ve seen them running around in every corner of the country. But our Boleros are special, first they are 4x4s which are rare and they are SLX trim 4x4s which aren’t available any more so they are a little exclusive too. The Boleros do duty as support vehicles in the Great Escapes and the Mahindra Adventure Rally team. They are also used on shorter expeditions like Authentic Goa where you could get a chance to behind the wheel of this truly rugged workhorse.\nMahindra XUV5OO W8 AWD\nThe cheetahs as we lovingly call them are used for official duties during the expeditions. All our XUV5OOs are top-of-the-line W8 AWD. We knock them about not only during the expeditions but also follow the rally routes during the Indian Rally Championship (IRC) and other raid-rally events. 140bhp worth of mHawk ensures we get there almost as quickly as our rally cars.\nMahindra Getaway 4WD.\nMahindra SuperXUV5OO\nYou’ll hardly ever see them unless you open a newspaper and see their exploits in rallies around the country. The current INRCSUV champion has been running rings around lighter, petrol-powered competition ever since it hit the scene. It may slightly resemble the domesticated XUV5OO that you’ll see during the expedition, but underneath these special cheetahs have lots of goodies. If you ever see a blur of orange and silver on a rally stage, it’s probably a SuperXUV5OO on its way to the podium. These mean machines are piloted by the best drivers in the business – Gaurav Gill, Sunny Sidhu and Amittrajit Ghosh.\nMahindra Navistar Truxo 25.\nSpecial Vehicles Unit\nThe special vehicles unit comprises of vehicles which we use during expeditions, rallies and off-road events. These vehicles have their own names and personalities and are far from standard. We lovingly maintain them and keep them away from prying eyes. Ask for their keys at your own risk. We’re a nasty bunch when it comes to these vehicles.\nAdventure One\nThis is what the Chief of Adventure Initiatives – Bijoykumar.Y drives. It’s a 2011 Mahindra Thar fitted with a rally cabin. Finished in matt British racing green and some tasteful white stripes along with purposeful Maxxis Bighorn 764s, Adventure One not only looks the part but also goes like stink. Mostly found in lead position during expeditions. If you see this around, rest assured ‘chief’ has one eye on her.\nBaby\nBaby is one of the newest Mahindra Thars. Made its first appearance on the pages of BBC TopGear magazine in 2012. She’s built not only to off-road but also lead expeditions. Chipped, lifted and equipped with a hardtop and locking rear differential, Baby is more than capable of climbing lots of stuff you’d only dream of otherwise. Maintained by Vinod Nookala – Brand Manager Thar, she can be spotted pulling people out of trouble during the numerous Mahindra Great Escapes around the country.\nBlackie\nA veteran of the Himalayas, this 2011 Mahindra Thar and has the title of being the most tweaked Thar in India today. We lovingly call her Blackie thanks to her matte black paintjob. Most of her bodywork is fibreglass. Apart from the regular chipping, lifting, she’s also the only Thar in India with a coil spring rear suspension. She’s one of the few Thars that run on road-biased tyres since her duty is only to lead the convoys to Ladakh. We have a hardtop on her as well as a rally exhaust. So she’s loud, so loud that you have to wear ear plugs when you drive. Ask Manish Sarser – Head of the Off-Road Training Academy who swears he hears less on one side. Do anything but don’t ask him for the keys.\nSuper Thar / Thar mHawk\nThese two are rarely seen. The Super Thar is the rally version of the Thar. Much on the same lines as the SuperXUV5OO. She runs rally-spec suspension along with many other goodies which we can’t tell you about. Because we’d have to kill you if we told you. The Thar mHawk is also one of a kind. Don’t start jumping, it’s not coming into production…yet. Both are finished in traditional orange and white, consider yourself lucky if you ever get to see one of these.",What is the main vehicle used in the Mahindra Adventure fleet?,The main vehicle used in the Mahindra Adventure fleet is the Mahindra Scorpio VLX 4WD.
4,"Articles Posted in Contracts\nE.T. Products, LLC v. D.E. Miller Holdings, Inc.\nDoug Miller and his son signed a broad noncompetition agreement when Doug sold his fuel-additives business, E.T., in 2011. Doug sold his other company, Petroleum Solutions, to Kuhns about a year later. E.T.’s new owners sued the Millers for breaching the noncompete by providing assistance to Kuhns as he learned the Petroleum Solutions business. The Millers claimed the noncompete was overbroad and unenforceable and that their assistance to Kuhns came at a time when Petroleum Solutions was E.T.’s distributor, not its competitor. When E.T. severed its relationship with Petroleum Solutions in 2012, Doug told Kuhns that the noncompetition agreement prevented further help and ceased assisting him. On summary judgment, the district judge held that the noncompetition agreement was enforceable but the Millers did not breach it. The Seventh Circuit affirmed, agreeing that the contract was not overbroad, but that the Millers did not breach it. A company’s distributor is not its competitor, so the Millers’ assistance to Kuhns in 2012 was ""fair game."" The contract, read reasonably, did not require Doug to break his preexisting lease with Kuhns. View ""E.T. Products, LLC v. D.E. Miller Holdings, Inc."" on Justia Law\nPosted in: Business Law, Contracts\nRight Field Rooftops, LLC v. Chicago Cubs Baseball Club, LLC. View ""Right Field Rooftops, LLC v. Chicago Cubs Baseball Club, LLC"" on Justia Law\nLand O’Lakes, Inc. v. Ratajczak\nFrom 2006-2012 Packerland deceived at least one of its customers about the protein content of its Whey Protein Concentrate. Land O’Lakes purchased Packerland’s protein concentrate for use in making foods for calves and other young animals. Buyers infer protein levels from measuring nitrogen: a seller can add another nitrogen-rich substance to produce higher scores. The Ratajczaks, who owned Packerland, started adding urea to its protein concentrate. in 2006. Land O’Lakes suspected that the concentrate was high in nonprotein nitrogen but could not learn why; the Ratajczaks made excuses that Land O’Lakes accepted. The Ratajczaks sold Packerland in 2012. The new owner kept them as employees; they kept adding urea until the buyer learned what the truth. The Ratajczaks lost their jobs and settled for about $10 million before the buyer filed a complaint. Land O’Lakes stopped buying Packerland’s product and asserted claims of breach of contract, fraud, and violation of the Racketeer Influenced and Corrupt Organizations Act. Packerland’s insurers refused to defend or indemnify it or the Ratajczaks; the Ratajczaks’ personal insurer refused to indemnify them for their settlement with Packerland’s buyer. The district court dismissed Land O’Lakes’s suit and ruled in favor of the insurers. The Seventh Circuit affirmed, rejecting Land O’Lakes’ claim to treble damages under RICO and state-law and the Ratajczaks’ claims that Packerland’s insurers and their own insurers had to defend and indemnify them. View ""Land O'Lakes, Inc. v. Ratajczak"" on Justia Law\nPosted in: Agriculture Law, Contracts, Criminal Law, Insurance Law\nAndy Mohr Truck Center, Inc. v. Volvo Trucks North America\nIn 2010, Mohr and Volvo entered into a heavy truck dealership agreement. In 2012, Volvo sought a declaratory judgment that it was entitled to terminate Mohr’s dealership because Mohr had misrepresented that it would build a new long‐term facility for the dealership. Mohr complained that Volvo had violated Indiana’s Franchise Disclosure and Deceptive Franchise Practices Acts by promising to award Mohr a Mack Truck dealership franchise, which would have justified Mohr’s investment in the new facility. Volvo gave the Mack franchise to another company. Mohr also accused Volvo of providing more favorable concessions on truck pricing to other franchise dealerships through its Retail Sales Assistance program. The district court granted summary judgment, holding that the integration clause in the dealer agreement barred the new‐facility claim and the Mack franchise claim. Following a trial on the unfair discrimination claim, a jury awarded Mohr $6.5 million. The Seventh Circuit reversed the award for the unfair discrimination claim and affirmed the summary judgment rulings. The court stated that these were sophisticated parties, bound by the integration clause and that Mohr did not establish unfair discrimination with respect to price concessions. View ""Andy Mohr Truck Center, Inc. v. Volvo Trucks North America"" on Justia Law\nPosted in: Business Law, Contractsaley v. Kolbe & Kolbe Millwork Co.. View ""Haley v. Kolbe & Kolbe Millwork Co."" on Justia Law\nPosted in: Class Action, Contracts, Insurance","What was the outcome of the case between E.T. Products, LLC and D.E. Miller Holdings, Inc. regarding the noncompetition agreement?","The district judge held that the noncompetition agreement was enforceable but the Millers did not breach it. The Seventh Circuit affirmed this decision, agreeing that the contract was not overbroad, but that the Millers did not breach it."
5,"Potatoes are a nutritious starchy vegetable.\nThe way you prepare your potatoes will determine how nutritious they are, but all types of potatoes can be a healthy source of fiber and essential nutrients when prepared and served with little or no added fat. Different types of potatoes have slightly different nutritional compositions, but sweet potatoes have a slight advantage since they provide higher amounts of a number of essential vitamins and minerals.\nMacronutrients\nA 100-gram serving of baked white potatoes, red skin potatoes or sweet potatoes with skin contains between 89 and 94 calories and provides 2 grams of protein and 20 to 21 grams of carbohydrate. Both protein and carbohydrates provide your body with energy, and protein also provides essential amino acids for forming new cells in the body. White potato and red skin potato provide 2 grams of fiber per serving, and sweet potato provides 3 grams of fiber per serving toward the recommended daily fiber intake of between 21 and 38 grams, depending on your age and gender. Fiber is a type of carbohydrate that is important for healthy digestive function and can lower your risk for obesity, diabetes, high cholesterol and heart disease.\nVitamins\nSweet potatoes provide 19,218 international units of vitamin A per 100-gram serving, which is 440 percent of the daily value for vitamin A. This is significantly more than white or red skin potatoes, which contain only 10 international units. Vitamin A is essential for healthy vision, cell division and immune function. Sweet potatoes also provide more vitamin C, with a serving providing 20 milligrams, or 40 percent of the DV, compared to the 13 milligrams found in a serving of white or red skin potatoes. Vitamin C is an antioxidant that is needed for building and repairing cells in your body. All three types of potato contain 4 percent of the DV for folate. Sweet potatoes contain more riboflavin, providing 8 percent of the DV compared to the 2 percent of the DV in white and red skin potatoes.\nMinerals\nSweet potatoes contain more calcium than red skin and white potatoes, but red skin and white potatoes provide more magnesium. A 100-gram serving of sweet potatoes contains 4 percent of the daily value for calcium and 6 percent of the daily value for magnesium, compared to the trace amounts of calcium and 8 percent of the daily value for magnesium provided by red skin and white potatoes. Calcium and magnesium are both important for strong bones and nerve and muscle function. All three types of potatoes contain about the same amount of phosphorus, zinc and iron, providing 6 percent of the daily value for phosphorus and 2 percent of the daily value for zinc and iron per 100-gram serving.\nConsiderations\nBaking is one of the healthier methods for cooking potatoes. Instead of butter or sour cream, consider topping your potato with vegetarian chili, yogurt or broccoli with low-fat cheese. Eat the skin as well as the flesh, as this is the most nutrient-dense part of the potato. Avoid eating any part of the potato that has green skin or new growth, as these parts contain a bitter toxin.\nArticles For Your Diet\nReferences (7)\n- USDA Nutrient Data Laboratory: Nutrient Data for 11357, Potatoes, White, Flesh and Skin, Baked\n- USDA Nutrient Data Laboratory: Nutrient Data for 11358, Potatoes, Red, Flesh and Skin, Baked\n- USDA Nutrient Data Laboratory: Nutrient Data for 11508, Sweet Potato, Cooked, Baked in Skin, Without Salt\n- Texas A&M Extension: What Is the Difference Between a Sweet Potato and a Yam?\n- Academy of Nutrition and Dietetics: Is Potato Skin the Most Nutritious Part of Potatoes?\n- University of Florida Extension: Growing Potatoes in the Florida Home Garden\n- Academy of Nutrition and Dietetics: Bring Back the Potato\nPhoto Credits\n- Sweet Potatoes and New Potatoes in Baskets image by bawinner from Fotolia.com",What are the nutritional benefits of sweet potatoes compared to white and red skin potatoes?,"Sweet potatoes provide higher amounts of a number of essential vitamins and minerals compared to white and red skin potatoes. They contain more fiber, vitamin A, vitamin C, and riboflavin. They also contain more calcium than red skin and white potatoes. However, red skin and white potatoes provide more magnesium. All three types of potatoes contain about the same amount of phosphorus, zinc and iron."
6,"Ways to make money. The psychology of making money.\nContent\n- How to earn money online\n- 25 Ways to Make Money Online and Offline - NerdWallet\n- 10 Uncommon Side Hustles To Make Money (2020)\n- 50 ways to make money\n- 50 ways to make money - Which?\n- 70 Creative Ways To Make Extra Money + How Much\n- 32 Proven Ways to Make Money Fast\n- How to Make Money Online: 28 Real Ways to Earn Money Online\nIn a bind? Need some fast cash?\n.\nOur thoughts are incredibly powerful. We have to be stewards of our thoughts. Once you find yourself thinking negatively, you need to jar yourself out of that.\n- All reviews of dealing centers\n- Minute strategy for binary options\n- Blogs are one of the most popular ways to make money because they required little upfront costs and have huge upside potential.\n- From online poker to selling your collection of Beanie Babies, there are lots of popular get-rich-quick, money-making ideas that always pop up.\n- Statistics and trading robots\n- Reports CME options.\nHow to earn money online\nWe have voices in our mind, and it's up to the ego to decide which one to fulfill. Its goal is to satisfy the id in some way while also attending to the super-ego. The ego's job is complete when it sees something like this. Ways to make money wary. Listen to the conversation deep within the confines of your mind and do your best to tame ways to make money proverbial beasts. What are the best ways to make money right away?\nWhile a search online when you're in need of some fast cash will produce millions of results, not all will be legitimate.\n25 Ways to Make Money Online and Offline - NerdWallet\nIt's up to you to filter through the so-called noise. These 32 strategies will help put you in the black, even if it's in a very small way.\n10 Uncommon Side Hustles To Make Money (2020).\n50 ways to make money. The mechanics don't matter much.\n50 ways to make money - Which? ways to make money.\n70 Creative Ways To Make Extra Money + How Much.\n32 Proven Ways to Make Money Fast option structure and strategies but requires no overhead.\nHow to Make Money Online: 28 Real Ways to Earn Money Online\nItems are printed and delivered on-demand. You simply receive a share of ways to make money proceeds.\n.\n- One touch in binary options\n- Option yield charts\n- Но тот молчал.\n- Честь.\n- What is a vendor option\n- Bitcoin price real time chart.\n- Make money on the Internet without investing in comments\n- Local bitcoin localbitcoins vhod.",What are some of the ways to make money online mentioned in the context?,"Some of the ways to make money online mentioned in the context include blogging, online poker, selling collections of Beanie Babies, and participating in on-demand printing."
7,"Posted By: SHRM\nFederal Legislative Alert\nWe need your help! Please e-mail your U.S. Representative and ask him or her to VOTE YES on the comp time bill, H.R. 1180.\nIn February, Representative Martha Roby (R-Ala.), introduced the Working Families Flexibility Act of 2017 (H.R. 1180) also known as the “comp time” bill. The proposal will remove a federal restriction on the private-sector and help Americans better address family and work needs. H.R. 1180 would permit – but would NOT require – private-sector employers to offer employees the choice of taking overtime in cash payments as they do currently or in the form of paid time off – or comp time.\nThe House Subcommittee on Workforce Protections held a legislative hearing on H.R. 1180 on April 5th. SHRM members Leslie Christ, Chief Resource Officer for WellStone Behavioral Health, and Crystal Frey, Vice President of Human Resources for Continental Realty Corporation, testified at the hearing in support of the bill. H.R. 1180 is expected to go before the full House of Representatives for a vote the week of May 1st.\nSHRM supports H.R. 1180 and we need your help! Please e-mail your U.S. Representative and ask him or her to vote “YES” for H.R. 1180 when the proposal goes to the House floor. Read SHRM’s position paper on the comp bill.\nPLEASE TAKE THIS ACTION\nE-mail your Representative’s Washington, DC office using SHRM’s Policy Action Center by following these fast and easy steps:\n- Visit SHRM’s Policy Action Center\n- Fill out your contact information (be sure to enter your home mailing address since the system will correctly match you to your federal congressional district).\n- Personalize the template email language to include your message with your own story.\n- Click “submit.”\nIt is critical that Representatives hear from employers in their congressional districts over the next five days leading into the week of May 1st. That’s why we are asking you – HR professionals, to e-mail your House members and express support for the comp time legislation.\nBACKGROUND\nThe Fair Labor Standards Act (FLSA) of 1938 requires that hours of work by non-exempt employees beyond 40 hours in a seven-day period must be compensated at a rate of 1 1/2. 1180 1/2 hours for each hour of overtime worked. Employees would be able to accrue up to 160 hours of comp time per year, although an employer could choose to “cash out” the comp time after 80 hours after providing the employee with 30 days of notice. An employer would also be required to cash out any unused comp time at year’s end at the regular time and a half rate.\nFor questions regarding compensatory time and Rep. Roby’s comp time bill, contact Patrick Brady, SHRM’s Senior Government Affairs Advisor, at Patrick.Brady@shrm.org or 703-535-6246.\nIf you encounter any issues contacting your member of Congress, please contact Meredith Nethercutt, SHRM’s Senior Associate, Member Advocacy and A-Team Program Director, at Meredith.Nethercutt@shrm.org or (703) 535-6417.",What is the purpose of the Working Families Flexibility Act of 2017 (H.R. 1180) also known as the “comp time” bill?,The purpose of the Working Families Flexibility Act of 2017 (H.R. 1180) also known as the “comp time” bill is to remove a federal restriction on the private-sector and help Americans better address family and work needs. It would permit – but would NOT require – private-sector employers to offer employees the choice of taking overtime in cash payments as they do currently or in the form of paid time off – or comp time.
8,"Hey it is great to be connected, and it is great to be local.\nI am local\nI am part of a local neighbourhood, a small community, a large city, a south pacific region, a hemisphere and strangely enough, yes, I am local.\nSupporting local starts with my small community. You see we live rurally, we have a small block of shops – a butcher, 2 dairies/corner store, a café, hairdresser, 2 bakeries, chemist, post shop, petrol station and a pub – yep, that is about it!\nImagine this…\nIf all of us in our small community went to the heart of Auckland city for haircuts, our petrol, our prescriptions etc… the ultimate result would be our ‘local’ shops and enterprises would close.\nNow, ponder the concept in our beautiful industry….\nAnswer this question – If none of the locals shopped in your store would your business survive?\nFor most of us in retail, unless we have an incredible online presence, and one that is globally recognised and very successful – the answer is NO!\nSo we can all agree that ‘Local’ is important right!\nNext question – Look at your stock and your supply chain …\nHow much of it is local?\nYes, from your town, your city, your country, your hemisphere – New Zealand and Australia..\nDo you make a conscious effort to support suppliers from your own community/country first?\nHave you ever thought about considering the ‘local’ supplier over a big international player?\nWhat happens if we don’t support ‘local’ – easy, they disappear! And, if the locals don’t support you, then the chances are you will disappear.\nThere are 3 reasons why I love LOCAL:\n- I can build a personal relationship with my customers and my supply chain. They understand my market and my business model.\n- It is more cost and time effective to deal with local suppliers – I don’t have the same freight charges, customs, import costs and time delays with delivery.\n- It is a two way street – I support local and local support me.\nThe surprising fact is that when I have made a conscious effort to look for local suppliers, I have found in a number of cases something more brilliant than I could have found by going off shore.\nBut, then I am an individual, I don’t need to follow what everyone else does. I don’t need to have the same, look and behave the same, I am comfortable in my skin being different. I love being individual, different and of cause I love being ‘local.\nI want to aggravate and agitate your thought process around ‘Local’…\nSo, how ‘local’ are you?\nAre your shoppers ‘local’?\nHow dependent are you on ‘local’ for your revenue stream?\nDo you support your ‘local’ community?\nDo you engage with your ‘local’ supply chain?\nDo you choose ‘local’ over international suppliers?\nHere is one of my challenges to you for 2018:\n- When you need a supply chain – think ‘local’ first\n- When you work on growing your business, think local first – what does your local community want and where would they most appreciate the supply from\n- Make contributing to ‘local’ part of your DNA – give back to your local communities\nFor me, yes – proud to be ‘Local’\nSee you in your local area sometime soon!\nThought for today:\n-Kevin Kruse\nFondest Regards\nHelen",What are the three reasons the author loves being 'local'?,"The author loves being 'local' because they can build a personal relationship with their customers and supply chain, it is more cost and time effective to deal with local suppliers, and it's a two-way street where they support local and local supports them."
9,"Just in time for Valentine’s Day, I’ve learned why I’m so bad at romance.\nApparently, to succeed, you have to work really hard.\nThat’s the word from three romance novelists, all of whom appeared Saturday at the Camden County Library in Voorhees.\nI interviewed the authors, expecting to chat about ripped bodices, heaving bosoms and throbbing ... um ... passions. But I quickly learned they were committed to a labor of love — with the emphasis on labor.\n“There was a time when writers could just sit down and write,” said Tina Gabrielle of Cinnaminson, who’s set five romances in 19th century England. “Today, we have to have blogs and we tweet and we Facebook. We try to promote the book without forcing it down somebody’s throat.”\n“It’s almost like having two full-time jobs,” added Kathleen Long, a Cherry Hill author of 15 books.\nIndeed, these days, writers don’t just craft fictitious characters.\nConsider the website for Terri Brisbin of Berlin Borough, who’s written more than 30 books about love in long-ago Scotland. It shows a castle rising dramatically in a Highlands setting, the scene tinted purple by a gathering dusk. Two gleaming swords stretch beneath Brisbin’s name and what might be a racy royal motto: “Compelling, sexy historical romance.”\n“An author today is a platform and a brand,” explained Brisbin. “It’s a career, not a hobby.”\nNow, I didn’t have time to read their books, but the titles are just great.\nBrisbin’s “Yield to the Highlander” suggests an intriguing twist to traffic rules in Glasgow. And a series by Gabrielle proceeds from “In the Barrister’s Chambers” to “In the Barrister’s Bed.” Up next, I bet: “Watching the Barrister Sleep.”\nThe covers are pretty dramatic, too.\nFor instance, I hadn’t realized Scotland’s climate is so mild that men there can wear kilts and nothing else — unless you count the occasional straps that barely restrain their rippling muscles.\n“There’s nothing wrong with a man in a kilt,” declared Brisbin, 58, who’s sold more than 1.7 million copies of her works. “Clearly it resonates with women.”\nBrisbin has repeatedly visited Scotland to ensure her books are authentic. On her latest trip, in 2009, she traveled alone through the countryside for more than three weeks.\nI’d like to think at some point her car broke down on a lonely moor. Then, a rugged but sensitive fellow emerged from the heather to help guide her on the path to self-discovery. Call that one “The Kilt Whisperer.”\nGabrielle — a former mechanical engineer who also worked as an attorney before her first book debuted in 2009 — conducts much of her research in libraries and online. “I’ve traveled a lot, but I’ve never been to London in the 1800s,” she explained.\nHer website notes Gabrielle’s offered workshops to aspiring writers, giving tips on how to pitch a book and land a contract. One of her topics would not interest romance-novel coverboys: “How to use conflict to get past the dreaded sagging middle.”\n“There are even workshops we can take on how to write and think like a man,” said Gabrielle, 42. OK, but you could just drink heavily and get the same effect.\nThe writers say their genre has a few guidelines — heroes should be underdogs; heroines, plucky and determined — but only one unbreakable rule.\nLove conquers all.\n“It has to have the happily ever after — always. That’s what women want,” said Gabrielle.\nBut even that commandment might be tweaked under the right circumstances, suggests Brisbin. “Generally, if they end up dying, it’s not a romance — unless they’re dead and paranormal.”\nThe workload becomes even more challenging when an author’s own romance results in children.\n“I used to be a morning writer,” said Long, 50, whose schedule changed with her daughter’s birth eight years ago. “I’ve learned to write at night — and my brain is definitely a morning brain.”\nGabrielle, who has two young daughters, shares that view. At her website, the writer says she’s learned “to grab time and use it when I can” on nights and weekends.\n“That being said, I do have a publishing contract (that) requires me to complete a 100,000-word manuscript ... in less than a year.”\nIt all sounds so grueling I’m tempted to cry out: “Wake up and smell the tea, m’lady! Too much ardor can make ye a martyr!“\nBut only a guy would think less is amour.\n“I wouldn’t be writing romance if I didn’t love it,” Gabrielle assured me. “It’s been happily-ever-after, at least so far.”\nJim Walsh’s column runs Mondays. Reach him at jwalsh@gannett.com.\nIf you go\n• A romance author event will be held at 1 p.m. Saturday at the Vogelson Regional Branch Library, 203 Laurel Road, Voorhees. The program will include chocolate tasting and door prizes. Register online at or call (856) 772-1636.\nJoin the Conversation\nTo find out more about Facebook commenting please read the Conversation Guidelines and FAQs",What is the one unbreakable rule in the genre of romance novels according to the authors?,The one unbreakable rule in the genre of romance novels is that love conquers all and it has to have the happily ever after.


In [None]:
#Taking an example from the dataset

question = datasets['train'][0]['question']
context = datasets['train'][0]['context']
answer = datasets['train'][0]['answer']

print(question +"\n\n"+context+"\n\n"+answer)

What is the Berry Export Summary 2028 and what is its purpose?

Caption: Tasmanian berry grower Nic Hansen showing Macau chef Antimo Merone around his property as part of export engagement activities.
THE RISE and rise of the Australian strawberry, raspberry and blackberry industries has seen the sectors redouble their international trade focus, with the release of a dedicated export plan to grow their global presence over the next 10 years.
Driven by significant grower input, the Berry Export Summary 2028 maps the sectors’ current position, where they want to be, high-opportunity markets and next steps.
Hort Innovation trade manager Jenny Van de Meeberg said the value and volume of raspberry and blackberry exports rose by 100 per cent between 2016 and 2017. She said the Australian strawberry industry experienced similar success with an almost 30 per cent rise in export volume and a 26 per cent rise in value to $32.6M over the same period.
“Australian berry sectors are in a firm positi

In [None]:
#Do inference using the question and context

infer = inference(question=question, context=context)

print(infer)

The Berry Export Summary 2028 is a comprehensive map that outlines the current position and future prospects of the Australian strawberry, raspberry and blackberry industries. It is a valuable resource for growers, exporters and other stakeholders in the berry industry to understand the opportunities for growth and development in the global market.

The purpose of the Berry Export Summary 2028 is to provide a clear and concise overview of the industry's current and future landscape, including:

- High-income countries with


In [None]:
#Importing nltk to calculate BLEU score

import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
#Function to compute BLEU score

from nltk.translate.bleu_score import sentence_bleu

def compute_bleu(reference, hypothesis):
    # Tokenize the reference and hypothesis sentences
    ref_tokens = nltk.word_tokenize(reference.lower())
    hyp_tokens = nltk.word_tokenize(hypothesis.lower())

    # Compute BLEU score
    bleu_score = sentence_bleu([ref_tokens], hyp_tokens)

    return bleu_score

# Calculate BLEU score to compare the answer with the inference result
bleu_score = compute_bleu(answer, infer)
print(f"BLEU Score: {bleu_score:.4f}")

BLEU Score: 0.1939


In [None]:
#It looks like BLEU score is 19%. We can see the answer and inference side by side below

print(answer)

print(infer)

The Berry Export Summary 2028 is a dedicated export plan for the Australian strawberry, raspberry, and blackberry industries. It maps the sectors’ current position, where they want to be, high-opportunity markets, and next steps. The purpose of this plan is to grow their global presence over the next 10 years.
The Berry Export Summary 2028 is a comprehensive map that outlines the current position and future prospects of the Australian strawberry, raspberry and blackberry industries. It is a valuable resource for growers, exporters and other stakeholders in the berry industry to understand the opportunities for growth and development in the global market.

The purpose of the Berry Export Summary 2028 is to provide a clear and concise overview of the industry's current and future landscape, including:

- High-income countries with


In [None]:
#Here I get ROUGE to calculate another metric

import evaluate

rouge = evaluate.load('rouge')

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
#Tokenize the sentences in the right way for ROUGE score

prediction = nltk.sent_tokenize(infer)
reference = nltk.sent_tokenize(answer)
print(prediction, reference)

['The Berry Export Summary 2028 is a comprehensive map that outlines the current position and future prospects of the Australian strawberry, raspberry and blackberry industries.', 'It is a valuable resource for growers, exporters and other stakeholders in the berry industry to understand the opportunities for growth and development in the global market.', "The purpose of the Berry Export Summary 2028 is to provide a clear and concise overview of the industry's current and future landscape, including:\n\n- High-income countries with"] ['The Berry Export Summary 2028 is a dedicated export plan for the Australian strawberry, raspberry, and blackberry industries.', 'It maps the sectors’ current position, where they want to be, high-opportunity markets, and next steps.', 'The purpose of this plan is to grow their global presence over the next 10 years.']


In [None]:
#Get the ROUGE score, which looks like 37% if we look at ROUGE-1

results = rouge.compute(predictions=prediction,
                       references=reference)

print(results)

{'rouge1': 0.366549213060841, 'rouge2': 0.24163357912648895, 'rougeL': 0.366549213060841, 'rougeLsum': 0.366549213060841}


# Conclusions

BASED ON BLEU score of 19% and ROUGE score of 37%, it looks like the inference is similar to the original answer to an extent.

However, it is also clear that this metric is not very helpful in this case, because if we evaluate the results as a human, the inference is actually informative and just as helpful as the original answer, even though the words used are not the same.

We will need to come up with better ways to evaluate generated text in the future.

As next step, we can also calculate BLEU and ROUGE for the whole RAG dataset to get an average number. We can also try finetuning Gemma on this dataset, and see if it performs better after training.

# Finetuning

In this section, I try use LORA and Supervised Fine Tunning (SFFT) to finetune the Gemma model with the RAG dataset.

I formatted the prompt to be appropriate for training, and used a LORA config, however, I haven't been able to succesfully run the trainer. The error is CUDA out of memory.

My next step to be to adjust the LORA Config and also the SFFT config, as well as the size of the training dataset so training can happen.

In [None]:
!pip install -q -U peft
!pip install -q -U trl

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.6/251.6 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m56.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.2/245.2 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.4/102.4 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━

In [None]:
from peft import LoraConfig

lora_config = LoraConfig(
    r=8,
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)

In [None]:
#Prompt formatting for training

def generate_prompt(data_point):
    prefix_text = 'Below is a context for the question. Give an answer that ' \
               'appropriately answers the question.\n\n'
    # Samples with additional context into.
    if data_point['context']:
        text = f"""<start_of_turn>user {prefix_text} {data_point["context"]} here is the question {data_point["question"]} <end_of_turn>\n<start_of_turn>model{data_point["answer"]} <end_of_turn>"""
    # Without
    else:
        text = f"""<start_of_turn>user {prefix_text} {data_point["question"]} <end_of_turn>\n<start_of_turn>model{data_point["answer"]} <end_of_turn>"""
    return text

# add the "prompt" column in the dataset
text_column = [generate_prompt(data_point) for data_point in datasets['train']]
datasets['train'] = datasets['train'].add_column("prompt", text_column)

In [None]:
#Adjust the dataset to include a prompt and text column

datasets['train'] = datasets['train'].map(lambda samples: tokenizer(samples["prompt"]), batched=True)
datasets['train'] = datasets['train'].add_column("text", text_column)
datasets['train'] = datasets['train'].map(lambda samples: tokenizer(samples["text"]), batched=True)

Map:   0%|          | 0/9600 [00:00<?, ? examples/s]

In [None]:
dataset = datasets["train"]

print(dataset)

Dataset({
    features: ['context', 'question', 'answer', 'prompt', 'input_ids', 'attention_mask', 'text'],
    num_rows: 9600
})


In [None]:
#Split the dataset to smaller train and test datasets

dataset = dataset.train_test_split(test_size=0.2)
train_data = dataset["train"]
test_data = dataset["test"]

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
import transformers

tokenizer.pad_token = tokenizer.eos_token

In [None]:
#The training function

trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=test_data,
    dataset_text_field="prompt",
    args=TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        max_steps=10,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit"),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
    peft_config=lora_config)

trainer.train()

max_steps is given, it will override any value given in num_train_epochs


OutOfMemoryError: CUDA out of memory. Tried to allocate 42.00 MiB. GPU 

In [None]:
#Saving model

new_model = "gemma-Finetune"
trainer.model.save_pretrained(new_model)

In [None]:
# Merge the model with LoRA weights
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map={"": 0},
)
merged_model= PeftModel.from_pretrained(base_model, new_model)
merged_model= merged_model.merge_and_unload()

# Save the merged model
merged_model.save_pretrained("merged_model",safe_serialization=True)
tokenizer.save_pretrained("merged_model")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [None]:
# Push the model and tokenizer to the Hugging Face Model Hub
merged_model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)

In [None]:
#Try the new model on the same question and context

question = datasets['train'][0]['question']
context = datasets['train'][0]['context']

model = merged_model

infer = inference(question=question, context=context)

print(infer)

# Document Loading and Splitting

This section is for loading and tokenizing the document using Langchain.

In [5]:
!pip install pypdf
!pip install tiktoken
!pip install -qU langchain
!pip install -U langchain-community

Collecting pypdf
  Downloading pypdf-4.2.0-py3-none-any.whl (290 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.4/290.4 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pypdf
Successfully installed pypdf-4.2.0
Collecting tiktoken
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tiktoken
Successfully installed tiktoken-0.7.0
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m973.5/973.5 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.2/310.2 kB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m124.4/124.4 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[

In [6]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [7]:
#Create folder data and put the pdf document in that folder

from langchain.document_loaders import PyPDFLoader

loaders = [
    PyPDFLoader("/content/drive/My Drive/data/ResearchProposal.pdf")
]
pages = []
for loader in loaders:
    pages.extend(loader.load())

In [8]:
#Split text after loading document

from langchain.text_splitter import TokenTextSplitter

text_splitter = TokenTextSplitter(chunk_size=128, chunk_overlap=12)
docs = text_splitter.split_documents(pages)

In [9]:
print(docs[0].page_content)

Research
proposal
Master
of
Research
-
Business
Analytics
-
S2
2024
Prepared
by
Taylor
Chu
-
LinkedIn
-
hathanh.chu@students.mq.edu.au
AREA
OF
RESEARCH
-
RESPONSIBLE
AI
Since
the
beginning
of
the
internet
and
especially
over
the
recent
pandemic,
we
have
evolved
to
live
much
of
our
lives
online,
and


# Embeddings and Vector Store

In [10]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.0-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.7/224.7 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-3.0.0


In [11]:
#Import HuggingFace Embeddings using sentence transformers

import numpy as np
from langchain_community.embeddings import (
    HuggingFaceEmbeddings
)
encoder = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L12-v2', model_kwargs = {'device': "cpu"})


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [12]:
#Example of how embeddings result look, and taking dot product of two embeddings

embeddings1 = encoder.embed_query("Document intelligence")
embeddings2 = encoder.embed_query(docs[0].page_content)
print(np.dot(embeddings1, embeddings2))

0.35274698992279585


In [13]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.0/27.0 MB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.8.0


In [14]:
#Using FAISS with distance strategy Dot product to calculate similarity between query and document context pieces

from langchain.vectorstores import FAISS
from langchain_community.vectorstores.utils import DistanceStrategy
faiss_db = FAISS.from_documents(docs, encoder, distance_strategy=DistanceStrategy.DOT_PRODUCT)

In [15]:
#Asked question find the 5 most similar context pieces (defined by k = 5)

question = "What is the research topic?"
retrieved_docs = faiss_db.similarity_search(question, k=5)
context = "".join(doc.page_content + "\n" for doc in retrieved_docs)
print(context)

and
for
topic
modeling,
I
will
use
a
State
of
the
Art
(SOTA)
model,
the
BERTopic
model.
As
I
read
the
literature,
I
will
identify
areas
of
potential
tension,
conﬂict
or
alignment
between
government,
industry
and
research,
and
attempt
to
verify
these
themes
through
analyzing
the
Australian
government’s
public

text
more
quickly,
and
ﬁnd
patterns
that
may
be
missed
by
a
human
reader
and
analyst.
As
this
area
of
research
continues
to
grow
and
expand,
the
method
can
be
utilized
to
help
the
government
more
quickly
assess
the
submissions
they
receive
from
public
consultations,
or
also
help
other
industry
and
research
institutions

as
the
510
submissions
from
the
public.
I
will
use
techniques
like
topic
modeling
and
sentiment
analysis
to
extract
the
areas
of
focus
in
each
submission.
As
the
submissions
have
diﬀerent
formatting
and
structure,
but
still
mostly
attempt
to
address
the
20
questions
put
forth
by
the
government’s
discussion
paper,
I
will
�s
discussion
paper,
I
will
need
to
extract
t

In [16]:
torch.cuda.empty_cache()

#Do inference using Gemma on the question and chosen pieces of context. The answer is pretty good in this case.

print(inference(question=question, context=context))

The research topic is the analysis of the Australian government's Safe and Responsible AI discussion paper and the 510 submissions from the public.


In [17]:
#Quote where the info was taken from

print("For this answer I used the following documents:")
for doc in retrieved_docs:
    print(doc.metadata)

For this answer I used the following documents:
{'source': '/content/drive/My Drive/data/ResearchProposal.pdf', 'page': 2}
{'source': '/content/drive/My Drive/data/ResearchProposal.pdf', 'page': 3}
{'source': '/content/drive/My Drive/data/ResearchProposal.pdf', 'page': 2}
{'source': '/content/drive/My Drive/data/ResearchProposal.pdf', 'page': 2}
{'source': '/content/drive/My Drive/data/ResearchProposal.pdf', 'page': 2}
