In [2]:
!pip install transformers peft accelerate bitsandbytes
!pip install -q -U wikipedia-api



In [3]:
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
import wikipediaapi

In [5]:
def define_device():
    torch_version = torch.__version__
    print(f'Python version: {torch_version}', end=" -- ")
    
    # Check if MPS (Multi-Process Service) device is available on MacOS
    if torch.backends.mps.is_available():
        # If MPS is available, print a message indicating its usage
        print("using MPS device on MacOS")
        # Define the device as MPS
        defined_device = torch.device("mps")
    else:
        # If MPS is not available, determine the device based on GPU availability
        defined_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Print a message indicating the selected device
        print(f"using {defined_device}")

    # Return the defined device
    return defined_device
    

In [6]:
# Pre-compile the regular expression pattern for better performance
BRACES_PATTERN = re.compile(r'\{.*?\}|\}')

def remove_braces_and_content(text):
    """Remove all occurrences of curly braces and their content from the given text"""
    return BRACES_PATTERN.sub('', text)

def clean_string(input_string):
    """Clean the input string."""
    
    # Remove extra spaces by splitting the string by spaces and joining back together
    cleaned_string = ' '.join(input_string.split())
    
    # Remove consecutive carriage return characters until there are no more consecutive occurrences
    cleaned_string = re.sub(r'\r+', '\r', cleaned_string)
    
    # Remove all occurrences of curly braces and their content from the cleaned string
    cleaned_string = remove_braces_and_content(cleaned_string)
    
    # Return the cleaned string
    return cleaned_string

In [9]:
def get_wikipedia_pages(categories):
    """Retrieve Wikipedia pages from a list of categories and extract their content"""
    
    # Create a Wikipedia object
    wiki_wiki = wikipediaapi.Wikipedia('Gemma AI Assistant (gemma@example.com)', 'sw')
    
    # Initialize lists to store explored categories and Wikipedia pages
    explored_categories = []
    wikipedia_pages = []

    # Iterate through each category
    print("- Processing Wikipedia categories:")
    for category_name in categories:
        print(f"\tExploring {category_name} on Wikipedia")
        
        # Get the Wikipedia page corresponding to the category
        category = wiki_wiki.page("Category:" + category_name)
        
        # Extract Wikipedia pages from the category and extend the list
        wikipedia_pages.extend(extract_wikipedia_pages(wiki_wiki, category_name))
        
        # Add the explored category to the list
        explored_categories.append(category_name)

    # Extract subcategories and remove duplicate categories
    categories_to_explore = [item.replace("Category:", "") for item in wikipedia_pages if "Category:" in item]
    wikipedia_pages = list(set([item for item in wikipedia_pages if "Category:" not in item]))
    
    # Explore subcategories recursively
    while categories_to_explore:
        category_name = categories_to_explore.pop()
        print(f"\tExploring {category_name} on Wikipedia")
        
        # Extract more references from the subcategory
        more_refs = extract_wikipedia_pages(wiki_wiki, category_name)

        # Iterate through the references
        for ref in more_refs:
            # Check if the reference is a category
            if "Category:" in ref:
                new_category = ref.replace("Category:", "")
                # Add the new category to the explored categories list
                if new_category not in explored_categories:
                    explored_categories.append(new_category)
            else:
                # Add the reference to the Wikipedia pages list
                if ref not in wikipedia_pages:
                    wikipedia_pages.append(ref)

    # Initialize a list to store extracted texts
    extracted_texts = []
    
    # Iterate through each Wikipedia page
    print("- Processing Wikipedia pages:")
    for page_title in tqdm(wikipedia_pages):
        try:
            # Make a request to the Wikipedia page
            page = wiki_wiki.page(page_title)

            # Check if the page summary does not contain certain keywords
            if "Biden" not in page.summary and "Trump" not in page.summary:
                # Append the page title and summary to the extracted texts list
                if len(page.summary) > len(page.title):
                    extracted_texts.append(page.title + " : " + clean_string(page.summary))

                # Iterate through the sections in the page
                for section in page.sections:
                    # Append the page title and section text to the extracted texts list
                    if len(section.text) > len(page.title):
                        extracted_texts.append(page.title + " : " + clean_string(section.text))
                        
        except Exception as e:
            print(f"Error processing page {page.title}: {e}")
                    
    # Return the extracted texts
    return extracted_texts

In [10]:
def extract_wikipedia_pages(wiki_wiki, category_name):
    """Extract all references from a category on Wikipedia"""
    
    # Get the Wikipedia page corresponding to the provided category name
    category = wiki_wiki.page("Category:" + category_name)
    
    # Initialize an empty list to store page titles
    pages = []
    
    # Check if the category exists
    if category.exists():
        # Iterate through each article in the category and append its title to the list
        for article in category.categorymembers.values():
            pages.append(article.title)
    
    # Return the list of page titles
    return pages

In [11]:
categories = ["Kilimo", "Mazao", "Magonjwa ya mimea", "Mbolea", "Mkulima", "Shamba" ]
# categories = ["Taasisi ya kuzuia na kupambana na rushwa" ]
extracted_texts = get_wikipedia_pages(categories)
print("Found", len(extracted_texts), "Wikipedia pages")

- Processing Wikipedia categories:
	Exploring Kilimo on Wikipedia
	Exploring Mazao on Wikipedia
	Exploring Magonjwa ya mimea on Wikipedia
	Exploring Mbolea on Wikipedia
	Exploring Mkulima on Wikipedia
	Exploring Shamba on Wikipedia
- Processing Wikipedia pages:


100%|██████████| 54/54 [00:04<00:00, 12.83it/s]

Found 121 Wikipedia pages





In [12]:
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer

model = AutoPeftModelForCausalLM.from_pretrained(
    "Mollel/Gemma_Swahili_Mollel_1_epoch",
)

tokenizer = AutoTokenizer.from_pretrained("Mollel/Gemma_Swahili_Mollel_1_epoch")

adapter_config.json:   0%|          | 0.00/730 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors:   0%|          | 0.00/5.57G [00:00<?, ?B/s]

Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/200M [00:00<?, ?B/s]

In [13]:
input_prompt = """
### Instruction:
{}

### Input:
{}

### Response:
{}"""

In [15]:
question = "Andika aya fupi kuhusu mada iliyotolewa: Jinsi ya kulinda mazao shambani yasiharibiwe na wadudu"
input_text = input_prompt.format(
        {question}, # instruction
        "", # input
        "", # output - leave this blank for generation!
    )

inputs = tokenizer([input_text], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 300, use_cache = True)
response = tokenizer.batch_decode(outputs)[0]
response

"<bos>\n### Instruction:\n{'Andika aya fupi kuhusu mada iliyotolewa: Jinsi ya kulinda mazao shambani yasiharibiwe na wadudu'}\n\n### Input:\n\n\n### Response:\nKuna njia kadhaa za kulinda mazao shambani dhidi ya wadudu. Njia moja ni kutumia mbinu za kilimo endelevu, kama vile kupanda mazao mengi, kugeuza udongo, na kutumia mbolea za kikaboni, ambazo zote zinaweza kusaidia kudhibiti idadi ya wadudu. Njia nyingine ni kutumia njia za asili za kudhibiti wadudu, kama vile kutumia wadudu wa wadudu au kuunda makao ya wadudu wa wadudu. Pia kuna njia za kemikali, kama vile dawa za wadudu na dawa za kuua wadudu, ambazo zinaweza kutumiwa kuua wadudu. Ni muhimu kutumia njia hizi kwa uangalifu na kufuata maagizo ya matumizi, kwani zinaweza kuwa na athari mbaya kwa mazingira na afya ya binadamu.<eos>"

In [16]:
response

"<bos>\n### Instruction:\n{'Andika aya fupi kuhusu mada iliyotolewa: Jinsi ya kulinda mazao shambani yasiharibiwe na wadudu'}\n\n### Input:\n\n\n### Response:\nKuna njia kadhaa za kulinda mazao shambani dhidi ya wadudu. Njia moja ni kutumia mbinu za kilimo endelevu, kama vile kupanda mazao mengi, kugeuza udongo, na kutumia mbolea za kikaboni, ambazo zote zinaweza kusaidia kudhibiti idadi ya wadudu. Njia nyingine ni kutumia njia za asili za kudhibiti wadudu, kama vile kutumia wadudu wa wadudu au kuunda makao ya wadudu wa wadudu. Pia kuna njia za kemikali, kama vile dawa za wadudu na dawa za kuua wadudu, ambazo zinaweza kutumiwa kuua wadudu. Ni muhimu kutumia njia hizi kwa uangalifu na kufuata maagizo ya matumizi, kwani zinaweza kuwa na athari mbaya kwa mazingira na afya ya binadamu.<eos>"

In [18]:
def question_gemma(question, model=model, tokenizer=tokenizer, temperature=0.0, return_answer=False):
    input_text = input_prompt.format(
        {question}, # instruction
        "", # input
        "", # output - leave this blank for generation!
    )
    
    
    input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
    if temperature > 0:
        do_sample=True
    else:
        do_sample=False
    outputs = model.generate(**input_ids, 
                             max_new_tokens=256, 
                             do_sample=do_sample, 
                             temperature=temperature)
    result = str(tokenizer.decode(outputs[0])).replace("<bos>", "").replace("<eos>", "").strip()
    if return_answer:
        return result
    else:
        print(result)

In [20]:
question_gemma("Nitawezaje kuotesha mahindi bila kuharibiwa na wadudu?")

### Instruction:
{'Nitawezaje kuotesha mahindi bila kuharibiwa na wadudu?'}

### Input:


### Response:
Kuna njia kadhaa za kuzuia wadudu kwenye mahindi bila kuharibu mazao. Hapa kuna vidokezo kadhaa: 1. Ufugaji wa wadudu: Njia moja bora ya kuzuia wadudu kwenye mahindi ni kwa kuunda wadudu. Hii inahusisha kuunda wadudu ambao huvuta wadudu na kuwafanya waishi kwa muda mfupi. 2. Ufugaji wa wadudu: Njia nyingine ya kuzuia wadudu ni kwa kuunda wadudu ambao huvuta wadudu na kuwafanya waishi kwa muda mfupi. 3. Ufugaji wa wadudu: Njia nyingine ya kuzuia wadudu ni kwa kuunda wadudu ambao huvuta wadudu na kuwafanya waishi kwa muda mfupi. 4. Ufugaji wa wadudu: Njia nyingine ya kuzuia wadudu ni kwa kuunda wadudu ambao huvuta wadudu na kuwafanya waishi kwa muda mfupi. 5. Ufugaji wa wadudu: Njia nyingine ya ku


In [21]:
qa_data = []

def extract_json(text, word):
    pattern = fr'"{word}": "(.*?)"'
    match = re.search(pattern, text)
    if match:
        return match.group(1)
    else:
        return ""

no_extracted_texts = 100 # increment this number up to len(extracted_texts)
question_ratio = 24 # decrement this number to produce more questions (suggested: 24)

for i in tqdm(range(len(extracted_texts[:no_extracted_texts]))):

    
    question_text = f"""Tengeneza swali na jibu lake kutokana na taarifa ifuatayo, ingiza 
    taarifa zote muhimu kwenye swali husika (usidhani msomaji anajua maandishi hayo au kitu chochote), pia jibu liwe lina eleweka na urudishe kwa 
    mfumo wa JSON pekee {'{"swali": "...", "jibu": "..."}'}
    
    
    Hii hapa ni taarifa ya kufafanua:
    {extracted_texts[i]}

    OUTPUT JSON:
    """

    no_questions = min(1, len(extracted_texts[i]) // question_ratio)
    
    for j in range(no_questions):
    
        result = question_gemma(question_text, model=model, temperature=0.9, return_answer=True)
        result = result.split("OUTPUT JSON:")[-1]

        question = extract_json(result, "swali")
        answer = extract_json(result, "jibu")
        

        qa_data.append(f"{question}\n{answer} \n")


100%|██████████| 100/100 [33:48<00:00, 20.28s/it]


In [22]:
qa_data

['Swali: Ni nini jina la chombo cha kupepetea cha nafaka cha jadi kinachotengenezwa kwa mbao? Jibu: Ungo. Kwa maneno mengine, kaa au jina la chombo cha kupanda cha kibinafsi cha kaa.\n \n',
 '\n \n',
 'Nini ni Pisum sativum?\n \n',
 'Ni aina gani ya maapulo yanayotumiwa na wanadamu na wanaweza kutumiwaje?\nTumbaku (pia inajulikana kama tumbako) ni majani ya mvua yanayotumiwa na wanadamu kwa kuvuta sigara, kutafuna mdomoni, au kunuswa. Tumbako huwa na nikotini, kemikali ya dawa ya kulevya ambayo mwili hujitayarisha haraka sana, na kumfanya iwe ngumu kupinga tabia ya kuvuta sigara.  \n',
 'Ni mnyama gani aliyeokoa wakazi wa mji kutoka kwa mto wa pango?\nMchwa \n',
 'Ni nini kilichofanya tumbaku iwe maarufu sana?\nNi dawa hasa nikotini ndani yake ambayo inathiri ubongo na neva za binadamu, na kusababisha watu kuhisi utulivu na raha. \n',
 '\n \n',
 'Ni nchi gani zinazolima tumbaku zaidi ulimwenguni?\nChina (39.6%), India (8.3%), Brazil (7.0%) na Marekani (4.6%). \n',
 'Ni siku ngapi baada

In [25]:
extracted_texts[11]

'Kilimo : Kilimo ni mojawapo ya shughuli za mwanadamu ambayo humsaidia kupata mazao kwa ajili ya chakula na mazao mengine kwa ajili ya biashara ambayo inaweza kumuingizia kipato kwa ajili ya maisha yake, pia kipato hicho husaidia katika kukuza uchumi wa nchi yake ili hatimaye itakuwa imejikomboa kiuchumi. Sekta hiyo ndiyo kiinua uchumi kikubwa kwa mataifa mengi. Kwa jumla serikali zinatakiwa zitoe kipaumbele kwa sekta ya kilimo ili kuepukana na janga la njaa: mojawapo ya mahitaji makuu ya mwanadamu ni chakula. Serikali zinatakiwa zizidi kuongeza zana za kilimo ili watu waweze kupata kwa wingi mazao ya biashara na mazao ya chakula. Pia serikali itoe elimu juu ya kilimo bora kama kitega uchumi ambacho huwasaidia watu kujipatia mazao na kipato cha kwao ili kuendesha maisha yao binafsi. Wakulima wanatakiwa kupewa mikopo ya pesa, na pia kujishughulisha na kilimo cha umwagiliaji, si tu kutegemea mvua.'

In [30]:
extracted_data = pd.DataFrame(data=extracted_texts)

In [31]:
extracted_data.head()

Unnamed: 0,0
0,Ungo : Ungo ni chombo cha kupepetea vitu hasa ...
1,Zao la biashara : Zao la biashara ni chochote ...
2,Njegere ya kizungu : Njegere za kizungu (jina ...
3,Tumbaku : Tumbaku (pia tumbako) ni majani (min...
4,Tumbaku : Asili ya tumbaku ni Amerika ambako w...


In [32]:
extracted_data.to_csv('/kaggle/working/extracted_data.csv')

In [35]:
!pip install ipywidgets=7.7.1
!pip install huggingface_hub

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[31mERROR: Invalid requirement: 'ipywidgets=7.7.1'
Hint: = is not a valid operator. Did you mean == ?[0m[31m
[0m

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [36]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [37]:
import datasets
from datasets import Dataset

In [None]:
from datasets import load_dataset
dataset = load_dataset("nsoma/swahili-llm-dataset-0")

dataset.push_to_hub()