In [1]:
import pandas as pd
from string import digits, punctuation
from unidecode import unidecode
import re
from collections import Counter


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
def read_file(file_path):
    with open(file_path, "r") as file:
        f = file.read()
    f = f.split("\n")
    return f

In [3]:
eng = read_file("../data/en-kikuyu/eng.txt")

In [4]:
kikuyu = read_file("../data/en-kikuyu/kikuyu.txt")

In [5]:
en_kikuyu = pd.DataFrame({'kikuyu': kikuyu,
              'eng': eng})

In [6]:
en_kikuyu

Unnamed: 0,kikuyu,eng
0,Marĩĩtwa mamwe thĩinĩ wa gĩcunjĩ gĩkĩ nĩ macen...,Some names in this article have been changed .
1,Jehova nĩrĩo rĩĩtwa rĩa Ngai ta ũrĩa rĩguũrĩti...,Jehovah is the name of God as revealed in the ...
2,Bibilia yugaga ũũ : “ Ngoro ya mwana ĩthaĩkanĩ...,The Bible says that “ foolishness is bound up ...
3,© 2016 Watch Tower Bible and Tract Society of ...,© 2016 Watch Tower Bible and Tract Society of ...
4,( Rora rungu rwa MORUTANI MA BIBILIA > MACOKIO...,( Look under BIBLE TEACHINGS > BIBLE QUESTIONS...
...,...,...
2694,Ngũbia ya ũhonokio ( Rora kĩbungo gĩa 15 - 18 ),The helmet of salvation ( See paragraphs 15 - ...
2695,Nĩ nyonete atĩ andũ nĩ mathikagĩrĩria wega man...,I’ve found that people respond well when they ...
2696,Rũhiũ rwa roho ( Rora kĩbungo gĩa 19 - 20 ),The sword of the spirit ( See paragraphs 19 - ...
2697,"Kũgerera ũteithio wa Jehova , no tũhote kwĩhan...","With Jehovah’s help , we can stand firm agains..."


In [7]:
def clean_text(x):
    remove_digits = str.maketrans('', '', digits)
    x = x.translate(remove_digits).lower()
    x = re.sub(r'(www|http|https)\S+', '', x)
    x = unidecode(x)
    x = x.translate(str.maketrans('', '', punctuation))
    return x

In [8]:
en_kikuyu["kikuyu"] = en_kikuyu.kikuyu.apply(clean_text)
en_kikuyu["eng"] = en_kikuyu.eng.apply(clean_text)

In [9]:
en_kikuyu

Unnamed: 0,kikuyu,eng
0,mariitwa mamwe thiini wa gicunji giki ni macen...,some names in this article have been changed
1,jehova nirio riitwa ria ngai ta uria riguuriti...,jehovah is the name of god as revealed in the ...
2,bibilia yugaga uu ngoro ya mwana ithaikaniti...,the bible says that foolishness is bound up i...
3,c watch tower bible and tract society of penn...,c watch tower bible and tract society of penn...
4,rora rungu rwa morutani ma bibilia macokio m...,look under bible teachings bible questions a...
...,...,...
2694,ngubia ya uhonokio rora kibungo gia,the helmet of salvation see paragraphs
2695,ni nyonete ati andu ni mathikagiriria wega man...,ive found that people respond well when they s...
2696,ruhiu rwa roho rora kibungo gia,the sword of the spirit see paragraphs
2697,kugerera uteithio wa jehova no tuhote kwihand...,with jehovahs help we can stand firm against ...


In [10]:
eng_ = read_file("../data/eng-swa/gamayun_kit5k.eng")
swahili = read_file("../data/eng-swa/gamayun_kit5k.swa")

In [11]:
en_swahili = pd.DataFrame({"eng": eng_,
                           "swahili": swahili})
en_swahili

Unnamed: 0,eng,swahili
0,Is that your new friend?,Huyo ni rafiki yako mpya?
1,Jacob wasn't interested in baseball.,Job hana hamu ya mpira wa vikapu.
2,Adam told me that Alice had a new boyfriend.,Adam aliniambia kuwa Alice alikuwa na mpenzi m...
3,The radio didn't inform about the accident.,Radio haikutanga kuhusu ajali hiyo.
4,Adam is worried we'll get lost.,Adamu ana wasiwasi tutapotea.
...,...,...
4996,"C'mon, let's eat something!","Njoo, wacha tule kitu!"
4997,The essence of all art is to have pleasure in ...,Kiini cha sanaa yote ni kuwa na furaha katika ...
4998,"As I entered the coffee bar, I found two young...","Nilipoingia kwenye mkahawa, nilipata vijana wa..."
4999,"When you swallow a dangerous substance, what y...","Unapomeza dutu hatari, kile unahitaji kufanya ..."


In [12]:
en_swahili["eng"] = en_swahili.eng.apply(clean_text)
en_swahili["swahili"] =en_swahili.swahili.apply(clean_text)
en_swahili

Unnamed: 0,eng,swahili
0,is that your new friend,huyo ni rafiki yako mpya
1,jacob wasnt interested in baseball,job hana hamu ya mpira wa vikapu
2,adam told me that alice had a new boyfriend,adam aliniambia kuwa alice alikuwa na mpenzi m...
3,the radio didnt inform about the accident,radio haikutanga kuhusu ajali hiyo
4,adam is worried well get lost,adamu ana wasiwasi tutapotea
...,...,...
4996,cmon lets eat something,njoo wacha tule kitu
4997,the essence of all art is to have pleasure in ...,kiini cha sanaa yote ni kuwa na furaha katika ...
4998,as i entered the coffee bar i found two young ...,nilipoingia kwenye mkahawa nilipata vijana waw...
4999,when you swallow a dangerous substance what yo...,unapomeza dutu hatari kile unahitaji kufanya i...


In [13]:
en_lug = pd.read_csv("../data/eng-lug.csv", encoding='latin-1').iloc[:,:2]
en_lug

Unnamed: 0,English,Luganda
0,All refugees were requested to register with t...,Abanoonyiboobubudamu bonna baasabiddwa beewand...
1,They called for a refugees' meeting yesterday.,Baayise olukungaana lw'abanoonyiboobubudamu eg...
2,Refugees had misunderstandings between thems...,Abanoonyiboobubudamu b'abadde n'obutakkaanya w...
3,We were urged to welcome refugees into our com...,Twakubirizibwa okwaniriza abanoonyiboobubudamu...
4,More development is achieved when we work toge...,Bwe tukolera awamu enkulaakulana enyingi efuni...
...,...,...
15995,,
15996,,
15997,,
15998,,


In [14]:
en_lug.dropna(inplace=True)
en_lug["English"] = en_lug.English.apply(clean_text)
en_lug["Luganda"] = en_lug.Luganda.apply(clean_text)
en_lug

Unnamed: 0,English,Luganda
0,all refugees were requested to register with t...,abanoonyiboobubudamu bonna baasabiddwa beewand...
1,they called for a refugees meeting yesterday,baayise olukungaana lwabanoonyiboobubudamu eggulo
2,refugees had misunderstandings between thems...,abanoonyiboobubudamu babadde nobutakkaanya wak...
3,we were urged to welcome refugees into our com...,twakubirizibwa okwaniriza abanoonyiboobubudamu...
4,more development is achieved when we work toge...,bwe tukolera awamu enkulaakulana enyingi efunibwa
...,...,...
15017,senior officials exaggerated the numbers to mi...,abakungu aboku ntikko baayongeza omuwendo okwe...
15018,there is a high risk in the spread of diseases...,akatyabaga kokusaasaana kwendwadde mu nkambi k...
15019,they were relocated to another settlement camp,baasengulwa ne batwalibwa mu nkambi endala
15020,some people who flee from south sudan already ...,abantu abamu abava mu south sudan balina abeng...


In [15]:
strings = " ".join(en_kikuyu.kikuyu) + " ".join(en_swahili.swahili) + " ".join(en_lug.Luganda)
tokens = Counter(strings.split())

In [16]:
len(tokens)

32810

In [17]:
keys_with_value_1 = [key for key, value in tokens.items() if value == 1]
new_tokens = {key: value for key, value in tokens.items() if value != 1}

In [18]:
len(keys_with_value_1)

20578

In [19]:
# tokens = "\n".join(list(new_tokens.keys()))

In [36]:
# with open("../data/processed/tokens.txt", "w") as file:
    # file.write(tokens)

In [77]:
# file

In [20]:
en_kikuyu

Unnamed: 0,kikuyu,eng
0,mariitwa mamwe thiini wa gicunji giki ni macen...,some names in this article have been changed
1,jehova nirio riitwa ria ngai ta uria riguuriti...,jehovah is the name of god as revealed in the ...
2,bibilia yugaga uu ngoro ya mwana ithaikaniti...,the bible says that foolishness is bound up i...
3,c watch tower bible and tract society of penn...,c watch tower bible and tract society of penn...
4,rora rungu rwa morutani ma bibilia macokio m...,look under bible teachings bible questions a...
...,...,...
2694,ngubia ya uhonokio rora kibungo gia,the helmet of salvation see paragraphs
2695,ni nyonete ati andu ni mathikagiriria wega man...,ive found that people respond well when they s...
2696,ruhiu rwa roho rora kibungo gia,the sword of the spirit see paragraphs
2697,kugerera uteithio wa jehova no tuhote kwihand...,with jehovahs help we can stand firm against ...


In [21]:
processed_data = pd.DataFrame(columns=["source", "target"])

In [22]:
processed_data["source"] = en_kikuyu.kikuyu.apply(lambda x: "translate kikuyu to english: "  + x)
processed_data["target"] = en_kikuyu.eng

processed_data["source"] = en_kikuyu.eng.apply(lambda x: "translate english to kikuyu: "  + x)
processed_data["target"] = en_kikuyu.kikuyu

In [23]:
new_rows = pd.DataFrame({"source": en_lug.English.apply(lambda x: "translate english to luganda: " + x).values,
                         "target": en_lug.Luganda.values})

processed_data = processed_data._append(new_rows, ignore_index=True)


new_rows = pd.DataFrame({"source": en_lug.Luganda.apply(lambda x: "translate luganda to english: " + x).values,
                         "target": en_lug.English.values})

processed_data = processed_data._append(new_rows, ignore_index=True)

In [24]:
new_rows = pd.DataFrame({"source": en_swahili.eng.apply(lambda x: "translate english to swahili: "+ x).values,
                         "target": en_swahili.swahili.values})

processed_data = processed_data._append(new_rows, ignore_index=True)

new_rows = pd.DataFrame({"source": en_swahili.swahili.apply(lambda x: "translate swahili to english: " + x).values,
                         "target": en_swahili.eng.values})

processed_data = processed_data._append(new_rows, ignore_index=True)


In [25]:
len(processed_data)

42743

In [26]:
processed_data

Unnamed: 0,source,target
0,translate english to kikuyu: some names in thi...,mariitwa mamwe thiini wa gicunji giki ni macen...
1,translate english to kikuyu: jehovah is the na...,jehova nirio riitwa ria ngai ta uria riguuriti...
2,translate english to kikuyu: the bible says th...,bibilia yugaga uu ngoro ya mwana ithaikaniti...
3,translate english to kikuyu: c watch tower bi...,c watch tower bible and tract society of penn...
4,translate english to kikuyu: look under bible...,rora rungu rwa morutani ma bibilia macokio m...
...,...,...
42738,translate swahili to english: njoo wacha tule ...,cmon lets eat something
42739,translate swahili to english: kiini cha sanaa ...,the essence of all art is to have pleasure in ...
42740,translate swahili to english: nilipoingia kwen...,as i entered the coffee bar i found two young ...
42741,translate swahili to english: unapomeza dutu h...,when you swallow a dangerous substance what yo...


In [27]:
# processed_data.to_csv("../data/processed/processed_data.csv")

In [53]:
# with open("../data/processed/tokens.txt", "r") as file:
#     f = file.read()

In [55]:
f.split("\n")

['mariitwa',
 'mamwe',
 'thiini',
 'wa',
 'gicunji',
 'giki',
 'ni',
 'macenjetio',
 'jehova',
 'nirio',
 'riitwa',
 'ria',
 'ngai',
 'ta',
 'uria',
 'bibilia',
 'yugaga',
 'uu',
 'ngoro',
 'ya',
 'mwana',
 'na',
 'c',
 'watch',
 'tower',
 'bible',
 'and',
 'tract',
 'society',
 'of',
 'pennsylvania',
 'rora',
 'rungu',
 'rwa',
 'morutani',
 'ma',
 'macokio',
 'ciuria',
 'cia',
 'iri',
 'kia',
 'wira',
 'thi',
 'yothe',
 'kuruta',
 'andu',
 'mbere',
 'mihothi',
 'niguo',
 'no',
 'uhingure',
 'jworg',
 'ki',
 'o',
 'tiga',
 'haria',
 'undu',
 'ungi',
 'ri',
 'maandiko',
 'maria',
 'mothe',
 'moimite',
 'hari',
 'the',
 'in',
 'gikuyu',
 'new',
 'world',
 'translation',
 'mathayo',
 'kuguuririo',
 'uritu',
 'uri',
 'mundu',
 'utumaga',
 'indi',
 'wega',
 'gukena',
 'gutumaga',
 'thoma',
 'akorintho',
 'aatuire',
 'mawira',
 'mega',
 'indo',
 'tukwaririria',
 'uhoro',
 'uriku',
 'riri',
 'niki',
 'b',
 'iriku',
 'thimo',
 'maundu',
 'mariku',
 'tukaaririria',
 'kiria',
 'kirumiriire',
 'k

  from .autonotebook import tqdm as notebook_tqdm
