# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import requests
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import warnings
warnings.simplefilter("ignore")

# Data Preparation

In [2]:
# Read JSONL file into a Pandas DataFrame

# Raw GitHub locations of the three SALT v1.2 JSONL splits; all of them live
# under the same directory, so the shared prefix is spelled out once.
SALT_BASE_URL = "https://raw.githubusercontent.com/SunbirdAI/salt/main/v1.2"

dev_url = f"{SALT_BASE_URL}/salt-dev-v1.2.jsonl"
test_url = f"{SALT_BASE_URL}/salt-test-v1.2.jsonl"
train_url = f"{SALT_BASE_URL}/salt-train-v1.2.jsonl"

# Function to read JSONL data from the URL and return a DataFrame
def read_jsonl(url, timeout=30):
    """Download a JSON-Lines document and parse it into a DataFrame.

    Parameters
    ----------
    url : str
        Address of the JSONL resource to fetch with an HTTP GET.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        One row per JSON line of the response body.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status. Without this check an
        HTML error page would be handed to the JSON parser.
    """
    from io import StringIO

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # Recent pandas releases deprecate passing a literal JSON string to
    # read_json; a file-like wrapper works on both old and new versions.
    return pd.read_json(StringIO(response.text), lines=True)

# Download the three splits (dev, test, train — fetched in that order) and
# bind each resulting DataFrame to its own name.
dev_data, test_data, train_data = (
    read_jsonl(split_url) for split_url in (dev_url, test_url, train_url)
)


In [3]:
train_data

Unnamed: 0,text,tts-speech
0,"{'eng': 'It was not a ghost refugee camp.', 'l...",
1,{'eng': 'I want to go to town over the weekend...,
2,"{'eng': 'I have high blood pressure.', 'lug': ...",
3,{'eng': 'You need to have priorities in life.'...,
4,{'eng': 'It's a good practice to help those in...,
...,...,...
23942,{'eng': 'It has all happened in the intervenin...,
23943,{'eng': 'Many people have recovered from coron...,
23944,{'eng': 'The government will provide support t...,
23945,{'eng': 'There are many things that we need to...,


In [4]:
# Access the values for each language in separate columns
def extract_language_values(df, languages=('eng', 'lug', 'ach', 'teo', 'lgg', 'nyn')):
    """Expand the dicts stored in ``df['text']`` into one column per language.

    The frame is modified in place and nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'text'`` column whose cells are dicts keyed by
        language code.
    languages : iterable of str, optional
        Language codes to pull out; each becomes a column of ``df``.

    Raises
    ------
    KeyError
        If one of ``languages`` does not occur as a key in any record.
    """
    languages = list(languages)

    # Build the expanded frame in one go from the list of dicts (much cheaper
    # than creating a pd.Series per row) and keep the original index so the
    # new columns line up with the existing rows.
    expanded = pd.DataFrame(df['text'].tolist(), index=df.index)

    missing = [lang for lang in languages if lang not in expanded.columns]
    if missing:
        raise KeyError(f"language codes not found in 'text' records: {missing}")

    # Select by key name rather than by position, so the order in which keys
    # happen to appear inside the dicts cannot mislabel a column.
    for lang in languages:
        df[lang] = expanded[lang]

# Every split gets the same treatment: its 'text' dicts are expanded into
# per-language columns, in place, by extract_language_values.
datasets = [dev_data, test_data, train_data]

for split_frame in datasets:
    extract_language_values(split_frame)


In [5]:
train_data.head(7)

Unnamed: 0,text,tts-speech,eng,lug,ach,teo,lgg,nyn
0,"{'eng': 'It was not a ghost refugee camp.', 'l...",,It was not a ghost refugee camp.,Enkambi y'abanoonyiboobubudamu teyaliiwo mu bu...,Pe obedo kem goba goba,Mam arai ekabi lo erai ekwam.,Eri aa'ni ndra kembe emunyale eyini aa'zu inzo...,Bukaba butari butaaho bw'ekigingirire.
1,{'eng': 'I want to go to town over the weekend...,,I want to go to town over the weekend.,Njagala kugenda mu kibuga ku wiikendi.,Amito citi I taun I tum cabit.,Akoto eong alosit oibuga owiken.,Ale mu tawunia sabitini i deria ra,Ninyenda kuza omu tauni akakuhendera kwa wiiki.
2,"{'eng': 'I have high blood pressure.', 'lug': ...",,I have high blood pressure.,Nina puleesa eya waggulu.,Atye ki peko me two pressure,Aja eong keda epuresa.,Ma vu azo ari tuza niri ci,Omutima gwangye niguteerera ahaiguru.
3,{'eng': 'You need to have priorities in life.'...,,You need to have priorities in life.,Olina okubaako ebikulu by'okulembeza mu bulamu.,Mite ni ibed ki jami ma I mito timone mukwo I ...,Ibusakinit jo ajaut keda alosikineta nuka apol...,Le ma ma ovu e'yo,Noyetenga kugira ebiwareeba nka bikuru omu mag...
4,{'eng': 'It's a good practice to help those in...,,It's a good practice to help those in need.,Kikolwa kirungi okuyamba abo abali mu bwetaavu.,Obedo tic maber me konyo joo matye I peko,Ejok aingaranakin ngul lu icanas.,Eri 'yeta muke ni 'ba afa koko 'diyi ma aza kozu,N'ekikorwa kirungi okuhwera abaine ebyetengo.
5,"{'eng': 'What they did was very awful.', 'lug'...",,What they did was very awful.,Kye baakola kyali kibi nnyo.,Ngo ma gutimo obedo rac tutwal,Aroko noi nuapotu kesi kiswamata.,E'yo yini 'yeleri ndra onzi tu.,Eki baakozire kikaba kibi munonga.
6,{'eng': 'What are some of the challenges women...,{'lug': 'https://salt-tts-data.s3.eu-west-1.am...,What are some of the challenges women face in ...,Bisomooza ki abakyala bye basanga nga bagezaak...,Mono peki ango ma mon nongo ikare me nongo kon...,Anubo atiokisio acie etakanikinete angor kowai...,E'yo ndundu oku eyini esu ewaru azakoma fele o...,"Ni buremeezi ki obumwe, obu abakazi barikushan..."


In [6]:
# Drop the raw 'text' dict column and the 'tts-speech' column so that only the
# six per-language columns remain. Dropping by name (with errors='ignore')
# keeps the cell idempotent: re-running it does not strip further columns the
# way a positional iloc[:, 2:] slice would.
train_data = train_data.drop(columns=['text', 'tts-speech'], errors='ignore')
train_data

Unnamed: 0,eng,lug,ach,teo,lgg,nyn
0,It was not a ghost refugee camp.,Enkambi y'abanoonyiboobubudamu teyaliiwo mu bu...,Pe obedo kem goba goba,Mam arai ekabi lo erai ekwam.,Eri aa'ni ndra kembe emunyale eyini aa'zu inzo...,Bukaba butari butaaho bw'ekigingirire.
1,I want to go to town over the weekend.,Njagala kugenda mu kibuga ku wiikendi.,Amito citi I taun I tum cabit.,Akoto eong alosit oibuga owiken.,Ale mu tawunia sabitini i deria ra,Ninyenda kuza omu tauni akakuhendera kwa wiiki.
2,I have high blood pressure.,Nina puleesa eya waggulu.,Atye ki peko me two pressure,Aja eong keda epuresa.,Ma vu azo ari tuza niri ci,Omutima gwangye niguteerera ahaiguru.
3,You need to have priorities in life.,Olina okubaako ebikulu by'okulembeza mu bulamu.,Mite ni ibed ki jami ma I mito timone mukwo I ...,Ibusakinit jo ajaut keda alosikineta nuka apol...,Le ma ma ovu e'yo,Noyetenga kugira ebiwareeba nka bikuru omu mag...
4,It's a good practice to help those in need.,Kikolwa kirungi okuyamba abo abali mu bwetaavu.,Obedo tic maber me konyo joo matye I peko,Ejok aingaranakin ngul lu icanas.,Eri 'yeta muke ni 'ba afa koko 'diyi ma aza kozu,N'ekikorwa kirungi okuhwera abaine ebyetengo.
...,...,...,...,...,...,...
23942,It has all happened in the intervening period,Byonna bibaddewo mu makkati w'ebiseera.,Magi weng otimme I kare me timo gin mo me laro...,Iswamauna ngun kere kotoma apak naka aingareni...,E'yo 'di 'du pari sawa alu alea.,Byona bibaireho omu bwire bw'okurwanisa endwara
23943,Many people have recovered from coronavirus in...,Abantu bangi mu Uganda baasuuka obulwadde bwa ...,Jo mapol gucang ki ki two korona I lobo Uganda.,Apotu itunga luipu kongaleutu kotoma adeka nak...,Ba karakarau ati engazu azo coronavirusniri vu...,Abantu bingi baakakiira akakoko ka coorona om...
23944,The government will provide support to the eld...,Gavumenti ejja kuwa abakadde obuyambi.,Gamente obi miyo kom ki joo ma otegi,Elosi apugan aijaikin agangat nejaas itunga lu...,Gamete ni mu atita fe 'ba 'wara eyidri,Gavumenti ereija kuheereza abantu abakuzire ob...
23945,There are many things that we need to know bef...,Waliwo ebintu bingi bye twetaaga okumanya nga ...,Tye jami mapol mamyero wange mapud pe wanyomme.,Ipu iboro luibusakinit oni aijen eringa oni el...,Afa leepi ama ma ni yi de nga afi ni aje aku ...,Heine ebiintu bingyi byoyine kumanya otakashwe...


In [7]:
# Same trimming as for the training split: remove 'text' and 'tts-speech' by
# name so only the language columns are left and a re-run is harmless.
test_data = test_data.drop(columns=['text', 'tts-speech'], errors='ignore')


In [8]:
train_data.shape, test_data.shape, dev_data.shape

((23947, 6), (500, 6), (500, 8))

In [9]:
# Preprocess the data
def preprocess_text(text):
    """Return ``text`` converted to lowercase."""
    return text.lower()

# Lowercase every language column, column by column, first in the test frame
# and then in the training frame (the column names come from train_data).
for lang_col in train_data.columns:
    for frame in (test_data, train_data):
        frame[lang_col] = frame[lang_col].apply(preprocess_text)

In [10]:
train_data.head(5)

Unnamed: 0,eng,lug,ach,teo,lgg,nyn
0,it was not a ghost refugee camp.,enkambi y'abanoonyiboobubudamu teyaliiwo mu bu...,pe obedo kem goba goba,mam arai ekabi lo erai ekwam.,eri aa'ni ndra kembe emunyale eyini aa'zu inzo...,bukaba butari butaaho bw'ekigingirire.
1,i want to go to town over the weekend.,njagala kugenda mu kibuga ku wiikendi.,amito citi i taun i tum cabit.,akoto eong alosit oibuga owiken.,ale mu tawunia sabitini i deria ra,ninyenda kuza omu tauni akakuhendera kwa wiiki.
2,i have high blood pressure.,nina puleesa eya waggulu.,atye ki peko me two pressure,aja eong keda epuresa.,ma vu azo ari tuza niri ci,omutima gwangye niguteerera ahaiguru.
3,you need to have priorities in life.,olina okubaako ebikulu by'okulembeza mu bulamu.,mite ni ibed ki jami ma i mito timone mukwo i ...,ibusakinit jo ajaut keda alosikineta nuka apol...,le ma ma ovu e'yo,noyetenga kugira ebiwareeba nka bikuru omu mag...
4,it's a good practice to help those in need.,kikolwa kirungi okuyamba abo abali mu bwetaavu.,obedo tic maber me konyo joo matye i peko,ejok aingaranakin ngul lu icanas.,eri 'yeta muke ni 'ba afa koko 'diyi ma aza kozu,n'ekikorwa kirungi okuhwera abaine ebyetengo.


# Preprocessing


In [11]:
# Reshape from wide (one column per language) to long: each row becomes a
# (Language, Text) pair, with the former column name stored in 'Language'.
train_data = pd.melt(train_data, var_name='Language', value_name='Text')
train_data

Unnamed: 0,Language,Text
0,eng,it was not a ghost refugee camp.
1,eng,i want to go to town over the weekend.
2,eng,i have high blood pressure.
3,eng,you need to have priorities in life.
4,eng,it's a good practice to help those in need.
...,...,...
143677,nyn,byona bibaireho omu bwire bw'okurwanisa endwara
143678,nyn,abantu bingi baakakiira akakoko ka coorona om...
143679,nyn,gavumenti ereija kuheereza abantu abakuzire ob...
143680,nyn,heine ebiintu bingyi byoyine kumanya otakashwe...


# Retrieving independent variables and dependent variables

In [12]:
# Features are the sentences; the target is the language code of each one.
X, y = train_data["Text"], train_data["Language"]


In [13]:
# Hold out 30% of the (Text, Language) pairs for evaluation; the fixed
# random_state keeps the partition the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
X_train.shape, X_test.shape

((100577,), (43105,))

## Feature Extraction using TF-IDF vectors

In [14]:
# The vectorizer is fitted on the training sentences only; the held-out
# sentences are then transformed with that already-fitted vectorizer.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf, X_test_tfidf = (
    tfidf_vectorizer.fit_transform(X_train),
    tfidf_vectorizer.transform(X_test),
)

# Model Development and Evaluation

## Support Vector Machine (SVM) model

In [15]:
# Linear-kernel SVM fitted on the TF-IDF training matrix.
svm_model = SVC(kernel='linear', C=1.0).fit(X_train_tfidf, y_train)

# Score the held-out sentences and report overall and per-language metrics.
y_pred = svm_model.predict(X_test_tfidf)
svm_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", svm_accuracy)
print(classification_report(y_test, y_pred))

Accuracy: 0.9948033870780651
              precision    recall  f1-score   support

         ach       1.00      1.00      1.00      7069
         eng       1.00      1.00      1.00      7153
         lgg       1.00      1.00      1.00      7175
         lug       0.99      0.99      0.99      7323
         nyn       0.98      0.99      0.99      7166
         teo       1.00      1.00      1.00      7219

    accuracy                           0.99     43105
   macro avg       0.99      0.99      0.99     43105
weighted avg       0.99      0.99      0.99     43105



## Decision Tree classifier

In [16]:
# Build and train a Decision Tree classifier on the TF-IDF features.
# random_state is pinned (same seed as the train/test split) so the tree, and
# therefore the reported metrics, are reproducible across runs.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train_tfidf, y_train)

# Predict the language labels for the test set
y_pred = dt_model.predict(X_test_tfidf)

# Evaluate the model's performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.9602830298109268
              precision    recall  f1-score   support

         ach       0.97      0.97      0.97      7069
         eng       1.00      0.99      1.00      7153
         lgg       0.98      0.97      0.98      7175
         lug       0.95      0.92      0.93      7323
         nyn       0.95      0.91      0.93      7166
         teo       0.92      1.00      0.96      7219

    accuracy                           0.96     43105
   macro avg       0.96      0.96      0.96     43105
weighted avg       0.96      0.96      0.96     43105



## Naive Bayes classifier

In [17]:
# Multinomial Naive Bayes fitted on the TF-IDF training matrix.
naive_bayes = MultinomialNB().fit(X_train_tfidf, y_train)

# Label the held-out sentences, then print accuracy and the per-class report.
y_pred = naive_bayes.predict(X_test_tfidf)
nb_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", nb_accuracy)
print(classification_report(y_test, y_pred))

Accuracy: 0.9969841085720914
              precision    recall  f1-score   support

         ach       0.99      1.00      1.00      7069
         eng       1.00      1.00      1.00      7153
         lgg       1.00      1.00      1.00      7175
         lug       0.99      0.99      0.99      7323
         nyn       1.00      0.99      0.99      7166
         teo       1.00      1.00      1.00      7219

    accuracy                           1.00     43105
   macro avg       1.00      1.00      1.00     43105
weighted avg       1.00      1.00      1.00     43105



## Random Forest Classifier

In [18]:
from sklearn.ensemble import RandomForestClassifier

# Build and train a Random Forest classifier on the TF-IDF features.
# random_state is pinned so the forest (and the metrics printed below) can be
# reproduced; n_jobs=-1 lets the trees be built on all available cores.
rfc = RandomForestClassifier(random_state=42, n_jobs=-1)
rfc.fit(X_train_tfidf, y_train)

# Predict the language labels for the test set
y_pred = rfc.predict(X_test_tfidf)

# Evaluate the model's performance
print("Random Forest Model:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Random Forest Model:
Accuracy: 0.9794223407957313
              precision    recall  f1-score   support

         ach       0.99      0.99      0.99      7069
         eng       1.00      1.00      1.00      7153
         lgg       0.99      1.00      1.00      7175
         lug       0.98      0.95      0.96      7323
         nyn       0.97      0.95      0.96      7166
         teo       0.95      1.00      0.97      7219

    accuracy                           0.98     43105
   macro avg       0.98      0.98      0.98     43105
weighted avg       0.98      0.98      0.98     43105



## RNN (Recurrent Neural Networks):

In [19]:
# --- Sentences -> padded integer sequences ---------------------------------
# NOTE(review): the tokenizer is fitted on every row of train_data, i.e. before
# the train/test split below, so its vocabulary also covers held-out sentences.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['Text'])
X_sequences = tokenizer.texts_to_sequences(train_data['Text'])
X_padded = pad_sequences(X_sequences)

# --- Language codes -> integer class ids -----------------------------------
unique_languages = ['eng', 'lug', 'ach', 'teo', 'lgg', 'nyn']
label_encoder = LabelEncoder()
label_encoder.fit(unique_languages)
y_encoded = label_encoder.transform(train_data['Language'])

# --- Hold out 20% for evaluation -------------------------------------------
# NOTE(review): this rebinds X_train / X_test / y_train / y_test, which the
# TF-IDF cells above also assign; those cells no longer see their own split
# after this one has run.
X_train, X_test, y_train, y_test = train_test_split(
    X_padded,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

# --- Embedding -> LSTM -> softmax over the languages -----------------------
vocab_size = len(tokenizer.word_index) + 1
sequence_length = X_padded.shape[1]
n_languages = len(train_data['Language'].unique())

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=100, input_length=sequence_length),
    LSTM(units=100),
    Dense(units=n_languages, activation='softmax'),
])
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.1)

# --- Predictions on the held-out sequences ---------------------------------
# Per-class probabilities -> most probable class id -> language code.
y_probs = model.predict(X_test)
y_pred = np.argmax(y_probs, axis=1)
y_pred_labels = label_encoder.inverse_transform(y_pred)



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [20]:
# Evaluate the RNN on its held-out split. y_test and y_pred hold the integer
# ids produced by label_encoder, so the encoder's classes are passed as
# target_names to make the report rows read as language codes (matching the
# reports of the other models) instead of 0..5.
rnn_class_ids = np.arange(len(label_encoder.classes_))

print("RNN Model (5 epochs):")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(
    y_test,
    y_pred,
    labels=rnn_class_ids,
    target_names=list(label_encoder.classes_),
))

RNN Model (5 epochs):
Accuracy: 0.9969377457633016
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4723
           1       1.00      1.00      1.00      4766
           2       1.00      1.00      1.00      4802
           3       0.99      0.99      0.99      4888
           4       0.99      0.99      0.99      4732
           5       1.00      1.00      1.00      4826

    accuracy                           1.00     28737
   macro avg       1.00      1.00      1.00     28737
weighted avg       1.00      1.00      1.00     28737



In [21]:

train_data['Language'].unique()

array(['eng', 'lug', 'ach', 'teo', 'lgg', 'nyn'], dtype=object)

In [22]:
y_encoded

array([1, 1, 1, ..., 4, 4, 4])

## Testing the Models

In [23]:
def identify_lang(word, vectorizer, models):
    """Ask every model in ``models`` which language ``word`` belongs to.

    ``word`` is wrapped in a one-element list and passed through
    ``vectorizer.transform``; each model's ``predict`` is then called on that
    result and the first predicted label is kept.

    Returns a dict mapping each key of ``models`` to that model's label.
    """
    features = vectorizer.transform([word])
    return {name: clf.predict(features)[0] for name, clf in models.items()}

# Display name -> fitted TF-IDF classifier, in the order they were trained.
model_names = ['SVM', 'Decision Tree', 'Naive Bayes', 'Random Forest']
fitted_classifiers = [svm_model, dt_model, naive_bayes, rfc]
models = dict(zip(model_names, fitted_classifiers))



In [24]:
word_to_identify = "Ebigambo by'amagezi"  # lug

# One prediction per TF-IDF classifier, keyed by the classifier's display name.
predicted_languages = identify_lang(word_to_identify, tfidf_vectorizer, models)

# Report each classifier's answer on its own line.
for model_name in predicted_languages:
    predicted_language = predicted_languages[model_name]
    print(f"{model_name} Predicted Language: {predicted_language}")

SVM Predicted Language: lug
Decision Tree Predicted Language: lug
Naive Bayes Predicted Language: lug
Random Forest Predicted Language: lug


## RNN

In [25]:

def identify_language(model, tokenizer, label_encoder, word):
    """Predict the language of ``word`` with the sequence model.

    ``word`` is converted to an integer sequence by ``tokenizer``, padded to
    the length given by ``model.input_shape[1]`` and passed to
    ``model.predict``. The index of the largest predicted value is mapped
    back to a label through ``label_encoder.inverse_transform``.

    Returns the decoded label for ``word``.
    """
    target_len = model.input_shape[1]
    encoded = tokenizer.texts_to_sequences([word])
    batch = pad_sequences(encoded, maxlen=target_len)

    class_scores = model.predict(batch)
    best_index = np.argmax(class_scores)

    decoded = label_encoder.inverse_transform([best_index])
    return decoded[0]



In [26]:
word_to_identify = "let's go to the katale"

# Collect the answer of every TF-IDF classifier for this phrase.
predicted_languages = identify_lang(word_to_identify, tfidf_vectorizer, models)

# Print the answers, one classifier per line.
for model_name, predicted_language in predicted_languages.items():
    result_line = f"{model_name} Predicted Language: {predicted_language}"
    print(result_line)

SVM Predicted Language: eng
Decision Tree Predicted Language: eng
Naive Bayes Predicted Language: eng
Random Forest Predicted Language: eng


In [27]:
# Run a single phrase through the RNN helper and show the decoded label.
rnn_query = "Mulindeeko tujja kubayita"
predicted_language = identify_language(model, tokenizer, label_encoder, rnn_query)

print("Predicted Language:", predicted_language)

Predicted Language: lug


### Saving the Models
Choosing the best-performing models: SVM, Naive Bayes and RNN


In [28]:
# Mount the Google Drive onto the Colab environment.
# NOTE(review): the save cells that follow write to relative paths
# ('svm_model.pkl', 'nb_model.pkl', 'rnn_model.h5'), not to a location under
# /content/drive — confirm whether the artifacts were meant to land on Drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [29]:
import joblib

# The SVM and Naive Bayes models only accept features produced by the fitted
# TF-IDF vectorizer, so it is persisted next to them; without it the pickled
# models cannot be used after a reload.
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')

# Save the SVM model to a file
joblib.dump(svm_model, 'svm_model.pkl')

# Save the Naive Bayes model to a file
joblib.dump(naive_bayes, 'nb_model.pkl')


['nb_model.pkl']

In [30]:
import joblib
from keras.models import save_model

# The RNN consumes sequences built by `tokenizer` and emits class ids that
# `label_encoder` maps back to language codes; both are stored with the model
# so predictions can be reproduced after a reload.
joblib.dump(tokenizer, 'rnn_tokenizer.pkl')
joblib.dump(label_encoder, 'label_encoder.pkl')

# Save the RNN model to a file
save_model(model, 'rnn_model.h5')


### Model Testing

In [31]:
# Hand-written probe phrases for a qualitative check of the saved models.
# The trailing comment on each entry presumably records the language the
# author expects ("any" / "x or y" where the phrase is ambiguous) — these tags
# are not read by any code in this cell.
new_samples = [
    'I am going to the katale', # eng
    'Ngenda mu katale',  # lug
    'Uganda',  # any
    'Ekikopo',  # lug or nyn
    'Runyankole',  # nyn or lug
    'Omuntu',  # nyn or lug
    "Ebigambo by'amagezi",  # lug
    "Bw'okoowa",  # lug
    "yiga",  # lug
    "kubivaako",  # lug
    "Mulindeeko tujja kubayita",  # lug
    "Gamba Mpurire",  # nyn
    "Ekiro twaburayo kimwe kyonka tukajaguza embaga y'omwaka",  # nyn
    "Tushangye",  # nyn
    "Ekitongore ekya UWA kiriyo nikibangura abanyamakuru kuruga omukyanga ekya Kigezi ahakuhandiika amakuru agakwatiraine n’okurinda enyamaishwa z’omukishaka.",  # nyn
    "itye nining",  # ach
    "Irii maber, ladit?",  # ach
    "Ibuto maber",  # ach
    "Ibuto Gulu",  # ach
    "Amito tedo labolo kwon.",  # ach
    "Ngo manyen?",  # ach
    "Wacito I gang kwan nino ducu",  # ach
    "Kumbedi abedo Anaka i Amuru aa ki California i Amerika ento."  # ach
]



In [32]:
# Vectorize the new samples with the TF-IDF vectorizer fitted on the training data.
# NOTE(review): predict_and_print_results computes its own transform of the
# samples, and no other cell shown here reads X_test_new — confirm whether
# this cell is still needed.
X_test_new = tfidf_vectorizer.transform(new_samples)

In [33]:
def predict_and_print_results(model, new_samples, vectorizer=None):
    """Predict a language for each sample and print one line per sample.

    Parameters
    ----------
    model : object
        Fitted classifier exposing ``predict``.
    new_samples : iterable of str
        Texts to classify.
    vectorizer : object, optional
        Fitted vectorizer exposing ``transform``. When omitted, the
        notebook-level ``tfidf_vectorizer`` is used, as before; passing it
        explicitly mirrors ``identify_lang`` and removes the hidden global
        dependency.

    Returns
    -------
    None
        Results are written to stdout only.
    """
    if vectorizer is None:
        vectorizer = tfidf_vectorizer

    # Materialise once: the samples are traversed twice (transform, then
    # printing), which would silently print nothing for a one-shot iterator.
    new_samples = list(new_samples)

    # Turn the raw texts into the feature representation the model expects.
    features = vectorizer.transform(new_samples)

    # Predict using the provided model
    predictions = model.predict(features)

    # Print the results
    for sentence, language in zip(new_samples, predictions):
        print(f"Text: '{sentence}' => Predicted Language: {language}")


In [34]:
# Call the function for SVM model
print("SVM Model")
predict_and_print_results(svm_model, new_samples)


SVM Model
Text: 'I am going to the katale' => Predicted Language: eng
Text: 'Ngenda mu katale' => Predicted Language: lug
Text: 'Uganda' => Predicted Language: nyn
Text: 'Ekikopo' => Predicted Language: nyn
Text: 'Runyankole' => Predicted Language: nyn
Text: 'Omuntu' => Predicted Language: nyn
Text: 'Ebigambo by'amagezi' => Predicted Language: lug
Text: 'Bw'okoowa' => Predicted Language: nyn
Text: 'yiga' => Predicted Language: nyn
Text: 'kubivaako' => Predicted Language: nyn
Text: 'Mulindeeko tujja kubayita' => Predicted Language: lug
Text: 'Gamba Mpurire' => Predicted Language: lug
Text: 'Ekiro twaburayo kimwe kyonka tukajaguza embaga y'omwaka' => Predicted Language: nyn
Text: 'Tushangye' => Predicted Language: nyn
Text: 'Ekitongore ekya UWA kiriyo nikibangura abanyamakuru kuruga omukyanga ekya Kigezi ahakuhandiika amakuru agakwatiraine n’okurinda enyamaishwa z’omukishaka.' => Predicted Language: nyn
Text: 'itye nining' => Predicted Language: ach
Text: 'Irii maber, ladit?' => Predicte

In [35]:
# Call the function for Naive Bayes model
print("Naive Bayes Model")
predict_and_print_results(naive_bayes, new_samples)

Naive Bayes Model
Text: 'I am going to the katale' => Predicted Language: eng
Text: 'Ngenda mu katale' => Predicted Language: lug
Text: 'Uganda' => Predicted Language: ach
Text: 'Ekikopo' => Predicted Language: lug
Text: 'Runyankole' => Predicted Language: ach
Text: 'Omuntu' => Predicted Language: nyn
Text: 'Ebigambo by'amagezi' => Predicted Language: lug
Text: 'Bw'okoowa' => Predicted Language: nyn
Text: 'yiga' => Predicted Language: lug
Text: 'kubivaako' => Predicted Language: ach
Text: 'Mulindeeko tujja kubayita' => Predicted Language: lug
Text: 'Gamba Mpurire' => Predicted Language: lug
Text: 'Ekiro twaburayo kimwe kyonka tukajaguza embaga y'omwaka' => Predicted Language: nyn
Text: 'Tushangye' => Predicted Language: ach
Text: 'Ekitongore ekya UWA kiriyo nikibangura abanyamakuru kuruga omukyanga ekya Kigezi ahakuhandiika amakuru agakwatiraine n’okurinda enyamaishwa z’omukishaka.' => Predicted Language: nyn
Text: 'itye nining' => Predicted Language: ach
Text: 'Irii maber, ladit?' => 

In [36]:
# Encode the probe phrases with the already-fitted tokenizer and pad them to
# the width of the training matrix (X_padded.shape[1]).
training_width = X_padded.shape[1]
X_new_sequences = tokenizer.texts_to_sequences(new_samples)
X_new_padded = pad_sequences(X_new_sequences, maxlen=training_width)

# Class probabilities -> most probable class id per phrase -> language code.
rnn_predictions = model.predict(X_new_padded)
rnn_class_ids = rnn_predictions.argmax(axis=1)
rnn_pred_labels = label_encoder.inverse_transform(rnn_class_ids)

print("\nRNN Model Predictions:")
for position, sentence in enumerate(new_samples):
    language = rnn_pred_labels[position]
    print(f"Text: '{sentence}' => Predicted Language: {language}")



RNN Model Predictions:
Text: 'I am going to the katale' => Predicted Language: eng
Text: 'Ngenda mu katale' => Predicted Language: lug
Text: 'Uganda' => Predicted Language: nyn
Text: 'Ekikopo' => Predicted Language: lug
Text: 'Runyankole' => Predicted Language: lug
Text: 'Omuntu' => Predicted Language: lug
Text: 'Ebigambo by'amagezi' => Predicted Language: nyn
Text: 'Bw'okoowa' => Predicted Language: lug
Text: 'yiga' => Predicted Language: lug
Text: 'kubivaako' => Predicted Language: lug
Text: 'Mulindeeko tujja kubayita' => Predicted Language: lug
Text: 'Gamba Mpurire' => Predicted Language: lug
Text: 'Ekiro twaburayo kimwe kyonka tukajaguza embaga y'omwaka' => Predicted Language: nyn
Text: 'Tushangye' => Predicted Language: lug
Text: 'Ekitongore ekya UWA kiriyo nikibangura abanyamakuru kuruga omukyanga ekya Kigezi ahakuhandiika amakuru agakwatiraine n’okurinda enyamaishwa z’omukishaka.' => Predicted Language: nyn
Text: 'itye nining' => Predicted Language: ach
Text: 'Irii maber, ladit