In [36]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [37]:
# Read the labelled corpus into a DataFrame. Later cells read its
# 'Text' and 'Language' columns.
dataset_path = 'Dataset.csv'
data = pd.read_csv(dataset_path)

In [42]:
# Build the label vector and a bag-of-words count matrix from the corpus.
# NOTE: the vocabulary is learned from every row of `data`, including rows
# that a later cell holds out for testing.
y = data['Language']

vectorizer = CountVectorizer()
documents = data['Text']
X = vectorizer.fit_transform(documents)

# Bare expression so the cell displays the sparse-matrix summary.
X

<4285x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 10774 stored elements in Compressed Sparse Row format>

In [44]:
# Hold out 20% of the rows for validation. The split is stratified on the
# labels and uses a fixed seed so it is repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts

(<3428x5000 sparse matrix of type '<class 'numpy.int64'>'
 	with 8721 stored elements in Compressed Sparse Row format>,
 <857x5000 sparse matrix of type '<class 'numpy.int64'>'
 	with 2053 stored elements in Compressed Sparse Row format>,
 4235    Not Yoruba
 2620        Yoruba
 2646        Yoruba
 961         Yoruba
 3267        Yoruba
            ...    
 622     Not Yoruba
 3931    Not Yoruba
 2793        Yoruba
 822         Yoruba
 1435        Yoruba
 Name: Language, Length: 3428, dtype: object,
 1403        Yoruba
 3790    Not Yoruba
 2714        Yoruba
 3253        Yoruba
 2022        Yoruba
            ...    
 2882        Yoruba
 1647        Yoruba
 491     Not Yoruba
 584     Not Yoruba
 2859        Yoruba
 Name: Language, Length: 857, dtype: object)

In [15]:
# Extract TF-IDF features from the raw text.
#
# The X_train / X_test produced by the split cell are already CountVectorizer
# matrices, but TfidfVectorizer expects raw documents. Recover each split's
# documents from `data` using the row labels that the split preserved on
# y_train / y_test. Reading from `data` (rather than overwriting the inputs)
# also keeps this cell safe to re-run.
train_texts = data.loc[y_train.index, 'Text']
test_texts = data.loc[y_test.index, 'Text']

# Fit on the training documents only so no test-set statistics leak into
# the vocabulary or IDF weights; the test documents are only transformed.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

In [16]:
# Fit a multinomial Naive Bayes classifier on the training features.
model = MultinomialNB()
model.fit(X=X_train, y=y_train)

MultinomialNB()

In [33]:
# Evaluate the model on the held-out split.

# Predictions are computed here so the metrics below never depend on a
# `y_pred` left over from an earlier kernel session.
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

# 'Yoruba' is the positive class for the binary metrics.
precision = precision_score(y_test, y_pred, pos_label='Yoruba')
recall = recall_score(y_test, y_pred, pos_label='Yoruba')
f1 = f1_score(y_test, y_pred, pos_label='Yoruba')



In [32]:
# Report each metric as a percentage with two decimals.
metric_report = (
    ('Accuracy: {:.2f}%', accuracy),
    ('Precision: {:.2f}%', precision),
    ('Recall: {:.2f}%', recall),
    ('F1 Score: {:.2f}%', f1),
)
for template, score in metric_report:
    print(template.format(score*100))

Accuracy: 68.73%
Precision: 66.25%
Recall: 99.81%
F1 Score: 79.64%


In [35]:
# Sample sentences/words for a quick manual sanity check of the classifier.
test_data = [
    'Mo ní ìrànwọ́ lórí ẹ̀dá tí n wọ̀n ti kókó ìwé',
    'Bí a bá ṣe kó ṣe fún ẹ̀ wọ̀nyíí?',
    'Ẹ jọ̀wọ́, jọ̀wọ́ fún mi lọ́wọ́ o',
    'Mo lọ sí ìlú Ìbàdàn', 
    'Ẹ n lọ sí kínì sí ẹ̀kọ́ tàbí ìpèsẹ́',
    'bit',
    'shit',
    'smart',
    'rice',
    'pen',
]

# The training features are built from the raw 'Text' column with no extra
# cleaning step, so the samples are vectorized raw as well. The previous
# `clean_text` call referenced a helper that is not defined in any cell shown
# here, and would have made inference preprocessing differ from training.
test_data_features = vectorizer.transform(test_data)

# Use the model to predict the language of each sample.
predicted_labels = model.predict(test_data_features)

# Print each sample next to its predicted label.
for text, label in zip(test_data, predicted_labels):
    print('Text:', text)
    print('Predicted label:', label)
    print('---------------------------')

Text: Mo ní ìrànwọ́ lórí ẹ̀dá tí n wọ̀n ti kókó ìwé
Predicted label: Yoruba
---------------------------
Text: Bí a bá ṣe kó ṣe fún ẹ̀ wọ̀nyíí?
Predicted label: Yoruba
---------------------------
Text: Ẹ jọ̀wọ́, jọ̀wọ́ fún mi lọ́wọ́ o
Predicted label: Yoruba
---------------------------
Text: Mo lọ sí ìlú Ìbàdàn
Predicted label: Yoruba
---------------------------
Text: Ẹ n lọ sí kínì sí ẹ̀kọ́ tàbí ìpèsẹ́
Predicted label: Yoruba
---------------------------
Text: bit
Predicted label: Not Yoruba
---------------------------
Text: shit
Predicted label: Yoruba
---------------------------
Text: smart
Predicted label: Yoruba
---------------------------
Text: rice
Predicted label: Yoruba
---------------------------
Text: pen
Predicted label: Yoruba
---------------------------
