# Yoruba Language Identifier

### Importing the pandas library

In [1]:
import pandas as pd

### Loading the dataset

In [2]:
sets = pd.read_csv('./newDataset.csv')
sets

Unnamed: 0,Text,Language
0,"And I say: If you don't know you're a slave, ...",Not Yoruba
1,And she's right.,Not Yoruba
2,"Because it's scary and it's expensive, but we...",Not Yoruba
3,"I want them to think, Well there's some hope.",Not Yoruba
4,It took only three generations.,Not Yoruba
...,...,...
24394,nose,Not Yoruba
24395,plural,Not Yoruba
24396,anger,Not Yoruba
24397,claim,Not Yoruba


### Checking data for NAN values

In [3]:
print(sets.isnull().sum())

Text         0
Language    80
dtype: int64


### Replacing NaN values with default values

In [4]:
sets.fillna(sets.mean(), inplace=True)

  sets.fillna(sets.mean(), inplace=True)


### Remove rows and columns that contain Nan values

In [5]:
sets.dropna(inplace=True)

### Displaying the dataset

In [6]:
sets

Unnamed: 0,Text,Language
0,"And I say: If you don't know you're a slave, ...",Not Yoruba
1,And she's right.,Not Yoruba
2,"Because it's scary and it's expensive, but we...",Not Yoruba
3,"I want them to think, Well there's some hope.",Not Yoruba
4,It took only three generations.,Not Yoruba
...,...,...
24394,nose,Not Yoruba
24395,plural,Not Yoruba
24396,anger,Not Yoruba
24397,claim,Not Yoruba


In [7]:
sets.tail()

Unnamed: 0,Text,Language
24394,nose,Not Yoruba
24395,plural,Not Yoruba
24396,anger,Not Yoruba
24397,claim,Not Yoruba
24398,continent,Not Yoruba


## MULTINOMIAL NAIVE BAYES MODEL

In [8]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [9]:
# Initialize the Multinomial Naive Bayes classifier
mnb = MultinomialNB()

In [10]:
vectorizer = TfidfVectorizer()
A = vectorizer.fit_transform(sets['Text'])

In [12]:
# Convert the target labels to numerical values
b = pd.factorize(sets['Language'])[0]

In [13]:
# Split the dataset into training and validation sets
A_train, A_val, b_train, b_val = train_test_split(A, b, test_size=0.2, random_state=42, stratify=b)

In [14]:
# Fit the classifier to the training data
mnb.fit(A_train, b_train)

MultinomialNB()

In [16]:
# Predict the language for each sample in the validation set
b_pred = mnb.predict(A_val)

In [17]:
# Compute the accuracy of the predictions
accuracy = accuracy_score(b_val, b_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.98


In [22]:
# sampleTexts = ['going to lagos', 'ooni', 'fish', 'pin', 'sharpener', 'one', 'mono', 'monospace', '', '' ]

In [54]:
# Predict the language of new text
sampleText = 'Guerra'

NBlanguage_label = mnb.predict(vectorizer.transform([sampleText]))
NBlanguage = pd.Categorical.from_codes(NBlanguage_label, sets['Language'].unique())[0]
print(f'The language of "{sampleText}" is {NBlanguage}')


The language of "Guerra" is Yoruba
