## Yoruba Language Detection

##### Final Year Project

### Importing Basic Libraries

In [1]:
import string
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer

In [41]:
df = pd.read_csv('./language_detection-second.csv')
df.head()
df

Unnamed: 0,Text,Language,Unnamed: 2
0,lílo àkàbà ǹjẹ́ o máa ń ṣe àyẹ̀wò wọ̀nyí tó l...,Yoruba,
1,paul fẹ́ pààrọ̀ gílóòbù iná tó wà lóde ilé ẹ̀,Yoruba,
2,abbreviate,Not Yoruba,
3,abbreviation,Not Yoruba,
4,ink,Not Yoruba,
...,...,...,...
4287,nose,Not Yoruba,
4288,plural,Not Yoruba,
4289,anger,Not Yoruba,
4290,claim,Not Yoruba,


In [42]:
df["Text"].value_counts()

ṣe            6
mu            6
duro          6
ni            6
fun           5
             ..
ìwọṣọ         1
òjò dídì      1
yìnyín        1
irun ara      1
continent     1
Name: Text, Length: 4069, dtype: int64

### Cleaning up the dataset, using string library 

In [43]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

### A function that cleans the dataset

In [44]:
def remove_pun(text):
    for pun in string.punctuation:
        text = text.replace(pun,"")
    text = text.lower()
    return(text)
        

### Trying out the function

In [45]:
remove_pun('"Nature can refer to the phenomena of the: 44##@! physical@."')

'nature can refer to the phenomena of the 44 physical'

In [46]:
remove_pun('"lílo àkàbà — ǹjẹ́ o máa ń ṣe àyẹ̀wò wọ̀nyí tó lè dáàbò bò ẹ́,? re"') # => Working wellwith yoruba alphabets

'lílo àkàbà — ǹjẹ́ o máa ń ṣe àyẹ̀wò wọ̀nyí tó lè dáàbò bò ẹ́ re'

### Applying the Function on our Dataset

###### This removes every punctuation in the dataset and converts to lowercase

In [47]:
df['Text'] = df['Text'].apply(remove_pun)

In [48]:
df.head()

Unnamed: 0,Text,Language,Unnamed: 2
0,lílo àkàbà ǹjẹ́ o máa ń ṣe àyẹ̀wò wọ̀nyí tó l...,Yoruba,
1,paul fẹ́ pààrọ̀ gílóòbù iná tó wà lóde ilé ẹ̀,Yoruba,
2,abbreviate,Not Yoruba,
3,abbreviation,Not Yoruba,
4,ink,Not Yoruba,


In [49]:
df.shape

(4292, 3)

### Dividing datasets to train and test

In [50]:
from sklearn.model_selection import train_test_split

In [51]:
X = df.iloc[:,0] # => Assigning the Texts to X
y =df.iloc[:,1] # => Assigning the Language Column
# X
y

0           Yoruba
1           Yoruba
2       Not Yoruba
3       Not Yoruba
4       Not Yoruba
           ...    
4287    Not Yoruba
4288    Not Yoruba
4289    Not Yoruba
4290    Not Yoruba
4291    Not Yoruba
Name: Language, Length: 4292, dtype: object

### Assigning test and train data

In [52]:
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size = .2)
X_train,X_test,y_train,y_test

(352          groin 
 819     lòlọ́nàìtọ́
 3801       perhaps 
 3121         iberu 
 1895           egbò
            ...     
 3479          over 
 2534         mẹrin 
 3631         south 
 431      ice cream 
 1571          púpọ̀
 Name: Text, Length: 3433, dtype: object,
 4040        view 
 3902     trouble 
 3399        made 
 4097    neighbor 
 3122         oju 
           ...    
 1699         iyìn
 1004         yàrá
 1729        ìmúye
 277          eel 
 122       absent 
 Name: Text, Length: 859, dtype: object,
 352     Not Yoruba
 819         Yoruba
 3801    Not Yoruba
 3121        Yoruba
 1895        Yoruba
            ...    
 3479    Not Yoruba
 2534        Yoruba
 3631    Not Yoruba
 431     Not Yoruba
 1571        Yoruba
 Name: Language, Length: 3433, dtype: object,
 4040    Not Yoruba
 3902    Not Yoruba
 3399    Not Yoruba
 4097    Not Yoruba
 3122        Yoruba
            ...    
 1699        Yoruba
 1004        Yoruba
 1729        Yoruba
 277     Not Yoruba
 122     No

### Converting values to computer understandable version = Encoding

###### Vectorizing the dataset

In [53]:
from sklearn import feature_extraction

In [54]:
vec = feature_extraction.text.TfidfVectorizer(ngram_range=(1,2),analyzer='char') # Unigrams and bigrams
vec

TfidfVectorizer(analyzer='char', ngram_range=(1, 2))

In [55]:
from sklearn import pipeline
from sklearn import linear_model

### Pipeline: creating a complete flow of functions (converting to vector and training) multpile steps

In [56]:
model_pipe = pipeline.Pipeline([('vec',vec),('clf', linear_model.LogisticRegression())])
# model_pipe

In [57]:
model_pipe.fit(X_train,y_train)

Pipeline(steps=[('vec', TfidfVectorizer(analyzer='char', ngram_range=(1, 2))),
                ('clf', LogisticRegression())])

In [58]:
model_pipe.classes_

array(['Not Yoruba', 'Yoruba'], dtype=object)

In [59]:
predict_val = model_pipe.predict(X_test)
# predict_val

# 


# Trying other Models (Multinomial NB, Random Forest, KNN)

# .................................................................................................................................

### Multinomial Naive Bayes

In [60]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score

# Initialize the Multinomial Naive Bayes classifier
mnb = MultinomialNB()

# load the dataset
df2 = pd.read_csv("language_detection-second.csv")

# Preprocess the text by converting to a bag-of-words representation
vectorizer = TfidfVectorizer()
A = vectorizer.fit_transform(df2['Text'])

# Convert the target labels to numerical values
b = pd.factorize(df2['Language'])[0]

# Split the dataset into training and validation sets
A_train, A_val, b_train, b_val = train_test_split(A, b, test_size=0.2, random_state=42, stratify=y)



# Fit the classifier to the training data
mnb.fit(A_train, b_train)


MultinomialNB()

In [61]:
# Evaluating the performance

# Predict the language for each sample in the validation set
b_pred = mnb.predict(A_val)

# Compute the accuracy of the predictions
accuracy = accuracy_score(b_val, b_pred)
print(f'Accuracy: {accuracy:.2f}')


Accuracy: 0.70


### Multinomial NB Result

In [62]:
# Predict the language of new text
sampleText = 'My name is Damilola'

NBlanguage_label = mnb.predict(vectorizer.transform([sampleText]))
NBlanguage = pd.Categorical.from_codes(NBlanguage_label, df2['Language'].unique())[0]
print(f'The language of "{sampleText}" is {NBlanguage}')


The language of "My name is Damilola" is Not Yoruba


# .................................................................................................................................

### Random Forest

In [63]:
# import the necessary libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

# load and preprocess the dataset
df3 = pd.read_csv("language_detection-second.csv")
vectorizer = CountVectorizer(ngram_range=(1, 3), analyzer="char")
C = vectorizer.fit_transform(df3["Text"])
d = df["Language"]

# train the Random Forest model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(C, d)

# preprocess your input
# my_input_vectorized = vectorizer.transform([my_input])

# use your model to make predictions
# predicted_language = rf.predict(my_input_vectorized)

# print the predicted output
# print("the language is" ,(predicted_language))

# Make predictions on the testing data
predictions = rf.predict(C)

# Calculate the accuracy score of the random forest classifier
accuracy = accuracy_score(d, predictions)

# Print the accuracy score
print("Accuracy: {:.2f}%".format(accuracy*100))

Accuracy: 99.86%


### RF RESULT

In [78]:
my_input2 = "Mofẹ́ pààrọ̀ gílóòbù iná"

# Sample text = Let's Party hard tonight, Mofẹ́ pààrọ̀ gílóòbù iná, My father is the cousin to England's present Queen

my_input_vectorized2 = vectorizer.transform([my_input2]) #= Preprocessing
predicted_language2 = rf.predict(my_input_vectorized2) #= using model

print("the language is" ,(predicted_language2))

the language is ['Yoruba']


# .................................................................................................................................

## TensorFlow

# .................................................................................................................................

### K-Nearest Neighbour model

In [68]:
# import the important libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier

# Load the text data into a pandas DataFrame
dataKNN = pd.read_csv('language_detection-second.csv')

# Preprocess the text by converting to a bag-of-words representation
vectorizer1 = TfidfVectorizer()
E = vectorizer1.fit_transform(dataKNN['Text'])

# Convert the target labels to numerical values
f = pd.factorize(dataKNN['Language'])[0]

# Split to test, train 
E_train, E_val, f_train, f_val = train_test_split(E, f, test_size=0.2, stratify=y)

# Initialize the KNN classifier
knn2 = KNeighborsClassifier(n_neighbors=5)

# Fit the classifier to the training data
knn2.fit(E_train, f_train)

# Predict the language for each sample in the validation set
f_pred = knn2.predict(E_val)



# Was having issues witht the transform(), i had to transform the text to be predicted
# checking the size of the trained data

# print(E_train.shape)
# print(E_val.shape)


# Create a dictionary that maps numerical labels to language names
label_to_language = {0: 'Yoruba', 1: 'Not Yoruba'}


# Compute the accuracy of the predictions
accuracy = sum(f_pred == f_val) / len(f_val)
print(f'Accuracy: {accuracy:.2f}')

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Accuracy: 0.64


### KNN RESULT

In [69]:
# Predict the language of new text
predictText = 'Mofẹ́ pààrọ̀ gílóòbù iná'

# sample text = My father is the cousin to England's present Queen, Bonjour, comment allez-vous?

language_label = knn2.predict(vectorizer1.transform([predictText]))[0]
mainLanguage = label_to_language[language_label]
print(f'The language of "{predictText}" is {mainLanguage}')


The language of "Mofẹ́ pààrọ̀ gílóòbù iná" is Yoruba


# .................................................................................................................................

# 


### Calculating the Accuracy  of the main model

In [70]:
from sklearn import metrics

In [71]:
print(f"Accuracy: {metrics.accuracy_score(y_test,predict_val)*100}") 

Accuracy: 97.78812572759023


## 

In [72]:
metrics.confusion_matrix(y_test,predict_val)

array([[323,  11],
       [  8, 517]], dtype=int64)

In [73]:
model_pipe.predict(['My name is osas']) 

array(['Not Yoruba'], dtype=object)

In [77]:
model_pipe.predict(['pin']) # => Wrong

array(['Yoruba'], dtype=object)

In [None]:
# if model_pipe.predict(['ílóòbù iná tó wà']) != 'Yoruba' or model_pipe.predict(['Marvelous']) != 'English':
#     error = ('Error: Not a yoruba or English text')
#     print(error)

### Saving as a pickle file

##### to be used on the web

In [None]:
import pickle

In [None]:
# new_file = open('model.pckl', 'wb')
new_file = open('model.pkl', 'wb')
pickle.dump(model_pipe,new_file)
new_file.close()

## Thank You!!!