## Yoruba Language Detection

##### Final Year Project

### Importing Basic Libraries

In [21]:
import string
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns

In [95]:
# Load the labelled text samples from a CSV file in the current working directory.
df = pd.read_csv('language_detection.csv')
# Preview the first rows to check the columns were read as expected.
df.head()

Unnamed: 0,Text,Language,Unnamed: 2
0,lílo àkàbà ǹjẹ́ o máa ń ṣe àyẹ̀wò wọ̀nyí tó l...,Yoruba,
1,paul fẹ́ pààrọ̀ gílóòbù iná tó wà lóde ilé ẹ̀,Yoruba,
2,abbreviate,English,
3,abbreviation,English,
4,abdomen,English,


### Cleaning up the dataset, using string library 

In [28]:
# The ASCII punctuation characters that remove_pun strips from the text.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

### A function that cleans the dataset

In [96]:
def remove_pun(text):
    """Strip ASCII punctuation from ``text`` and convert it to lowercase.

    Only the characters listed in ``string.punctuation`` are removed.
    Everything else passes through unchanged, including accented letters,
    combining tone marks and non-ASCII punctuation such as the em dash.

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    str
        ``text`` without ASCII punctuation, lowercased.

    Raises
    ------
    AttributeError
        If ``text`` is not a string (for example a missing value that
        pandas read as a float NaN).
    """
    # A single translate() pass deletes every punctuation character at once,
    # instead of rescanning the whole string once per punctuation mark.
    delete_punctuation = str.maketrans('', '', string.punctuation)
    return text.translate(delete_punctuation).lower()
        

### Trying out the function

In [78]:
# English sample: the quotes, ':', '#', '@', '!' and '.' are stripped and the text is lowercased.
remove_pun('"Nature can refer to the phenomena of the: 44##@! physical@."')

'nature can refer to the phenomena of the 44 physical'

In [31]:
remove_pun('"lílo àkàbà — ǹjẹ́ o máa ń ṣe àyẹ̀wò wọ̀nyí tó lè dáàbò bò ẹ́,? re"') # => Works well with Yoruba letters; the em dash is kept because it is not in string.punctuation

'lílo àkàbà — ǹjẹ́ o máa ń ṣe àyẹ̀wò wọ̀nyí tó lè dáàbò bò ẹ́ re'

### Applying the Function on our Dataset

###### This removes all ASCII punctuation from the dataset's text and converts it to lowercase

In [97]:
# Clean every entry of the Text column element by element.
df['Text'] = df['Text'].map(remove_pun)

In [35]:
# Preview the first rows again, now that the Text column has been cleaned.
df.head()

Unnamed: 0,Text,Language,Unnamed: 2
0,lílo àkàbà — ǹjẹ́ o máa ń ṣe àyẹ̀wò wọ̀nyí tó...,Yoruba,
1,paul fẹ́ pààrọ̀ gílóòbù iná tó wà lóde ilé ẹ̀,Yoruba,
2,abbreviate,English,
3,abbreviation,English,
4,abdomen,English,


In [98]:
# (number of rows, number of columns) of the dataset.
df.shape

(2306, 3)

### Dividing datasets to train and test

In [99]:
from sklearn.model_selection import train_test_split

In [100]:
# Select the columns by name rather than by position, so the features and
# labels cannot silently swap if the CSV column order ever changes.
X = df['Text']      # => features: the cleaned text samples
y = df['Language']  # => target: the language label of each sample

### Assigning test and train data

In [101]:
# Hold out 20% of the samples for testing.
# random_state pins the shuffle so the split, and every score computed from it,
# is the same on each run; stratify=y keeps the language proportions the same
# in the training and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

### Converting the text into a numeric representation the model can use (encoding)

###### Vectorizing the dataset

In [83]:
from sklearn import feature_extraction

In [102]:
# TF-IDF features over character n-grams of length 1 and 2
# (single characters and pairs of adjacent characters).
vec = feature_extraction.text.TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 2),
)

In [85]:
from sklearn import pipeline
from sklearn import linear_model

### Pipeline: chaining multiple steps (vectorizing the text, then training the classifier) into a single flow

In [103]:
# Chain the vectorizer and the classifier so that raw text can be passed
# straight to fit() and predict().
model_pipe = pipeline.Pipeline(
    steps=[
        ('vec', vec),
        ('clf', linear_model.LogisticRegression()),
    ]
)

In [104]:
# Fit the vectorizer and then the classifier on the training split only.
model_pipe.fit(X_train,y_train)

Pipeline(steps=[('vec', TfidfVectorizer(analyzer='char', ngram_range=(1, 2))),
                ('clf', LogisticRegression())])

In [105]:
# The labels the fitted classifier can predict.
model_pipe.classes_

array(['English', 'Yoruba'], dtype=object)

In [106]:
# Predict a language label for every held-out test sample.
predict_val = model_pipe.predict(X_test)
# predict_val


### Calculating the Accuracy 

In [67]:
from sklearn import metrics

In [107]:
metrics.accuracy_score(y_test,predict_val) # fraction of test samples predicted correctly (0 to 1); multiply by 100 for a percentage

0.9978354978354979

In [108]:
# Rows are the true labels and columns are the predicted labels,
# so the off-diagonal cells count the misclassified samples.
metrics.confusion_matrix(y_test,predict_val)

array([[148,   0],
       [  1, 313]], dtype=int64)

In [115]:
# Quick sanity check on a sentence typed by hand.
# NOTE(review): this input is not passed through remove_pun, unlike the training text;
# confirm whether inputs at prediction time should be cleaned the same way.
model_pipe.predict(['My name is osas'])

array(['English'], dtype=object)

### Saving as a pickle file

##### to be used on the web

In [116]:
import pickle

In [119]:
# Serialize the fitted pipeline to disk. The with-block closes the file
# even if pickle.dump raises, so a failed write cannot leak the handle.
with open('model.pckl', 'wb') as new_file:
    pickle.dump(model_pipe, new_file)

## Thank You!!!