In [26]:
import warnings

import pandas as pd
from langdetect import DetectorFactory
from langdetect import detect
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB


In [28]:
# load the labelled training set from CSV into a DataFrame
data = pd.read_csv(filepath_or_buffer="../newThings/Dataset.csv")


In [29]:
# model inputs come from the "Text" column, targets from the "Language" column
texts, labels = data["Text"], data["Language"]

In [30]:
# learn the vocabulary from the training texts and build their token-count matrix
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(raw_documents=texts)



In [31]:
# fit a multinomial Naive Bayes classifier on the token counts
mnb = MultinomialNB()
mnb.fit(X=X_train, y=labels)



MultinomialNB()

In [40]:
# input text string: the single sample passed to both the MNB model and langdetect below
text: str = "pen"



In [41]:
# predict the probabilities for each language class
# X_test is a one-row count matrix built with the vocabulary learned in training
X_test = vectorizer.transform([text])
if X_test.nnz == 0:
    # None of the input's tokens are in the training vocabulary, so the feature
    # vector is all zeros and MultinomialNB can only return its class priors.
    # Surface that instead of presenting the priors as a real prediction.
    warnings.warn(
        f"No known tokens in {text!r}; the predicted probabilities "
        "are just the class priors."
    )
# probas has one row per input document and one column per entry of mnb.classes_
probas = mnb.predict_proba(X_test)



In [42]:
# tabulate each class label next to its predicted probability for the input
df = pd.DataFrame(data={"Language": mnb.classes_, "Probability": probas[0]})



In [43]:
# order the rows so the most probable language comes first
df = df.sort_values("Probability", ascending=False)



In [44]:
# render the sorted probability table with the notebook's rich display
display(df)



Unnamed: 0,Language,Probability
1,Yoruba,0.600233
0,Not Yoruba,0.399767


In [45]:
# get the language with the highest probability
# probas holds one row per input document; index row 0 explicitly rather than
# relying on argmax() over the flattened 2-D array, whose result only lines up
# with mnb.classes_ when there is exactly one row
lang = mnb.classes_[probas[0].argmax()]



In [46]:
# detect the language of the input text using langdetect
# langdetect's algorithm is randomized, so short or ambiguous inputs can give a
# different answer on every run; pinning the seed makes the result reproducible
DetectorFactory.seed = 0
langdetect_lang = detect(text)



In [47]:
# report the probability table, then the top pick from each detector;
# the first string ends with a newline, which leaves a blank line after the table
print(
    f"Detected language(s) and probabilities:\n{df.to_string(index=False)}\n",
    f"Detected language by MNB model: {lang}",
    f"Detected language by langdetect: {langdetect_lang}",
    sep="\n",
)

Detected language(s) and probabilities:
  Language  Probability
    Yoruba     0.600233
Not Yoruba     0.399767

Detected language by MNB model: Yoruba
Detected language by langdetect: id
