# Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Load the Dataset

In [50]:
dataset = pd.read_csv('../newThings/newDataset.csv')

# Summarize the Dataset

In [51]:
dataset.shape
dataset.head

<bound method NDFrame.head of                                                     Text Language
0       And I say: If you don't know you're a slave, ...  English
1                                       And she's right.  English
2       Because it's scary and it's expensive, but we...  English
3          I want them to think, Well there's some hope.  English
4                        It took only three generations.  English
...                                                  ...      ...
33827                                             pencil  English
33828                                               shit  English
33829  Sharpener, Eraser, stationeries are stationary...  English
33830  Who fries, fried, jammed, jam and stock, rathe...  English
33831          flattery happens all the time here, yunno  English

[33832 rows x 2 columns]>

# Finding the Null Values

In [52]:
# get the boolean mask of null values
mask = dataset.isnull()

# get the indices of the True values in the boolean mask
indices = np.where(mask)

# print the indices
print(indices)


(array([  427,   460,   701,   703,  1030,  1201,  1955,  1959,  2349,
        2414,  2466,  2574,  2794,  3503,  3522,  3736,  3746,  4326,
        4655,  4858,  4983,  4988,  5028,  5054,  5145,  6761,  6791,
        7581,  7685,  7702,  7800,  8065,  8515,  8601,  9286,  9440,
        9567,  9578,  9636,  9661,  9697,  9803,  9846,  9874, 10014,
       10045, 10311, 10314, 10968, 11083, 11957, 12017, 14308, 15311,
       15333, 15417, 15439, 15779, 15808, 15928, 16431, 16529, 16561,
       16887, 16949, 16980, 17013, 17053, 17559, 18099, 18298, 18382,
       18673, 18687, 19019, 19272, 19298, 19605, 19681, 19757],
      dtype=int64), array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64))


In [53]:
dataset.isnull().sum()

# Replace NaN values with the mean of the column
dataset.fillna(dataset.mean(), inplace=True)

# Remove rows with NaN values
dataset.dropna(inplace=True)


  dataset.fillna(dataset.mean(), inplace=True)


# Languages in the Dataset

In [54]:
dataset.isnull().sum()

Text        0
Language    0
dtype: int64

In [55]:
dataset["Language"].value_counts()

English       13166
Yoruba        12539
French         1014
Spanish         819
Portugeese      739
Italian         698
Russian         692
Sweedish        676
Malayalam       594
Dutch           546
Turkish         474
German          470
Tamil           469
Danish          428
Greek           365
Hindi            63
Name: Language, dtype: int64

# Split input and then Test and Train

In [56]:
x = np.array(dataset["Text"])
y = np.array(dataset["Language"])

cv = CountVectorizer()
X = cv.fit_transform(x)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)

CountVectorizer is a great tool provided by the Scikik-learn Library in Python. It is used to transform a given text into a ventor on the basis of the frequency (count) of each word that occurs in the entire text

In [57]:
model = MultinomialNB()
model.fit(X_train,y_train)
model.score(X_test, y_test)

0.9701948110243289

In [62]:
probas = model.predict_proba(X_test)

In [63]:
for i, lang in enumerate(model.classes_):
    print(f"{lang}: {probas[0][i]:.2%}")

Danish: 0.00%
Dutch: 0.00%
English: 0.00%
French: 0.00%
German: 0.00%
Greek: 0.00%
Hindi: 0.00%
Italian: 0.00%
Malayalam: 0.00%
Portugeese: 0.00%
Russian: 0.00%
Spanish: 0.00%
Sweedish: 0.00%
Tamil: 0.00%
Turkish: 0.00%
Yoruba: 100.00%


As this is a problem of multiclass classification, so we will be using the Multinomial Naive Bayes algorithm to train the language detection model. This algorithm always perform very well on the problem based on multiclass classification.

sampleTexts = 'going to lagos', 'ooni', 'fish', 'pin', 'sharpener', 'one', 'mono', 'monospace', '', '' 

In [71]:
# user = input("Enter a Text: ")
user = "ooni"
dataset = cv.transform([user]).toarray()
output = model.predict(dataset)

print(user)
print(output)

ooni
['English']
