### Importing the Libraries

In [1]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import warnings
warnings.simplefilter("ignore")

In [2]:
# Loading the dataset
# NOTE(review): the preview below shows an "Unnamed: 0" column next to
# Sentence/Language — presumably a row index that was saved into the CSV.
# It is not used by the later cells.
data = pd.read_csv("language_detection_dataset.csv")

In [3]:
# Preview the first five rows to check the column layout
data.head()

Unnamed: 0,Sentence,Language
0,Hvis 6 mol dihydrogen (brint) reagerer fuldstæ...,da
1,Essa evolução na física ganhou ares de uma rev...,pt
2,"L'intero sistema per la misurazione, il tratta...",it
3,Estes se tornaram uma grande dor de cabeça com...,pt
4,"Entretanto, o Modelo Padrão não é capaz de des...",pt


In [4]:
# Number of sentences per language code; the classes are imbalanced
# (see the counts below), so per-class metrics matter more than raw accuracy.
data["Language"].value_counts()

pt    1883
en    1732
ar    1187
es    1152
da     996
fr     923
tr     880
ml     788
sv     760
it     701
ru     658
ta     537
nl     509
el     482
hi     130
Name: Language, dtype: int64

In [5]:
# Map each language code to an integer class id; `le` is kept so the ids
# can be turned back into language codes later.
le = LabelEncoder().fit(data["Language"])
y = le.transform(data["Language"])

# Hold out 20% of the sentences for evaluation (fixed seed for repeatability).
# NOTE(review): the split is not stratified, so rare languages may be
# under-represented in the test split.
x_train, x_test, y_train, y_test = train_test_split(
    data["Sentence"],
    y,
    test_size=0.2,
    random_state=42,
)


def preprocess_text(text):
    """Normalize a raw sentence before it is vectorized.

    Every punctuation mark in the set ``! @ # $ ( ) , " % ^ * ? : ; ~ ` [ ]``,
    every newline and every ASCII digit is replaced by a single space, and the
    result is lower-cased. Letters (including non-ASCII ones) are kept.

    Parameters
    ----------
    text : str
        The sentence to clean.

    Returns
    -------
    str
        The cleaned, lower-cased sentence (same length as the input, since
        each removed character becomes one space).
    """
    # The square brackets are escaped inside the character class. The earlier
    # standalone pattern r'[[]]' parsed as the class `[[]` followed by a
    # literal `]`, so it only matched the two-character string "[]" and left
    # lone brackets in the text.
    text = re.sub(r'[!@#$(),\n"%^*?:;~`0-9\[\]]', ' ', text)
    return text.lower()

# Clean both splits with the same normalizer.
# NOTE(review): this cleaning happens outside the pipeline object, so anything
# that later calls pipeline.predict() on raw text has to apply
# preprocess_text itself.
x_train, x_test = (split.apply(preprocess_text) for split in (x_train, x_test))

# TF-IDF features feeding a multinomial Naïve Bayes classifier
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('model', MultinomialNB()),
])

# Fit on the training split, then label the held-out sentences
y_pred = pipeline.fit(x_train, y_train).predict(x_test)

### Evaluating the model

In [6]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Score the held-out predictions: overall accuracy, the confusion matrix,
# and per-class precision/recall/F1 (classes are the encoded integer ids).
ac = accuracy_score(y_true=y_test, y_pred=y_pred)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cr = classification_report(y_true=y_test, y_pred=y_pred)

In [7]:
# Fraction of test sentences whose language was predicted correctly
print("Accuracy is :",ac)

Accuracy is : 0.9688438438438438


In [31]:
# Classification report: per-class precision, recall, F1 and support.
# Row labels are the integer ids produced by the label encoder `le`.
print(cr)

              precision    recall  f1-score   support

           0       1.00      0.97      0.99       242
           1       0.98      0.91      0.94       220
           2       1.00      0.94      0.97        80
           3       0.97      0.99      0.98       336
           4       1.00      0.90      0.95       221
           5       1.00      0.97      0.98       192
           6       1.00      1.00      1.00        17
           7       1.00      0.96      0.98       147
           8       1.00      0.99      0.99       140
           9       1.00      0.94      0.97       105
          10       0.98      0.99      0.99       390
          11       0.67      0.99      0.80       136
          12       0.98      0.98      0.98       156
          13       1.00      0.99      1.00       108
          14       1.00      0.96      0.98       174

    accuracy                           0.97      2664
   macro avg       0.97      0.97      0.97      2664
weighted avg       0.97   

### Model Saving

In [12]:
# Persist the fitted label encoder and the trained pipeline, one pickle each.
# The encoder is needed later to turn predicted ids back into language codes.
for artifact, path in ((le, "label_encoder.pkl"), (pipeline, "pipeline.pkl")):
    with open(path, "wb") as out_file:
        pickle.dump(artifact, out_file)

In [14]:
# Read the saved artifacts back from disk. Only unpickle files you created
# yourself: pickle.load can execute arbitrary code from an untrusted file.
with open("pipeline.pkl", "rb") as pipeline_file:
    loaded_pipeline = pickle.load(pipeline_file)

# Rebinds `le` to the encoder restored from disk
with open("label_encoder.pkl", "rb") as encoder_file:
    le = pickle.load(encoder_file)

In [10]:
# Make predictions with the loaded pipeline
new_sentences = ["This is a test sentence.", "Ceci est une phrase en français."]

# The pipeline was fitted on sentences that had been passed through
# preprocess_text, and that step is not stored in the pickled pipeline.
# Apply it here as well so inference sees the same kind of input as training
# (otherwise e.g. digits glued to words yield tokens the model never saw).
cleaned_sentences = [preprocess_text(sentence) for sentence in new_sentences]
predictions = loaded_pipeline.predict(cleaned_sentences)

print(predictions)  # Output: integer class ids (decode with le.inverse_transform)

[3 5]


In [15]:
# Convert the predicted integer class ids back into language codes
decoded_predictions = le.inverse_transform(predictions)
decoded_predictions

array(['en', 'fr'], dtype=object)