### Importing the Libraries

In [1]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import warnings
warnings.simplefilter("ignore")

In [3]:
# Loading the dataset
data = pd.read_csv("../scrapped_data/language_detection_dataset.csv")

In [4]:
data.head()

Unnamed: 0,Sentence,Language
0,"Ciascuno di questi ambiti, in cui convenzional...",it
1,"​ De esta forma, es posible referirse a los co...",es
2,ജീവിതം സുഖപ്രദവും അർത്ഥവത്തുമാക്കുന്നതിനുപകരിക...,ml
3,Een bekend voorbeeld daarvan is de klassieke m...,nl
4,النسبية العامة هي نظرية ذات طابع هندسي، توصل إ...,ar


In [5]:
# value count for each language
data["Language"].value_counts()

en    1483
pt    1367
ar    1187
es    1152
da     996
fr     914
tr     880
sv     760
it     701
ru     655
ml     518
nl     509
el     484
ta     430
hi     111
Name: Language, dtype: int64

In [6]:
# Label Encoding
le = LabelEncoder()
y = le.fit_transform(data["Language"])

# Train-test split
x_train, x_test, y_train, y_test = train_test_split(data["Sentence"], y, test_size=0.2, random_state=42)


def preprocess_text(text):
    text = re.sub(r'[!@#$(),\n"%^*?\:;~`0-9]', ' ', text)
    text = re.sub(r'[[]]', ' ', text)
    return text.lower()

x_train = x_train.apply(preprocess_text)
x_test = x_test.apply(preprocess_text)

# Define the pipeline: TF-IDF + Naïve Bayes
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),  # Step 1: TF-IDF transformation
    ('model', MultinomialNB())     # Step 2: Train the Naïve Bayes model
])

# Train the pipeline
pipeline.fit(x_train, y_train)

# Make predictions
y_pred = pipeline.predict(x_test)

In [7]:
le.classes_

array(['ar', 'da', 'el', 'en', 'es', 'fr', 'hi', 'it', 'ml', 'nl', 'pt',
       'ru', 'sv', 'ta', 'tr'], dtype=object)

### Evaluating the model

In [8]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

ac = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)

In [12]:
print("Accuracy is :", round(ac,3))

Accuracy is : 0.966


In [13]:
print(cr)

              precision    recall  f1-score   support

           0       1.00      0.97      0.98       228
           1       0.99      0.91      0.95       181
           2       1.00      0.98      0.99        92
           3       0.81      1.00      0.89       310
           4       1.00      0.91      0.95       232
           5       0.99      0.99      0.99       198
           6       1.00      0.56      0.72        25
           7       1.00      0.95      0.97       155
           8       1.00      0.97      0.99       111
           9       1.00      0.96      0.98       104
          10       0.99      1.00      0.99       263
          11       1.00      0.98      0.99       121
          12       0.99      1.00      1.00       141
          13       1.00      1.00      1.00        91
          14       1.00      0.97      0.98       178

    accuracy                           0.97      2430
   macro avg       0.98      0.94      0.96      2430
weighted avg       0.97   

### Model Saving

In [15]:
with open("label_encoder.pkl", "wb") as f:
    pickle.dump(le, f)  # `le` is your fitted LabelEncoder

with open("pipeline.pkl", "wb") as f:
    pickle.dump(pipeline, f)
    
with open("../app/model/pipeline-0.1.0.pkl", "wb") as f:
    pickle.dump(pipeline, f)

with open("../app/model/encoder-0.1.0.pkl", "wb") as f:
    pickle.dump(le, f)

### Model Load and Scoring

In [16]:
import pickle

with open("pipeline.pkl", "rb") as f:
    loaded_pipeline = pickle.load(f)

with open("label_encoder.pkl", "rb") as f:
    le = pickle.load(f)

In [17]:
# Make predictions with the loaded pipeline
new_sentences = ["This is a test sentence"]
predictions = loaded_pipeline.predict(new_sentences)

print(predictions)

[3]


In [18]:
decoded_predictions = le.inverse_transform(predictions)
decoded_predictions

array(['en'], dtype=object)

In [19]:
decoded_predictions[0]

'en'