### Importing the Libraries

In [1]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import warnings
warnings.simplefilter("ignore")

In [5]:
# Load the labelled sentences (one row per sentence with its language code).
DATASET_PATH = "../scrapped_data/language_detection_dataset.csv"
data = pd.read_csv(DATASET_PATH)

In [6]:
# Preview the first rows to sanity-check the columns that were read.
# NOTE(review): the "Unnamed: 0" column in the output looks like a row index
# that was saved into the CSV - confirm; it is not used by later cells.
data.head()

Unnamed: 0,Sentence,Language
0,"L'aspect expérimental reste central en chimie,...",fr
1,Principen om superponering anger att ett sedim...,sv
2,Οι έρευνες του Τζαν Σουάμμερνταμ στην εντομολο...,el
3,La géologie est une science comprenant de nomb...,fr
4,A Química Inorgânica tem aplicações em todos o...,pt


In [7]:
# Number of sentences per language code. The recorded counts are uneven
# (e.g. "hi" has far fewer rows than "en"), which is worth keeping in mind
# when reading the per-class metrics further down.
data["Language"].value_counts()

en    1464
pt    1367
ar    1187
es    1152
da     996
fr     921
tr     880
sv     760
it     701
ru     658
ml     518
nl     509
el     484
ta     430
hi     111
Name: Language, dtype: int64

In [8]:
# Turn the language codes into integer class ids. `le` is kept around so that
# predicted ids can be mapped back to language codes later on.
le = LabelEncoder()
language_codes = data["Language"]
y = le.fit_transform(language_codes)

# Hold out 20% of the sentences for evaluation; the seed is fixed so the split
# is the same on every run.
sentences = data["Sentence"]
x_train, x_test, y_train, y_test = train_test_split(
    sentences,
    y,
    test_size=0.2,
    random_state=42,
)

def preprocess_text(text):
    """Normalise a sentence before it is vectorised.

    Each occurrence of the listed punctuation marks, ASCII digits, newlines
    and square brackets is replaced by a single space, and the result is
    lower-cased. Letters of any script are left untouched.

    Args:
        text: The raw sentence as a ``str``.

    Returns:
        The cleaned, lower-cased string.
    """
    text = re.sub(r'[!@#$(),\n"%^*?\:;~`0-9]', ' ', text)
    # The brackets must be escaped inside the character class: an unescaped
    # '[[]]' is read as the class "[" followed by a literal "]", which only
    # matches the two-character string "[]" and leaves lone brackets in place.
    text = re.sub(r'[\[\]]', ' ', text)
    return text.lower()

# Clean both splits with the same function so train and test text look alike.
x_train, x_test = (split.apply(preprocess_text) for split in (x_train, x_test))

# Two-stage model: TF-IDF features feeding a multinomial Naive Bayes classifier.
tfidf_step = ("tfidf", TfidfVectorizer())
model_step = ("model", MultinomialNB())
pipeline = Pipeline(steps=[tfidf_step, model_step])

# Fit on the training split only, then predict the held-out sentences.
pipeline.fit(x_train, y_train)
y_pred = pipeline.predict(x_test)

In [9]:
# Language codes known to the encoder; the position of a code in this array is
# the integer class id used in y / y_pred (e.g. position 3 is 'en').
le.classes_

array(['ar', 'da', 'el', 'en', 'es', 'fr', 'hi', 'it', 'ml', 'nl', 'pt',
       'ru', 'sv', 'ta', 'tr'], dtype=object)

### Evaluating the model

In [10]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Compare the held-out labels with the pipeline's predictions.
ac = accuracy_score(y_true=y_test, y_pred=y_pred)            # overall accuracy
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)          # per-class confusion counts
cr = classification_report(y_true=y_test, y_pred=y_pred)     # text precision/recall/F1 table

In [11]:
# Show the held-out accuracy rounded to three decimals.
accuracy_rounded = round(ac, 3)
print("Accuracy is :", accuracy_rounded)

Accuracy is : 0.965


In [12]:
# Per-class precision / recall / F1. The row labels are the integer class ids
# produced by the label encoder, not language codes; le.classes_ gives the
# code for each id.
print(cr)

              precision    recall  f1-score   support

           0       1.00      0.96      0.98       220
           1       0.97      0.91      0.94       208
           2       1.00      0.99      1.00       114
           3       0.80      1.00      0.89       291
           4       0.98      0.94      0.96       203
           5       1.00      0.99      0.99       195
           6       1.00      0.72      0.84        25
           7       1.00      0.96      0.98       140
           8       1.00      0.96      0.98       107
           9       1.00      0.93      0.96       103
          10       1.00      1.00      1.00       270
          11       1.00      0.93      0.96       125
          12       0.99      0.97      0.98       154
          13       1.00      0.99      0.99        96
          14       1.00      0.99      0.99       177

    accuracy                           0.97      2428
   macro avg       0.98      0.95      0.96      2428
weighted avg       0.97   

### Model Saving

In [13]:
# Persist the fitted pipeline and the label encoder for both serving apps.
# Each entry is (destination path, object to pickle).
artifacts = [
    ("../fastAPI/app/model/pipeline-0.1.0.pkl", pipeline),
    ("../Flask/app/model/pipeline-0.1.0.pkl", pipeline),
    ("../fastAPI/app/model/encoder-0.1.0.pkl", le),
    ("../Flask/app/model/encoder-0.1.0.pkl", le),
]

for artifact_path, artifact in artifacts:
    with open(artifact_path, "wb") as f:
        pickle.dump(artifact, f)

### Model Load and Scoring

In [14]:
# Reload the artifacts written in the "Model Saving" section to check that the
# pickled pipeline and encoder round-trip. The previous paths ("pipeline.pkl",
# "label_encoder.pkl") are never written by this notebook, so on a fresh run
# they either fail or pick up stale files. The fastAPI copies are read here;
# the Flask copies are written from the same objects.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open("../fastAPI/app/model/pipeline-0.1.0.pkl", "rb") as f:
    loaded_pipeline = pickle.load(f)

with open("../fastAPI/app/model/encoder-0.1.0.pkl", "rb") as f:
    le = pickle.load(f)

In [15]:
# Make predictions with the loaded pipeline
new_sentences = ["This is a test sentence"]
predictions = loaded_pipeline.predict(new_sentences)

print(predictions)

[3]


In [16]:
# Map the predicted integer class ids back to language codes with the encoder.
decoded_predictions = le.inverse_transform(predictions)
decoded_predictions

array(['en'], dtype=object)

In [17]:
# Language code predicted for the first (here: only) input sentence.
decoded_predictions[0]

'en'