## Preprocès: ##

- Eliminar els digits del text
- Convertir tot el text a minúscula
- Substitueix els espais en blanc continus per un de sol
- Concatena totes les frases amb un espai doble al mig 


In [1]:
import re 
from nltk.collocations import TrigramCollocationFinder, ngrams
from collections import Counter, defaultdict
import os
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, f1_score, confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import glob

In [109]:
def preprocess_text(text):
    text = re.sub(r'\d+', '', text) 
    text = text.lower()  
    text = re.sub(r'\s+', ' ', text)  
    text = re.sub(r'\n', '  ', text)  
    return text

def load_and_preprocess_data(data_path):
    with open(data_path, 'r', encoding='utf-8') as file:
        text = file.read()
    return preprocess_text(text)


languages = ['deu', 'eng', 'fra', 'ita', 'nld', 'spa']
path = 'C:/Users/Paess/Documents/Uni/Segon/PLH/langId'
train = {}
test = []

for idiom in languages:
    file_path = os.path.join(path, f'{idiom}_trn.txt')
    text = load_and_preprocess_data(file_path)
    train[idiom] = text

import re

def load_data_test(filenames):
    texts = []
    labels = []
    for file in filenames:
        label = os.path.basename(file).split('_')[0]  # The tag is the first part of the filename
        with open(file, 'r', encoding='utf-8') as f:
            for line in f:
                texts.append(line.strip())
                labels.append(label)
    return texts, labels

test_files = glob.glob('*_tst.txt')
X_test, y_test = load_data_test(test_files)


train_split = {}
val_split = {}

for idiom in languages:
    text = train[idiom]
    split_index = int(len(text) * 0.8)
    train_split[idiom] = text[:split_index]
    val_split[idiom] = text[split_index:]



In [110]:
print(len(X_test))

59977


In [None]:
print(train_split['eng'][:20])

 • • • . .% how the 


In [111]:
def generate_trigrams(text):
    
    lletres = list(text)  # Convierte el texto en una lista de caracteres
    trigrams = [''.join(trigram) for trigram in ngrams(lletres, 3) if len(trigram) == 3]
    return trigrams

def frequent_trigrams(trigrams):
    trigram_counts = Counter(trigrams)
    frequent_trigrams_set = {trigram for trigram, count in trigram_counts.items() if count >= 5}
    return [trigram for trigram in trigrams if trigram in frequent_trigrams_set]



for idiom in train_split: 
    train_split[idiom] = frequent_trigrams(generate_trigrams(train_split[idiom]))
    val_split[idiom] = frequent_trigrams(generate_trigrams(val_split[idiom]))
    


In [112]:
info_lang = {}
for lang, trigrams in train_split.items():
    NT = len(trigrams)
    B = 30**3
    info_lang[lang] = {'NT': NT, 'B': B}

LID = Ct(ej)+lambda/Nt+lambdaB, B= # trigrames diferents Nt= # trigrames 

In [102]:
def LID(count_trigram, NT, B, lam=0.5):
    result= np.log((count_trigram + lam) / (NT + lam * B))
    return result


In [103]:
def entrenament_model(train, lam=0.5):
    trigram_counts = {trigram: {lang: 0 for lang in languages} for lang in languages for trigram in train.get(lang, [])}
    
    for lang, lang_trigrams in train.items():
        for trigram in lang_trigrams:
            trigram_counts[trigram][lang] += 1
    
    trigram_LID = {}
    for trigram, lang_counts in trigram_counts.items():
        trigram_LID[trigram] = {lang: LID(count, info_lang[lang]['NT'], info_lang[lang]['B'], lam) for lang, count in lang_counts.items()}
    
    return trigram_LID

In [104]:
import numpy as np

def predict(text, trigram_LID, lam=0.5):
    text_preprocessat = preprocess_text(text)  
    trigrams = generate_trigrams(text_preprocessat) 
    
    LID_per_lang = {lang: 0 for lang in info_lang}  
    
    for trigram in trigrams:
        if trigram in trigram_LID:
            LID_per_lang_trigram = trigram_LID[trigram]  
        else:
            LID_per_lang_trigram = {lang: LID(0, info_lang[lang]['NT'], info_lang[lang]['B'], lam) for lang in info_lang}
        for lang, LID_trigram in LID_per_lang_trigram.items():
            LID_per_lang[lang] += LID_trigram 

    min_val = min(LID_per_lang.values())
    max_val = max(LID_per_lang.values())
    
    if max_val == min_val:  # Evitar división por cero
        probabilities = {lang: float(100 / len(LID_per_lang)) for lang in LID_per_lang}
    else:
        probabilities = {lang: float(((score - min_val) / (max_val - min_val)) * 100) for lang, score in LID_per_lang.items()}  

    # Idioma con mayor puntaje
    max_language = max(probabilities, key=probabilities.get)  

    return probabilities, max_language

In [113]:
trigram_LID = entrenament_model(train_split, 0.5)


In [114]:
text = "Hola, com estás?"

print(predict(text, trigram_LID, 0.5))

({'deu': 0.0, 'eng': 35.04509486508014, 'fra': 35.185960429637674, 'ita': 44.41237071387217, 'nld': 5.242111175603981, 'spa': 100.0}, 'spa')


In [107]:
# Function to evaluate the results of the LID system by comparing the predicted languages with the actual languages

def evaluate_results(X_test, y_test, trigram_LID, lam=0.5, show_missclassified=False):
    y_pred = []
    for sentence, actual_lang in zip(X_test, y_test):
        _, lang = predict(sentence, trigram_LID, lam)
        y_pred.append(lang)
        if show_missclassified and lang != actual_lang:
            print(f"Misclassified sentence: {sentence}")
            print(f"Predicted language: {lang}")
            print(f"Actual language: {actual_lang}")
            print("---")
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average=None)
    f1 = f1_score(y_test, y_pred, average='weighted')
    conf_matrix = confusion_matrix(y_test, y_pred)
    return accuracy, precision, f1, conf_matrix

In [115]:
# Predicctions for the test data and calculation of the accuracy and the confusion matrix


accuracy, precision, f1, conf_matrix = evaluate_results(X_test, y_test, trigram_LID, 0.5)

# Mostrar métricas de evaluación
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision}')
print(f'F1-score: {f1:.4f}')

# # Define the class labels
# classes = np.unique(y_test)


# # Plot the confusion matrix
# plt.figure(figsize=(8, 6))
# sns.set(font_scale=1.2)
# sns.heatmap(conf_matrix, annot=True, annot_kws={"size": 12}, cmap="Blues", fmt=".2f", xticklabels=classes, yticklabels=classes)
# plt.xlabel('Predicted Label')
# plt.ylabel('True Label')
# plt.title('Confusion Matrix')
# plt.show()


Accuracy: 0.9989
Precision: [0.9989992  0.99730297 0.9993997  0.99919936 0.9989987  0.9993    ]
F1-score: 0.9989
