## Preprocès: ##

- Eliminar els digits del text
- Convertir tot el text a minúscula
- Substitueix els espais en blanc continus per un de sol
- Concatena totes les frases amb un espai doble al mig 


In [40]:
import re 
from nltk.collocations import TrigramCollocationFinder, ngrams
from collections import Counter, defaultdict
import os
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, f1_score, confusion_matrix
from sklearn.preprocessing import MinMaxScaler

In [None]:
def preprocess_text(text):
    text = re.sub(r'\d+', '', text) 
    text = text.lower()  
    text = re.sub(r'\s+', ' ', text)  
    text = re.sub(r'\n', '  ', text)  
    return text

def load_and_preprocess_data(data_path):
    with open(data_path, 'r', encoding='utf-8') as file:
        text = file.read()
    return preprocess_text(text)


languages = ['deu', 'eng', 'fra', 'ita', 'nld', 'spa']
path = 'C:/Users/Paess/Documents/Uni/Segon/PLH/langId'
train = {}
test = []

for idiom in languages:
    file_path = os.path.join(path, f'{idiom}_trn.txt')
    text = load_and_preprocess_data(file_path)
    train[idiom] = text

for idiom in languages:
    file_path = os.path.join(path, f'{idiom}_tst.txt')
    texts = load_and_preprocess_data(file_path).split('  ')
    test.extend([(text, idiom) for text in texts])
    X_test, y_test = zip(*test)


train_split = {}
val_split = {}

for idiom in languages:
    text = train[idiom]
    split_index = int(len(text) * 0.8)
    train_split[idiom] = text[:split_index]
    val_split[idiom] = text[split_index:]



In [None]:
print(train_split['eng'][:20])

 • • • . .% how the 


In [None]:
def generate_trigrams(text):
    
    lletres = list(text)  # Convierte el texto en una lista de caracteres
    trigrams = [''.join(trigram) for trigram in ngrams(lletres, 3) if len(trigram) == 3]
    
    print(f"Ejemplo de trigramas generados para el texto: {text[:50]} ...")  # Depuración
    print(f"Primeros trigramas generados: {trigrams[:10]}\n")  # Depuración
    print(f"Primeros trigramas generados: {trigrams}\n")  # Depuración
    
    return trigrams

def frequent_trigrams(trigrams):
    trigram_counts = Counter(trigrams)
    frequent_trigrams_set = {trigram for trigram, count in trigram_counts.items() if count >= 5}
    return [trigram for trigram in trigrams if trigram in frequent_trigrams_set]



for idiom in train_split: 
    train_split[idiom] = frequent_trigrams(generate_trigrams(train_split[idiom]))
    val_split[idiom] = frequent_trigrams(generate_trigrams(val_split[idiom]))
    


In [47]:
print(train_split['deu'][:10])
print(train_split['eng'][:10])
print(train_split['fra'][:10])


[' ..', '.. ', '. w', ' wi', 'wis', 'iss', 'sse', 'sen', 'ens', 'nsc']
[' • ', ' • ', ' • ', ' . ', '. .', ' .%', '.% ', ' ho', 'how', 'ow ']
[' le', 'le ', 'e p', ' pr', 'pré', 'rés', 'ési', 'sid', 'ide', 'den']


In [48]:
for lang, trigrams in train_split.items():
    print(f"Idioma: {lang}, Número de trigramas: {len(trigrams)}")

Idioma: deu, Número de trigramas: 2683074
Idioma: eng, Número de trigramas: 2870839
Idioma: fra, Número de trigramas: 2918039
Idioma: ita, Número de trigramas: 3024687
Idioma: nld, Número de trigramas: 2496599
Idioma: spa, Número de trigramas: 3131846


In [49]:
info_lang = {}
for lang, trigrams in train_split.items():
    NT = len(trigrams)
    B = 30**3
    info_lang[lang] = {'NT': NT, 'B': B}

LID = Ct(ej)+lambda/Nt+lambdaB, B= # trigrames diferents Nt= # trigrames 

In [50]:
def LID(count_trigram, NT, B, lam=0.5):
    return np.log((count_trigram + lam) / (NT + lam * B))


In [51]:
def entrenament_model(train, lam=0.5):
    trigram_counts = {trigram: {lang: 0 for lang in languages} for lang in languages for trigram in train.get(lang, [])}
    
    for lang, lang_trigrams in train.items():
        for trigram in lang_trigrams:
            trigram_counts[trigram][lang] += 1
    
    trigram_LID = {}
    for trigram, lang_counts in trigram_counts.items():
        trigram_LID[trigram] = {lang: LID(count, info_lang[lang]['NT'], info_lang[lang]['B'], lam) for lang, count in lang_counts.items()}
    
    return trigram_LID

In [58]:
def predict_language(text, trigram_LID, lam=0.5):
    text_preprocessat = preprocess_text(text)  
    trigrams = generate_trigrams(text_preprocessat) 
    
    LID_per_lang = {lang: 0 for lang in info_lang}  
    
    for trigram in trigrams:
        if trigram in trigram_LID:
            LID_per_lang_trigram = trigram_LID[trigram]  
        else:
            LID_per_lang_trigram = {lang: LID(0, info_lang[lang]['NT'], info_lang[lang]['B'], lam) for lang in info_lang}
        for lang, LID_trigram in LID_per_lang_trigram.items():
            LID_per_lang[lang] += LID_trigram 
    max_language = max(LID_per_lang, key=LID_per_lang.get)  
    
    return LID_per_lang, max_language

In [53]:
trigram_LID = entrenament_model(train_split, 0.5)


In [59]:
text = "Hola que tal como estas"

print(predict_language(text, trigram_LID, 0.5))

Ejemplo de trigramas generados para el texto: hola que tal como estas ...
Primeros trigramas generados: ['hol', 'ola', 'la ', 'a q', ' qu', 'que', 'ue ', 'e t', ' ta', 'tal']

Primeros trigramas generados: ['hol', 'ola', 'la ', 'a q', ' qu', 'que', 'ue ', 'e t', ' ta', 'tal', 'al ', 'l c', ' co', 'com', 'omo', 'mo ', 'o e', ' es', 'est', 'sta', 'tas']

({'deu': np.float64(-192.46890045147802), 'eng': np.float64(-171.32050728704098), 'fra': np.float64(-162.66588996274788), 'ita': np.float64(-148.91650573329719), 'nld': np.float64(-187.58083504450917), 'spa': np.float64(-135.4831993369978)}, 'spa')
