# Imports

In [4]:
from utils.datautils import *
from utils.MLutils import *
from utils.resources import *
from sklearn.model_selection import train_test_split
from transformers import BertTokenizerFast, BertModel
from collections import Counter
import random
from sklearn.metrics import classification_report
from data.variables import *

## Busqueda de fuentes

- Fuente 1: Conjunto de preguntas en espa;ol
- Fuente 2: Dataset provisto para Notebook 10
- Fuente 3: Dataset sintetico generado con Gemini
- Fuente 4: Articulos de Wikipedia
- Fuente 5: Subtitulos de peliculas
- Fuente 6: Mixture of preguntas y afirmaciones

In [5]:
questions, question_for_mixture = get_questions()
oraciones_rnn = get_notebook_dataset()
oraciones_sinteticas = get_gemini_dataset()
frases_wikipedia = get_wikipedia_dataset()
esperando_la_carroza, frases_relatos_salvajes = get_pelis_dataset()
mixtures = get_mixture_dataset(oraciones_sinteticas, question_for_mixture)

Se descargaron 5000 preguntas en Español.
Se descargaron 997 oraciones en Español (del dataset del notebook 10).
Hay 1413 oraciones sintéticas.
Se cargaron 6648 frases de Wikipedia.
Se extrajeron 947 frases completas y se guardaron en 'dialogos_esperando_la_carroza.json'
Frases extraídas en total: 947
✅ Se extrajeron 1000 frases de Relatos Salvajes.


## Juntamos las fuentes

In [6]:
oraciones_raw = questions + oraciones_rnn + oraciones_sinteticas + frases_wikipedia + esperando_la_carroza  + frases_relatos_salvajes + mixtures

print('Cantidad total de oraciones:',len(oraciones_raw))
print('Cantidad de oraciones de preguntas:',len(questions))
print('Cantidad de oraciones en espa;ol de hugging face:',len(oraciones_rnn))
print('Cantidad de oraciones sintéticas:',len(oraciones_sinteticas))
print('Cantidad de oraciones de Wikipedia:',len(frases_wikipedia))
print('Cantidad de oraciones de Esperando la carroza:',len(esperando_la_carroza))
print('Cantidad de oraciones de Relatos Salvajes:',len(frases_relatos_salvajes))
print('Cantidad de oraciones Compuestas:',len(mixtures))

print("Algunas oraciones aleatorias:")
random.sample(oraciones_raw, 5)

Cantidad total de oraciones: 20244
Cantidad de oraciones de preguntas: 5000
Cantidad de oraciones en espa;ol de hugging face: 997
Cantidad de oraciones sintéticas: 1413
Cantidad de oraciones de Wikipedia: 6648
Cantidad de oraciones de Esperando la carroza: 947
Cantidad de oraciones de Relatos Salvajes: 1000
Cantidad de oraciones Compuestas: 4239
Algunas oraciones aleatorias:


['Por ejemplo, el hecho de entrar antes o después que un rival, o utilizar una dureza diferente en los neumáticos, puede variar las posiciones de una carrera.',
 'Tras pelear el torneo ante el C.',
 '¿Llevaba dinero?',
 '¿Quién sugirió la incorporación del jugador brasileño según el presidente? ¿Cuál es la principal causa de este retroceso?',
 '¿Por qué se creó Kraft una imagen negativa de Chance Vought?']

## Importamos el modelo

In [7]:
model_name = "bert-base-multilingual-cased"
tokenizer = BertTokenizerFast.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

In [8]:
generate_dataset("¿y vos ¿como?", tokenizer)

[{'word': 'y',
  'token': 193,
  'prev_token': -1,
  'next_token': 63299,
  'has_accent': 0,
  'position': 0.0,
  'starting_punctuation_type': 4,
  'ending_punctuation_type': 0,
  'capitalization_type': 0},
 {'word': 'vos',
  'token': 63299,
  'prev_token': 193,
  'next_token': 10225,
  'has_accent': 0,
  'position': 0.5,
  'starting_punctuation_type': 0,
  'ending_punctuation_type': 0,
  'capitalization_type': 0},
 {'word': 'como',
  'token': 10225,
  'prev_token': 63299,
  'next_token': -1,
  'has_accent': 0,
  'position': 1.0,
  'starting_punctuation_type': 4,
  'ending_punctuation_type': 3,
  'capitalization_type': 0}]

In [9]:
dataset = []
count = 0
for sentence in oraciones_raw:
    if count % 2500 == 0:
      print(f"vamos {count}")
    dataset.append(generate_dataset(sentence, tokenizer))
    count += 1

flattened_dataset = [item for sublist in dataset for item in sublist]

vamos 0
vamos 2500
vamos 5000
vamos 7500
vamos 10000
vamos 12500
vamos 15000
vamos 17500
vamos 20000


In [10]:
X = [[item['token'], item['prev_token'], item['next_token'],  item['has_accent'], item['position']] for item in flattened_dataset]
y_capitalization = [item['capitalization_type'] for item in flattened_dataset]
y_ending_punctuation = [item['ending_punctuation_type'] for item in flattened_dataset]
y_starting_punctuation = [item['starting_punctuation_type'] for item in flattened_dataset]

print("Distribución de clases en y_capitalization:")
print(Counter(y_capitalization))

print("Distribución de clases en y_ending_punctuation:")
print(Counter(y_ending_punctuation))

print("Distribución de clases en y_starting_punctuation:")
print(Counter(y_starting_punctuation))


Distribución de clases en y_capitalization:
Counter({0: 306455, 1: 96921, 3: 4417, 2: 1560})
Distribución de clases en y_ending_punctuation:
Counter({0: 374050, 2: 13626, 3: 11253, 1: 10424})
Distribución de clases en y_starting_punctuation:
Counter({0: 397536, 4: 11729, 2: 82, 3: 3, 1: 3})


In [11]:
undersample_freq = {
    0: 150000,
    1: 40000,
}
X_capitalization, y_capitalization = undersample(X, y_capitalization,
                                                  undersample_freq)

undersample_freq_punctuation = {
    0: 60000,
}
X_ending_punctuation, y_ending_punctuation = undersample(X, y_ending_punctuation,
                                            undersample_freq_punctuation)

undersample_freq_punctuation = {
    0: 200000,
}
X_starting_punctuation, y_starting_punctuation = undersample(X, y_starting_punctuation,
                                            undersample_freq_punctuation)

In [12]:
# ARBOL PARA CAPITALIZACION
from sklearn.ensemble import RandomForestClassifier

X_train_cap, X_test_cap, y_train_cap, y_test_cap = train_test_split(X_capitalization, y_capitalization, test_size=0.2, random_state=42)
clf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=30)

clf.fit(list(X_train_cap), list(y_train_cap))

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [13]:
# ARBOL PARA PUNTUACION INICIAL

X_train_punc_start, X_test_punc_start, y_train_punc_start, y_test_punc_start = train_test_split(X_starting_punctuation, y_starting_punctuation, test_size=0.2, random_state=42)
clf_punc_start = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced', n_jobs=30)

clf_punc_start.fit(list(X_train_punc_start), list(y_train_punc_start))

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [14]:
# ARBOL PARA PUNTUACION FINAL

X_train_punc_end, X_test_punc_end, y_train_punc_end, y_test_punc_end = train_test_split(X_ending_punctuation, y_ending_punctuation, test_size=0.2, random_state=42)
clf_punc_end = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=30)

clf_punc_end.fit(list(X_train_punc_end), list(y_train_punc_end))

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [15]:
score = clf.score(X_test_cap, y_test_cap)
print(f"Accuracy cap: {score}")

score = clf_punc_end.score(X_test_punc_end, y_test_punc_end)
print(f"Accuracy end: {score}")

score = clf_punc_start.score(X_test_punc_end, y_test_punc_end)
print(f"Accuracy start: {score}")

Accuracy cap: 0.8738646800693949
Accuracy end: 0.7814385394260532
Accuracy start: 0.6084675515450396


In [16]:

y_pred_cap = clf.predict(X_test_cap)
print("Classification report for capitalization:")
print(classification_report(y_test_cap, y_pred_cap))

y_pred_punc_end = clf_punc_end.predict(X_test_punc_end)
print("Classification report for ending punctuation:")
print(classification_report(y_test_punc_end, y_pred_punc_end))

y_pred_punc_start = clf_punc_start.predict(X_test_punc_start)
print("Classification report for starting punctuation:")
print(classification_report(y_test_punc_start, y_pred_punc_start))

Classification report for capitalization:
              precision    recall  f1-score   support

           0       0.88      0.98      0.92     30065
           1       0.85      0.54      0.66      7884
           2       0.90      0.44      0.60       324
           3       0.85      0.54      0.66       923

    accuracy                           0.87     39196
   macro avg       0.87      0.62      0.71     39196
weighted avg       0.87      0.87      0.86     39196

Classification report for ending punctuation:
              precision    recall  f1-score   support

           0       0.83      0.97      0.90     12018
           1       0.56      0.21      0.31      2050
           2       0.67      0.59      0.63      2684
           3       0.62      0.52      0.57      2309

    accuracy                           0.78     19061
   macro avg       0.67      0.57      0.60     19061
weighted avg       0.76      0.78      0.76     19061

Classification report for starting punctua

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [17]:
entrada = "cómo te llamás"
df, salida = random_forest_predict_and_reconstruct(
        clf,
        clf_punc_start,
        clf_punc_end,
        entrada,
        tokenizer,
        verbose=False
)

print(df)
print(salida)

   instancia_id  token_id token  punt_inicial  punt_final  capitalización
0             1     39649  cómo             4           0               1
1             2     10361    te             0           0               0
2             3     22469    ll             0           0               0
3             3     11008  ##am             0           0               0
4             3     12299  ##ás             0           3               0
¿Cómo te llamás?
