In [1]:
import pandas as pd
import numpy as np
import joblib

#Visualización
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns

#Métricas
import sklearn as sk
from sklearn import tree
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import confusion_matrix, classification_report, make_scorer

#Configuración Warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

#Regressors
import tensorflow as tf
from tensorflow import keras
from scikeras.wrappers import KerasRegressor

from keras import backend as K
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, KFold

#Análisis de Sentimientos
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler

2024-12-01 22:04:52.813757: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-01 22:04:52.817030: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-01 22:04:52.827110: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1733101492.843978 1159276 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1733101492.849083 1159276 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-01 22:04:52.866000: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU ins

In [2]:
user_stories_train = pd.read_csv('data/train.csv', low_memory=False)

In [3]:
user_stories_train.head()

Unnamed: 0,id,title,description,project,storypoint
0,5660,Error enabling Appcelerator services during ap...,"When creating the default app, I encountered t...",project8,3
1,9014,Create a maintenance branch,"As a developer, I'd like to have a maintenance...",project6,5
2,4094,Service Activity Monitoring Backend integrated...,SAM API used by SAM GUI,project1,5
3,811,fs::enter(rootfs) does not work if 'rootfs' is...,I noticed this when I was testing the unified ...,project5,2
4,4459,transform processor with script option is broken,Creating the following stream throws exception...,project6,2


In [4]:
user_stories_train.dtypes

id              int64
title          object
description    object
project        object
storypoint      int64
dtype: object

In [5]:
# Vemos cantidad de filas y columnas
user_stories_train_size = user_stories_train.shape
print("Cantidad de columnas: %d" % user_stories_train_size[1])
print("Cantidad de filas: %d" % user_stories_train_size[0])

Cantidad de columnas: 5
Cantidad de filas: 7900


In [6]:
# Analizamos si hay filas duplicadas
user_stories_train_total = len(user_stories_train)
user_stories_train_unique = user_stories_train.drop_duplicates()
user_stories_train_unique_size = len(user_stories_train_unique)
print(f'Se eliminaron: {user_stories_train_total - user_stories_train_unique_size} filas duplicadas')

Se eliminaron: 0 filas duplicadas


In [7]:
# Vemos cantidad de datos faltantes
user_stories_train.isna().sum()

id             0
title          0
description    0
project        0
storypoint     0
dtype: int64

In [8]:
#Verifico balanceo de clases
user_stories_train['storypoint'].value_counts(normalize=True)*100

storypoint
3     23.392405
5     21.430380
1     20.620253
2     16.151899
8     12.974684
4      2.101266
13     1.949367
10     0.405063
20     0.392405
6      0.215190
16     0.088608
12     0.075949
40     0.063291
21     0.037975
15     0.025316
7      0.025316
32     0.012658
34     0.012658
14     0.012658
24     0.012658
Name: proportion, dtype: float64

### Preparamos el dataset

In [9]:
#Creamos un dataset con features a usar para clasificar
user_stories_x = user_stories_train['description'].copy()

#Creamos un dataset con la variable target 'storypoint'
user_stories_y = user_stories_train['storypoint'].copy()

#Genero los conjuntos de train y test
x_train, x_test, y_train, y_test = train_test_split(user_stories_x,
                                                   user_stories_y,
                                                   test_size=0.3,  #proporcion 70/30
                                                   random_state=2) #semilla

In [10]:
x_train

2646    The plugin will need to contribute the icon. W...
2724    The timer: component is used to generate messa...
5159    h5.Description:  In CLI we have the ability to...
976     We need to refactor the way how we package and...
2187    Standalone Admin currently has no shiny banner...
                              ...                        
3606    h5. Description:  When launching Studio with C...
5704    When a .tss file contains certain grammars, Ti...
6637    The data that is entering a broadcast stream c...
2575    Provide the infrastructure for HTTP GET /compl...
7336    Studio shows an Empty Preferences window when ...
Name: description, Length: 5530, dtype: object

In [11]:
y_train

2646    8
2724    2
5159    8
976     2
2187    3
       ..
3606    5
5704    2
6637    2
2575    3
7336    3
Name: storypoint, Length: 5530, dtype: int64

## TensorFlow y Keras

###Cross Validation

In [12]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/users/pablo.prieto/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/users/pablo.prieto/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/users/pablo.prieto/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [13]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

def lemmatizationAndStopwords(text):
    wordNetLemmatizer = WordNetLemmatizer()
    stopwordSets = set(stopwords.words('english'))
    words = word_tokenize(text)
    wordsFilter = [wordNetLemmatizer.lemmatize(word.lower()) for word in words if word.lower() not in stopwordSets and word.isalpha()]
    if not wordsFilter:
        return "empty"
    return ' '.join(wordsFilter)

In [14]:
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(
    monitor='val_loss',  # Monitorea la pérdida en el conjunto de validación
    patience=5,          # Detiene si no mejora en 5 épocas
    restore_best_weights=True  # Restaura los mejores pesos encontrados
)

In [15]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.regularizers import l2

def build_model(input_dim, learning_rate=0.001, optimizer_name="Adam", callbacks=[early_stopping]):
    if optimizer_name == "Adam":
        optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    elif optimizer_name == "Adamax":
        optimizer = tf.keras.optimizers.Adamax(learning_rate=learning_rate)
    elif optimizer_name == "RMSprop":
        optimizer = tf.keras.optimizers.RMSprop(learning_rate=learning_rate)
    elif optimizer_name == "Nadam":
        optimizer = tf.keras.optimizers.Nadam(learning_rate=learning_rate)
    elif optimizer_name == "SGD":
        optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate, momentum=0.9)
    else:
        raise ValueError(f"Parametrizar optimizer: {optimizer_name}")

    # Definición del modelo
    model = Sequential([
        Dense(128, activation="relu", input_dim=input_dim),
        BatchNormalization(),  # Normalización para estabilidad
        Dropout(0.3), # Reduce el sobreajuste al apagar aleatoriamente el 30% de las unidades.
        Dense(64, activation="elu", kernel_regularizer=l2(0.01)), # Regularización L2: Penaliza pesos grandes, ayudando a evitar sobreajuste.
        BatchNormalization(),  # Normalización para estabilidad
        Dropout(0.4), # Reduce el sobreajuste al apagar aleatoriamente el 40% de las unidades.
        Dense(32, activation="relu"),
        Dense(1, activation="linear")  # Salida de regresión
    ])
    
    model.compile(
        optimizer=optimizer,
        loss='mean_squared_error',
        metrics=[tf.keras.metrics.RootMeanSquaredError()]
    )
    
    return model
    

In [16]:
keras_regressor = KerasRegressor(
    model=build_model,
    input_dim=100,
    learning_rate=0.001,
    optimizer_name="Adam",
    epochs=50,
    batch_size=32,
    verbose=0,
    callbacks=[early_stopping],
    warm_start=True # no recompilar innecesariamente trazas redundantes.
)

print(keras_regressor.get_params().keys())

dict_keys(['model', 'build_fn', 'warm_start', 'random_state', 'optimizer', 'loss', 'metrics', 'batch_size', 'validation_batch_size', 'verbose', 'callbacks', 'validation_split', 'shuffle', 'run_eagerly', 'epochs', 'input_dim', 'learning_rate', 'optimizer_name'])


In [17]:
##KFOLD CV usando TensorFlow y Keras (los mejores atributos, hiperparametros,etc)
from sklearn.pipeline import Pipeline

parameters = {
    "tfidf__analyzer": ["word"],
    "tfidf__ngram_range": [(1, 1),(1, 2),(1, 3)],
    "tfidf__use_idf": [True,False],
    "tfidf__stop_words": [['english']],
    "tfidf__max_features": [1000],
    "tfidf__min_df": [2,3,5],
    "tfidf__max_df": [0.75, 0.8],
    "tfidf__sublinear_tf": [True,False],
    "svd__n_components": [100],
    "svd__random_state" :[42],
    "keras_regressor__learning_rate": [0.01, 0.001, 0.0005],
    "keras_regressor__batch_size": [16, 32, 64, 128, 256, 512],
    "keras_regressor__epochs": [10, 20, 30, 50],
    "keras_regressor__optimizer_name": ["Adam", "Adamax", "RMSprop", "Nadam", "SGD"]
}

randomcv_best_score = None

#Iteración Cantidad de splits para el Cross Validation
# for folds in [10,11,12,13,14,15,16,17,18,20]:
# for folds in [6,7,8,9,10,11,12]:
for folds in [5,10]:
    
    #Kfold
    kfoldcv = KFold(n_splits=folds, shuffle=True, random_state=42)

    #Regressors
    pipeline = Pipeline(
        steps=[
            ("tfidf", TfidfVectorizer(preprocessor=lemmatizationAndStopwords)),
            ('svd', TruncatedSVD()),
            ('scaler', StandardScaler()),
            ("keras_regressor", keras_regressor),
        ]
    )
    
    #Metrica que quiero optimizar root_mean_squared_error
    scorer_fn = make_scorer(root_mean_squared_error)

    #Random Search Cross Validation
    #Cantidad de combinaciones que quiero probar
    n=20
    randomcv_it = RandomizedSearchCV(estimator = pipeline,
                                     param_distributions = parameters,
                                     scoring = scorer_fn,
                                     cv = kfoldcv,
                                     n_iter = n,
                                     verbose=1, 
                                     n_jobs=1)

    # #Grid Search Cross Validation
    # randomcv_it = GridSearchCV(estimator = pipeline,
    #                                  param_grid = parameters,
    #                                  scoring = scorer_fn,
    #                                  cv = kfoldcv) 

    #TensorFlow prefiere trabajar con tensores o matrices NumPy
    x_train = x_train.to_numpy() if hasattr(x_train, "to_numpy") else x_train
    y_train = y_train.to_numpy() if hasattr(y_train, "to_numpy") else y_train

    
    #Busco los hiperparamtros que optimizan F1 Score
    randomcv_it.fit(x_train,y_train);
    
    #Mejores hiperparametros del arbol
    print("folds: ", folds)
    print(randomcv_it.best_params_)
    #Mejor métrica
    print("root_mean_squared_error: ", randomcv_it.best_score_)
    print(" ")
    print("mean_test_score: ", randomcv_it.cv_results_['mean_test_score'])
    
    #Nos quedamos con el menor error root_mean_squared_error
    if randomcv_best_score is None:
        randomcv_best_score = randomcv_it
    elif randomcv_it.best_score_ < randomcv_best_score.best_score_:
        randomcv_best_score = randomcv_it

Fitting 5 folds for each of 20 candidates, totalling 100 fits


2024-12-01 22:05:01.641048: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


folds:  5
{'tfidf__use_idf': False, 'tfidf__sublinear_tf': False, 'tfidf__stop_words': ['english'], 'tfidf__ngram_range': (1, 2), 'tfidf__min_df': 2, 'tfidf__max_features': 1000, 'tfidf__max_df': 0.75, 'tfidf__analyzer': 'word', 'svd__random_state': 42, 'svd__n_components': 100, 'keras_regressor__optimizer_name': 'SGD', 'keras_regressor__learning_rate': 0.01, 'keras_regressor__epochs': 20, 'keras_regressor__batch_size': 512}
root_mean_squared_error:  3.26281727952131
 
mean_test_score:  [2.92190262 2.95565237 2.86776445 2.84863755 2.89862098 2.95989368
 2.96029748 3.26281728 2.87582244 2.87783348 2.9191342  2.8711147
 2.8639707  3.1004485  2.90644675 3.11308121 2.92844907 2.91691988
 2.99168075 2.93319896]
Fitting 10 folds for each of 20 candidates, totalling 200 fits
folds:  10
{'tfidf__use_idf': False, 'tfidf__sublinear_tf': True, 'tfidf__stop_words': ['english'], 'tfidf__ngram_range': (1, 1), 'tfidf__min_df': 5, 'tfidf__max_features': 1000, 'tfidf__max_df': 0.75, 'tfidf__analyzer': 

In [18]:
randomcv_best_score.best_params_

{'tfidf__use_idf': False,
 'tfidf__sublinear_tf': False,
 'tfidf__stop_words': ['english'],
 'tfidf__ngram_range': (1, 2),
 'tfidf__min_df': 2,
 'tfidf__max_features': 1000,
 'tfidf__max_df': 0.75,
 'tfidf__analyzer': 'word',
 'svd__random_state': 42,
 'svd__n_components': 100,
 'keras_regressor__optimizer_name': 'SGD',
 'keras_regressor__learning_rate': 0.01,
 'keras_regressor__epochs': 20,
 'keras_regressor__batch_size': 512}

RANDOM SEARCH CV

folds:  5
{'tfidf__use_idf': True, 'tfidf__sublinear_tf': True, 'tfidf__stop_words': ['english'], 'tfidf__ngram_range': (1, 2), 'tfidf__min_df': 3, 'tfidf__max_features': 3000, 'tfidf__max_df': 0.75, 'tfidf__analyzer': 'word', 'svd__random_state': 42, 'svd__n_components': 100, 'keras_regressor__optimizer_name': 'SGD', 'keras_regressor__learning_rate': 0.01, 'keras_regressor__epochs': 10, 'keras_regressor__batch_size': 256}

root_mean_squared_error:  3.0674813522442883


folds:  7
{'tfidf__use_idf': True, 'tfidf__sublinear_tf': True, 'tfidf__stop_words': ['english'], 'tfidf__ngram_range': (1, 1), 'tfidf__min_df': 5, 'tfidf__max_features': 1000, 'tfidf__max_df': 0.8, 'tfidf__analyzer': 'word', 'svd__random_state': 42, 'svd__n_components': 100, 'keras_regressor__optimizer_name': 'Nadam', 'keras_regressor__learning_rate': 0.01, 'keras_regressor__epochs': 20, 'keras_regressor__batch_size': 512}

root_mean_squared_error:  3.1816264999054167


folds:  10
{'tfidf__use_idf': False, 'tfidf__sublinear_tf': True, 'tfidf__stop_words': ['english'], 'tfidf__ngram_range': (1, 1), 'tfidf__min_df': 5, 'tfidf__max_features': 1000, 'tfidf__max_df': 0.75, 'tfidf__analyzer': 'word', 'svd__random_state': 42, 'svd__n_components': 100, 'keras_regressor__optimizer_name': 'Adamax', 'keras_regressor__learning_rate': 0.0005, 'keras_regressor__epochs': 20, 'keras_regressor__batch_size': 512}

root_mean_squared_error:  3.357244104258018





In [19]:
randomcv_best_score.best_score_

np.float64(3.26281727952131)

In [20]:
randomcv_best_score.cv_results_['mean_test_score']

array([2.92190262, 2.95565237, 2.86776445, 2.84863755, 2.89862098,
       2.95989368, 2.96029748, 3.26281728, 2.87582244, 2.87783348,
       2.9191342 , 2.8711147 , 2.8639707 , 3.1004485 , 2.90644675,
       3.11308121, 2.92844907, 2.91691988, 2.99168075, 2.93319896])

THE BEST: 

{'tfidf__use_idf': True, 'tfidf__sublinear_tf': True, 'tfidf__stop_words': ['english'], 'tfidf__ngram_range': (1, 2), 'tfidf__min_df': 3, 'tfidf__max_features': 3000, 'tfidf__max_df': 0.75, 'tfidf__analyzer': 'word', 'svd__random_state': 42, 'svd__n_components': 100, 'keras_regressor__optimizer_name': 'SGD', 'keras_regressor__learning_rate': 0.01, 'keras_regressor__epochs': 10, 'keras_regressor__batch_size': 256}

root_mean_squared_error: 3.0674813522442883