In [1]:
import pandas as pd
import numpy as np
import joblib

#Visualización
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns

#Métricas
import sklearn as sk
from sklearn import tree
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import confusion_matrix, classification_report, make_scorer

#Configuración Warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

#Regressors
import tensorflow as tf
from tensorflow import keras
from scikeras.wrappers import KerasRegressor

from keras import backend as K
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, KFold

#Análisis de Sentimientos
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

2024-12-01 22:43:02.892383: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-01 22:43:02.896302: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-01 22:43:02.908437: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1733103782.929137 1247542 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1733103782.935283 1247542 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-01 22:43:02.955196: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU ins

In [2]:
user_stories_train = pd.read_csv('data/train.csv', low_memory=False)
user_stories_test = pd.read_csv('data/test.csv', low_memory=False)

In [3]:
#Verifico balanceo de clases
user_stories_train['storypoint'].value_counts(normalize=True)*100

storypoint
3     23.392405
5     21.430380
1     20.620253
2     16.151899
8     12.974684
4      2.101266
13     1.949367
10     0.405063
20     0.392405
6      0.215190
16     0.088608
12     0.075949
40     0.063291
21     0.037975
15     0.025316
7      0.025316
32     0.012658
34     0.012658
14     0.012658
24     0.012658
Name: proportion, dtype: float64

In [4]:
#Creamos un dataset con features a usar para clasificar
user_stories_x = user_stories_train['description'].copy()
user_stories_test_x = user_stories_test['description'].copy()

#Creamos un dataset con la variable target 'storypoint'
user_stories_y = user_stories_train['storypoint'].copy()

In [5]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/users/pablo.prieto/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/users/pablo.prieto/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/users/pablo.prieto/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

def lemmatizationAndStopwords(text):
    wordNetLemmatizer = WordNetLemmatizer()
    stopwordSets = set(stopwords.words('english'))
    words = word_tokenize(text)
    wordsFilter = [wordNetLemmatizer.lemmatize(word.lower()) for word in words if word.lower() not in stopwordSets and word.isalpha()]
    if not wordsFilter:
        return "empty"
    return ' '.join(wordsFilter)

In [7]:
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(
    monitor='val_loss',  # Monitorea la pérdida en el conjunto de validación
    patience=5,          # Detiene si no mejora en 5 épocas
    restore_best_weights=True  # Restaura los mejores pesos encontrados
)

In [8]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.regularizers import l2

def build_model(input_dim, learning_rate=0.01, optimizer_name="SGD", callbacks=[early_stopping]):
    if optimizer_name == "Adam":
        optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    elif optimizer_name == "Adamax":
        optimizer = tf.keras.optimizers.Adamax(learning_rate=learning_rate)
    elif optimizer_name == "RMSprop":
        optimizer = tf.keras.optimizers.RMSprop(learning_rate=learning_rate)
    elif optimizer_name == "Nadam":
        optimizer = tf.keras.optimizers.Nadam(learning_rate=learning_rate)
    elif optimizer_name == "SGD":
        optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate, momentum=0.9)
    else:
        raise ValueError(f"Parametrizar optimizer: {optimizer_name}")

    # Definición del modelo
    model = Sequential([
        Dense(128, activation="relu", input_dim=input_dim),
        BatchNormalization(),  # Normalización para estabilidad
        Dropout(0.3), # Reduce el sobreajuste al apagar aleatoriamente el 30% de las unidades.
        Dense(64, activation="elu", kernel_regularizer=l2(0.01)), # Regularización L2: Penaliza pesos grandes, ayudando a evitar sobreajuste.
        BatchNormalization(),  # Normalización para estabilidad
        Dropout(0.4), # Reduce el sobreajuste al apagar aleatoriamente el 40% de las unidades.
        Dense(32, activation="relu"),
        Dense(1, activation="linear")  # Salida de regresión
    ])
    
    model.compile(
        optimizer=optimizer,
        loss='mean_squared_error',
        metrics=[tf.keras.metrics.RootMeanSquaredError()]
    )
    
    return model

In [9]:
#The Best train:  3.0674813522442883
# {'tfidf__use_idf': True, 'tfidf__sublinear_tf': True, 'tfidf__stop_words': ['english'], 
#           'tfidf__ngram_range': (1, 2), 'tfidf__min_df': 3, 'tfidf__max_features': 3000, 'tfidf__max_df': 0.75, 
#           'tfidf__analyzer': 'word', 'svd__random_state': 42, 'svd__n_components': 100, 'keras_regressor__optimizer_name': 'SGD', 
#           'keras_regressor__learning_rate': 0.01, 'keras_regressor__epochs': 10, 'keras_regressor__batch_size': 256}


#Creo el modelo y lo entreno
keras_regressor = KerasRegressor(
                    model=build_model,
                    input_dim=100,
                    learning_rate=0.01,
                    optimizer_name="SGD",
                    epochs=10,
                    batch_size=256,
                    verbose=0,
                    callbacks=[early_stopping]
                )

svd = TruncatedSVD(
                    random_state=42,
                    n_components=100
                  )

tensorFlow_keras_model = make_pipeline(TfidfVectorizer(
                            analyzer="word",
                            stop_words=['english'],
                            max_features=3000,
                            ngram_range=(1, 2),
                            min_df=3,
                            max_df=0.75,
                            use_idf=True,
                            sublinear_tf=True,
                        preprocessor=lemmatizationAndStopwords), svd, StandardScaler() ,keras_regressor)

#Entrenamos el modelo
#TensorFlow prefiere trabajar con tensores o matrices NumPy
user_stories_x = user_stories_x.to_numpy() if hasattr(user_stories_x, "to_numpy") else user_stories_x
user_stories_y = user_stories_y.to_numpy() if hasattr(user_stories_y, "to_numpy") else user_stories_y
tensorFlow_keras_model = tensorFlow_keras_model.fit(user_stories_x, user_stories_y)

2024-12-01 22:43:21.415482: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [10]:
#Realizamos una predicción sobre el set de test
y_pred = tensorFlow_keras_model.predict(user_stories_test_x)
#Valores Predichos
y_pred

array([5.2147484, 2.3463418, 2.3420427, ..., 2.3420427, 8.009494 ,
       2.3420427], dtype=float32)

In [11]:
y_pred.shape

(1975,)

###Generamos el dataset de predicción para submitir a kaggle

In [12]:
# genero el dataset id - storypoint:
submition = pd.DataFrame({'id': user_stories_test['id'], 'storypoint': y_pred})
print(submition.shape)
submition

(1975, 2)


Unnamed: 0,id,storypoint
0,3433,5.214748
1,106,2.346342
2,7182,2.342043
3,8985,3.569724
4,2149,2.342043
...,...,...
1970,9069,3.408068
1971,3100,2.595372
1972,6648,2.342043
1973,6076,8.009494


In [13]:
submition.shape

(1975, 2)

In [14]:
submition.to_csv('tensorFlow_keras_2_submit.csv', index=False)