In [1]:
import pandas as pd
import numpy as np
import joblib

#Visualización
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns

#Métricas
import sklearn as sk
from sklearn import tree
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import confusion_matrix, classification_report, make_scorer

# Warning configuration: silence every FutureWarning and UserWarning raised
# from here on, so messages in those categories never appear in cell outputs.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

#Regressors
import tensorflow as tf
from tensorflow import keras
from scikeras.wrappers import KerasRegressor

from keras import backend as K

#Análisis de Sentimientos
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

2024-11-29 15:06:30.479942: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-29 15:06:30.483033: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-29 15:06:30.492681: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1732903590.509133  489515 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1732903590.514082  489515 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-29 15:06:30.530401: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU ins

In [2]:
# Load the train and test splits. low_memory=False makes pandas parse each file
# in one pass rather than in chunks, avoiding mixed-dtype column inference.
user_stories_train, user_stories_test = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in ('data/train.csv', 'data/test.csv')
)

In [3]:
# Class-balance check: share of each story-point value, expressed as a percentage.
user_stories_train['storypoint'].value_counts(normalize=True).mul(100)

storypoint
3     23.392405
5     21.430380
1     20.620253
2     16.151899
8     12.974684
4      2.101266
13     1.949367
10     0.405063
20     0.392405
6      0.215190
16     0.088608
12     0.075949
40     0.063291
21     0.037975
15     0.025316
7      0.025316
32     0.012658
34     0.012658
14     0.012658
24     0.012658
Name: proportion, dtype: float64

In [4]:
# Model input: the free-text 'description' column of each split (copied so the
# source frames are never modified through these series).
user_stories_x, user_stories_test_x = (
    frame['description'].copy()
    for frame in (user_stories_train, user_stories_test)
)

# Regression target: the 'storypoint' column of the training split.
user_stories_y = user_stories_train['storypoint'].copy()

In [5]:
import nltk
# Fetch every NLTK resource the text preprocessing relies on.
# Recent NLTK releases make word_tokenize look up 'punkt_tab' instead of the
# older pickled 'punkt' models, so both are requested to keep this notebook
# runnable on a fresh environment regardless of the installed NLTK version.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package punkt to
[nltk_data]     /home/users/pablo.prieto/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/users/pablo.prieto/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/users/pablo.prieto/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Lemmatizer and stop-word set are built once, on first use, and then shared by
# every call: the vectorizer invokes the preprocessor once per document, so
# rebuilding them each time is pure overhead.
_NLP_RESOURCES = {}


def _get_nlp_resources():
    """Return the cached ``(lemmatizer, stop-word set)`` pair, creating it on first use."""
    if not _NLP_RESOURCES:
        _NLP_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        _NLP_RESOURCES['stopwords'] = frozenset(stopwords.words('english'))
    return _NLP_RESOURCES['lemmatizer'], _NLP_RESOURCES['stopwords']


def lemmatizationAndStopwords(text):
    """Normalize a document: tokenize, drop noise tokens, lemmatize, re-join.

    Tokens are kept only when they are purely alphabetic and their lowercase
    form is not an English stop word; each kept token is lowercased and
    lemmatized with WordNet.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        Space-separated lemmas, or the placeholder ``"empty"`` when no token
        survives the filtering or when ``text`` is not a string (e.g. a
        missing value), so callers always receive a non-empty document.
    """
    # Missing / non-text values cannot be tokenized; map them to the same
    # placeholder used for documents that end up with no usable tokens.
    if not isinstance(text, str):
        return "empty"

    lemmatizer, stopword_set = _get_nlp_resources()

    lemmas = []
    for token in word_tokenize(text):
        lowered = token.lower()
        if token.isalpha() and lowered not in stopword_set:
            lemmas.append(lemmatizer.lemmatize(lowered))

    if not lemmas:
        return "empty"
    return ' '.join(lemmas)

In [7]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, GlobalAveragePooling1D
from tensorflow.keras.optimizers import Adam

# NOTE(review): neither constant is referenced by any cell visible in this
# notebook; presumably leftovers from a TextVectorization-based variant —
# confirm nothing else depends on them before removing.
max_tokens = 20000
sequence_length = 100

def build_model(input_dim, dropout_rate, dense_units, learning_rate, hidden_units=128):
    """Build and compile a small feed-forward network for regression.

    Architecture: Input -> Dense(hidden_units, relu) -> Dropout ->
    Dense(dense_units, relu) -> Dense(1, linear).

    Parameters
    ----------
    input_dim : int
        Number of features in each input row.
    dropout_rate : float
        Dropout fraction applied after the first hidden layer.
    dense_units : int
        Width of the second hidden layer.
    learning_rate : float
        Learning rate of the Adam optimizer.
    hidden_units : int, default 128
        Width of the first hidden layer (previously hard-coded to 128).

    Returns
    -------
    A compiled ``Sequential`` model trained with mean-squared-error loss and
    reporting root-mean-squared error as a metric.
    """
    model = Sequential([
        # Declare the input shape with an explicit Input object; passing
        # `input_dim` to the first Dense layer is deprecated in Keras 3.
        tf.keras.Input(shape=(input_dim,)),
        Dense(hidden_units, activation="relu"),
        Dropout(dropout_rate),
        Dense(dense_units, activation="relu"),
        # Single linear unit: unbounded real-valued prediction.
        Dense(1, activation="linear"),
    ])
    model.compile(optimizer=Adam(learning_rate=learning_rate),
                  loss='mean_squared_error',
                  metrics=['RootMeanSquaredError'])
    return model
    

In [8]:
# Best configuration reported by the earlier hyper-parameter search
# (best training score: 2.8201966753545467):
#  {'tfidf__use_idf': False, 'tfidf__sublinear_tf': False, 'tfidf__stop_words': ['english'], 'tfidf__ngram_range': (1, 2),
#   'tfidf__min_df': 2, 'tfidf__max_features': 1000, 'tfidf__max_df': 0.8, 'tfidf__analyzer': 'word', 'svd__random_state': 42,
#   'svd__n_components': 100, 'keras_regressor__learning_rate': 0.001, 'keras_regressor__epochs': 10,
#   'keras_regressor__dropout_rate': 0.4, 'keras_regressor__dense_units': 32, 'keras_regressor__batch_size': 16}

# The SVD output width feeds the network directly, so both must use the same
# value; keeping it in one constant prevents the two from drifting apart.
SVD_COMPONENTS = 100
RANDOM_SEED = 42

# Neural-network regressor. random_state seeds weight initialisation, dropout
# and batch shuffling so that re-running the notebook reproduces the model.
keras_regressor = KerasRegressor(
    model=build_model,
    input_dim=SVD_COMPONENTS,
    dropout_rate=0.4,
    dense_units=32,
    learning_rate=0.001,
    epochs=10,
    batch_size=16,
    verbose=0,
    random_state=RANDOM_SEED,
)

# Dimensionality reduction of the sparse term matrix.
svd = TruncatedSVD(
    random_state=RANDOM_SEED,
    n_components=SVD_COMPONENTS,
)

# Text vectorizer (term-frequency only, since use_idf=False).
# NOTE(review): stop_words=['english'] is a one-element list, so it filters
# only the literal token "english"; the built-in English list would be the
# string 'english'. It is kept as-is because this exact value belongs to the
# tuned configuration above, and lemmatizationAndStopwords already removes
# NLTK stop words — confirm whether the string form was intended.
tfidf = TfidfVectorizer(
    analyzer="word",
    stop_words=['english'],
    max_features=1000,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.8,
    use_idf=False,
    sublinear_tf=False,
    preprocessor=lemmatizationAndStopwords,
)

# Full pipeline: vectorize -> reduce -> regress.
tensorFlow_keras_model = make_pipeline(tfidf, svd, keras_regressor)

# Train on the complete training set (fit returns the fitted pipeline itself).
tensorFlow_keras_model = tensorFlow_keras_model.fit(user_stories_x, user_stories_y)

2024-11-29 15:06:41.812496: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [9]:
# Run the fitted pipeline on the test-set descriptions
y_pred = tensorFlow_keras_model.predict(user_stories_test_x)
# Predicted values (continuous regression outputs, not rounded)
y_pred

array([5.9444466, 2.0867465, 1.8616592, ..., 2.4857378, 7.7052927,
       2.862949 ], dtype=float32)

In [10]:
# Sanity check: shape of the prediction array
y_pred.shape

(1975,)

### Generamos el dataset de predicción para enviar a Kaggle

In [11]:
# Build the submission frame: each test id paired with its predicted storypoint.
submition = user_stories_test[['id']].assign(storypoint=y_pred)
print(submition.shape)
submition

(1975, 2)


Unnamed: 0,id,storypoint
0,3433,5.944447
1,106,2.086746
2,7182,1.861659
3,8985,1.681114
4,2149,1.852773
...,...,...
1970,9069,3.386106
1971,3100,1.671859
1972,6648,2.485738
1973,6076,7.705293


In [12]:
# Sanity check: (rows, columns) of the submission frame
submition.shape

(1975, 2)

In [13]:
# Write the submission file; index=False keeps the row index out of the CSV
submition.to_csv('tensorFlow_keras_submit.csv', index=False)