In [1]:
import pandas as pd
import numpy as np
import joblib

#Visualización
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns

#Métricas
import sklearn as sk
from sklearn import tree
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import confusion_matrix, classification_report, make_scorer

#Configuración Warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

#Regressors
from xgboost import XGBRegressor

from keras import backend as K
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold, KFold

#Análisis de Sentimientos
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

2024-11-12 18:03:47.623080: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-12 18:03:47.626179: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-12 18:03:47.636167: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1731445427.653091   14365 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1731445427.658081   14365 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-12 18:03:47.674639: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU ins

In [2]:
user_stories_train = pd.read_csv('data/train.csv', low_memory=False)
user_stories_test = pd.read_csv('data/test.csv', low_memory=False)

In [3]:
#Verifico balanceo de clases
user_stories_train['storypoint'].value_counts(normalize=True)*100

storypoint
3     23.392405
5     21.430380
1     20.620253
2     16.151899
8     12.974684
4      2.101266
13     1.949367
10     0.405063
20     0.392405
6      0.215190
16     0.088608
12     0.075949
40     0.063291
21     0.037975
15     0.025316
7      0.025316
32     0.012658
34     0.012658
14     0.012658
24     0.012658
Name: proportion, dtype: float64

In [4]:
#Creamos un dataset con features a usar para clasificar
user_stories_x = user_stories_train['description'].copy()
user_stories_test_x = user_stories_test['description'].copy()

#Creamos un dataset con la variable target 'storypoint'
user_stories_y = user_stories_train['storypoint'].copy()

In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

def lemmatizationAndStopwords(text):
    wordNetLemmatizer = WordNetLemmatizer()
    stopwordSets = set(stopwords.words('english'))
    words = word_tokenize(text)
    wordsFilter = [wordNetLemmatizer.lemmatize(word.lower()) for word in words if word.lower() not in stopwordSets and word.isalpha()]
    return ' '.join(wordsFilter)

In [6]:
#The Best train: 3.0159159380511493
# {'xgb_regressor__subsample': 0.7, 'xgb_regressor__reg_lambda': 0.1, 'xgb_regressor__reg_alpha': 0, 'xgb_regressor__random_state': 42, 
#  'xgb_regressor__objective': 'reg:squarederror', 'xgb_regressor__n_estimators': 500, 'xgb_regressor__max_depth': 10, 
#  'xgb_regressor__learning_rate': 0.3, 'xgb_regressor__gamma': 0.1, 'xgb_regressor__colsample_bytree': 1, 'tfidf__use_idf': False, 
#  'tfidf__sublinear_tf': True, 'tfidf__stop_words': ['english'], 'tfidf__ngram_range': (1, 1), 'tfidf__min_df': 5, 
#  'tfidf__max_features': 2000, 'tfidf__max_df': 0.8, 'tfidf__analyzer': 'word'} 

#CON SVD: 3.0462914103801038
# {'xgb_regressor__subsample': 0.9, 'xgb_regressor__reg_lambda': 1, 'xgb_regressor__reg_alpha': 1, 'xgb_regressor__random_state': 42, 
#  'xgb_regressor__objective': 'reg:squarederror', 'xgb_regressor__n_estimators': 500, 'xgb_regressor__max_depth': 20, 
#  'xgb_regressor__learning_rate': 0.3, 'xgb_regressor__gamma': 0.5, 'xgb_regressor__colsample_bytree': 1, 'tfidf__use_idf': True, 
#  'tfidf__sublinear_tf': False, 'tfidf__stop_words': ['english'], 'tfidf__ngram_range': (1, 1), 'tfidf__min_df': 2, 
#  'tfidf__max_features': 1000, 'tfidf__max_df': 0.75, 'tfidf__analyzer': 'word', 'svd__random_state': 42, 'svd__n_components': 300} 


#Creo el modelo y lo entreno
xgb_regressor = XGBRegressor(
                       # subsample=0.7,
                       subsample=0.9,
                       # reg_lambda=0.1,
                       reg_lambda=1,
                       # reg_alpha=0,
                        reg_alpha=1,
                       random_state=42,
                       objective='reg:squarederror',
                       n_estimators=500,
                       # max_depth=10,
                       max_depth=20,
                       learning_rate=0.3,
                       # gamma=0.1,
                       gamma=0.5,
                       colsample_bytree=1
                      )

svd = TruncatedSVD(
                    random_state=42,
                    n_components=300
                  )

xgb_model = make_pipeline(TfidfVectorizer(
                            analyzer="word",
                            stop_words=['english'],
                            # max_features=2000,
                            max_features=1000,
                            ngram_range=(1, 1),
                            # min_df=5,
                            min_df=2,
                            # max_df=0.8,
                            max_df=0.75,
                            # use_idf=False,
                            use_idf=True,
                            sublinear_tf=True,
                        preprocessor=lemmatizationAndStopwords), svd, xgb_regressor)

#Entrenamos el modelo
xgb_model = xgb_model.fit(user_stories_x, user_stories_y)

In [7]:
#Realizamos una predicción sobre el set de test
y_pred = xgb_model.predict(user_stories_test_x)
#Valores Predichos
y_pred

array([4.7150974, 1.5404733, 2.338317 , ..., 2.4748843, 5.4455504,
       4.9089794], dtype=float32)

In [8]:
y_pred.shape

(1975,)

###Generamos el dataset de predicción para submitir a kaggle

In [9]:
# genero el dataset id - storypoint:
submition = pd.DataFrame({'id': user_stories_test['id'], 'storypoint': y_pred})
print(submition.shape)
submition

(1975, 2)


Unnamed: 0,id,storypoint
0,3433,4.715097
1,106,1.540473
2,7182,2.338317
3,8985,2.027675
4,2149,3.869226
...,...,...
1970,9069,4.751425
1971,3100,4.067661
1972,6648,2.474884
1973,6076,5.445550


In [10]:
submition.shape

(1975, 2)

In [11]:
submition.to_csv('xgb_lemm_sw_svd_submit.csv', index=False)