In [1]:
import pandas as pd
import numpy as np
import joblib

#Visualización
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns

#Métricas
import sklearn as sk
from sklearn import tree
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import confusion_matrix, classification_report, make_scorer

#Configuración Warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

#Regressors
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, KFold

#Análisis de Sentimientos
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD

In [2]:
# Load the training user stories from the project-relative CSV.
# low_memory=False turns off pandas' chunked parsing, which avoids mixed-dtype inference.
user_stories_train = pd.read_csv('data/train.csv', low_memory=False)

In [3]:
# Preview the first rows of the training set.
user_stories_train.head()

Unnamed: 0,id,title,description,project,storypoint
0,5660,Error enabling Appcelerator services during ap...,"When creating the default app, I encountered t...",project8,3
1,9014,Create a maintenance branch,"As a developer, I'd like to have a maintenance...",project6,5
2,4094,Service Activity Monitoring Backend integrated...,SAM API used by SAM GUI,project1,5
3,811,fs::enter(rootfs) does not work if 'rootfs' is...,I noticed this when I was testing the unified ...,project5,2
4,4459,transform processor with script option is broken,Creating the following stream throws exception...,project6,2


In [4]:
# Inspect the dtype pandas inferred for each column.
user_stories_train.dtypes

id              int64
title          object
description    object
project        object
storypoint      int64
dtype: object

In [5]:
# Report the dataset dimensions: column count first, then row count.
user_stories_train_size = user_stories_train.shape
for size_message, axis in (("Cantidad de columnas: %d", 1),
                           ("Cantidad de filas: %d", 0)):
    print(size_message % user_stories_train_size[axis])

Cantidad de columnas: 5
Cantidad de filas: 7900


In [6]:
# Check for fully duplicated rows by comparing the row count before and after
# dropping duplicates (the original frame is left untouched).
user_stories_train_unique = user_stories_train.drop_duplicates()
user_stories_train_total = user_stories_train.shape[0]
user_stories_train_unique_size = user_stories_train_unique.shape[0]
print(f'Se eliminaron: {user_stories_train_total - user_stories_train_unique_size} filas duplicadas')

Se eliminaron: 0 filas duplicadas


In [7]:
# Count the missing values in each column.
user_stories_train.isna().sum()

id             0
title          0
description    0
project        0
storypoint     0
dtype: int64

In [8]:
# Check how balanced the target is: share of each 'storypoint' value, as a percentage.
user_stories_train['storypoint'].value_counts(normalize=True)*100

storypoint
3     23.392405
5     21.430380
1     20.620253
2     16.151899
8     12.974684
4      2.101266
13     1.949367
10     0.405063
20     0.392405
6      0.215190
16     0.088608
12     0.075949
40     0.063291
21     0.037975
15     0.025316
7      0.025316
32     0.012658
34     0.012658
14     0.012658
24     0.012658
Name: proportion, dtype: float64

### Preparamos el dataset

In [9]:
# Model input: the free-text 'description' column (copied so the source frame is not aliased).
user_stories_x = user_stories_train['description'].copy()

# Target variable: the 'storypoint' column.
user_stories_y = user_stories_train['storypoint'].copy()

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
split_options = {'test_size': 0.3, 'random_state': 2}
x_train, x_test, y_train, y_test = train_test_split(
    user_stories_x, user_stories_y, **split_options
)

In [10]:
# Display the training descriptions produced by the split.
x_train

2646    The plugin will need to contribute the icon. W...
2724    The timer: component is used to generate messa...
5159    h5.Description:  In CLI we have the ability to...
976     We need to refactor the way how we package and...
2187    Standalone Admin currently has no shiny banner...
                              ...                        
3606    h5. Description:  When launching Studio with C...
5704    When a .tss file contains certain grammars, Ti...
6637    The data that is entering a broadcast stream c...
2575    Provide the infrastructure for HTTP GET /compl...
7336    Studio shows an Empty Preferences window when ...
Name: description, Length: 5530, dtype: object

In [11]:
# Display the training targets produced by the split.
y_train

2646    8
2724    2
5159    8
976     2
2187    3
       ..
3606    5
5704    2
6637    2
2575    3
7336    3
Name: storypoint, Length: 5530, dtype: int64

## Ensamble con Stacking

### Cross Validation

In [12]:
import nltk

# NLTK resources used by the text preprocessing in this notebook:
#   - 'punkt' / 'punkt_tab': tokenizer models used by word_tokenize. NLTK >= 3.9
#     loads 'punkt_tab' instead of the pickled 'punkt' models, so both are
#     fetched to keep the notebook runnable on old and new NLTK releases.
#   - 'stopwords': the English stop word list.
#   - 'wordnet': the data behind WordNetLemmatizer.
# nltk.download skips packages that are already up to date.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package punkt to
[nltk_data]     /home/users/pablo.prieto/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/users/pablo.prieto/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/users/pablo.prieto/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [13]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Lazily-built NLTK resources shared by every call to lemmatizationAndStopwords.
# The function runs once per document on every fit, so rebuilding the lemmatizer
# and re-reading the stop word corpus on each call is wasted work.
_LEMMATIZATION_RESOURCES = {}


def _lemmatizationResources():
    """Return the shared (lemmatizer, English stop word set) pair, creating it on first use."""
    if not _LEMMATIZATION_RESOURCES:
        _LEMMATIZATION_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        _LEMMATIZATION_RESOURCES['stopwords'] = frozenset(stopwords.words('english'))
    return _LEMMATIZATION_RESOURCES['lemmatizer'], _LEMMATIZATION_RESOURCES['stopwords']


def lemmatizationAndStopwords(text):
    """Normalise a raw text into a space-separated string of lemmas.

    The text is tokenized with ``word_tokenize``; tokens that are not purely
    alphabetic or whose lowercase form is an English stop word are dropped, and
    the remaining tokens are lowercased and lemmatized.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces, or the literal string
        ``"empty"`` when no token survives the filtering.
    """
    lemmatizer, stopwordSet = _lemmatizationResources()
    # Lowercase each alphabetic token once and reuse it for both the stop word
    # test and the lemmatization.
    loweredTokens = (token.lower() for token in word_tokenize(text) if token.isalpha())
    wordsFilter = [lemmatizer.lemmatize(token) for token in loweredTokens if token not in stopwordSet]
    if not wordsFilter:
        return "empty"
    return ' '.join(wordsFilter)

In [14]:
##KFOLD CV usando Ensamble con Stacking (los mejores atributos, hiperparametros,etc)
from sklearn.pipeline import Pipeline

#The Best: 

# parameters = {
#     "tfidf__analyzer": ["word"],
#     "tfidf__ngram_range": [(1, 1),(1, 2)],
#     "tfidf__use_idf": [True,False],
#     "tfidf__stop_words": [['english']],
#     "tfidf__max_features": [1000, 2000],
#     "tfidf__min_df": [2,5],
#     "tfidf__max_df": [0.75, 0.8],
#     "tfidf__sublinear_tf": [True,False],
#     "svd__n_components": [100, 200, 300, 400, 500],
#     "svd__random_state" :[42],
#     'stacking__xgb_regressor__objective': ['reg:squarederror'],
#     'stacking__xgb_regressor__random_state' :[42], 
#     'stacking__xgb_regressor__n_estimators': [100, 200, 300],
#     'stacking__xgb_regressor__max_depth': [3, 4, 5],
#     'stacking__xgb_regressor__learning_rate': [0.01, 0.1, 0.2],
#     # 'stacking__xgb_regressor__subsample': [0.6, 0.7, 0.8, 0.9, 1],
#     # 'stacking__xgb_regressor__colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1],
#     # 'stacking__xgb_regressor__gamma': [0, 0.1, 0.2, 0.3, 0.4, 0.5],
#     # 'stacking__xgb_regressor__reg_alpha': [0, 0.01, 0.1, 1, 10],
#     # 'stacking__xgb_regressor__reg_lambda': [0, 0.01, 0.1, 1, 10]
#     "stacking__rnd_regressor__criterion":["squared_error", "absolute_error", "friedman_mse", "poisson"],
#     "stacking__rnd_regressor__max_features":['sqrt', 'log2'],
#     "stacking__rnd_regressor__max_depth": [7],
#     "stacking__rnd_regressor__random_state":[42], 
#     "stacking__rnd_regressor__n_jobs":[-1],
#     "stacking__rnd_regressor__min_samples_leaf":[1, 2, 5],
#     "stacking__rnd_regressor__min_samples_split":[2,5,10],
#     "stacking__rnd_regressor__n_estimators":[300, 500],
#     "stacking__rnd_regressor__bootstrap": [True],
#     'stacking__lgbm_regressor__random_state' :[42], 
#     'stacking__lgbm_regressor__force_col_wise' :[True],
#     'stacking__lgbm_regressor__n_estimators': [1000],
#     'stacking__lgbm_regressor__learning_rate': [0.01],
#     'stacking__lgbm_regressor__num_leaves': [50],
#     'stacking__lgbm_regressor__max_depth': [10],
#     'stacking__lgbm_regressor__min_child_samples': [10],
#     'stacking__lgbm_regressor__min_child_weight': [0.1],
#     # 'stacking__lgbm_regressor__min_data_in_leaf': [1],
#     'stacking__lgbm_regressor__max_bin': [255],
#     'stacking__lgbm_regressor__subsample': [0.8],
#     'stacking__lgbm_regressor__colsample_bytree': [0.8],
#     'stacking__lgbm_regressor__reg_lambda': [1.0],
#     'stacking__lgbm_regressor__reg_alpha': [0.1],
#     'stacking__final_estimator__random_state' :[42], 
#     'stacking__final_estimator__alpha': [0.1, 1.0, 10.0],
#     'stacking__final_estimator__l1_ratio': [0.1, 0.5, 0.9],
# }

# Search space for the randomized search, grouped by pipeline step.
# Keys follow scikit-learn's "<step>__<param>" naming for nested estimators.

# Text vectorisation and dimensionality reduction.
text_feature_space = {
    "tfidf__max_features": [1000, 2000],
    "tfidf__ngram_range": [(1, 1), (1, 2)],
    "svd__n_components": [100, 200],
}

# Base learners of the stacking ensemble.
base_learner_space = {
    "stacking__xgb_regressor__n_estimators": [100, 200],
    "stacking__xgb_regressor__max_depth": [3, 5],
    "stacking__rnd_regressor__n_estimators": [300],
    "stacking__rnd_regressor__max_depth": [7],
    "stacking__lgbm_regressor__num_leaves": [50],
    "stacking__lgbm_regressor__learning_rate": [0.01],
}

# Final (meta) estimator of the stacking ensemble.
meta_learner_space = {
    "stacking__final_estimator__alpha": [0.1, 1.0],
}

parameters = {**text_feature_space, **base_learner_space, **meta_learner_space}

# Search object with the lowest cross-validated RMSE seen so far (None until the
# first search finishes).
randomcv_best_score = None

# One seed for the CV splitter, the search sampler and every stochastic
# estimator, so that a re-run reproduces the same scores.
SEED = 42

# Metric to optimise: root mean squared error. greater_is_better=False makes the
# scorer return the NEGATED RMSE, so every score reported by the search is <= 0
# and a HIGHER score means a LOWER error.
scorer_fn = make_scorer(root_mean_squared_error, greater_is_better=False)

# Number of hyperparameter combinations sampled by each randomized search.
n=10

# Try several numbers of CV splits.
# Fold sets tried previously: [9,10,12,15,18], [5,6,7,8,9,10], [4,5,10].
for folds in [5,10]:

    # Shuffled K-fold splitter for this iteration.
    kfoldcv = KFold(n_splits=folds, shuffle=True, random_state=SEED)

    # TF-IDF on the cleaned text -> truncated SVD -> stacking ensemble of three
    # tree-based regressors blended by an ElasticNet meta-learner.
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(preprocessor=lemmatizationAndStopwords)),
        ('svd', TruncatedSVD(random_state=SEED)),
        ('stacking', StackingRegressor(
            estimators=[
                ('xgb_regressor', XGBRegressor(random_state=SEED)),
                ('rnd_regressor', RandomForestRegressor(random_state=SEED)),
                ('lgbm_regressor', LGBMRegressor(force_col_wise=True, random_state=SEED))
            ],
            final_estimator=ElasticNet(random_state=SEED)
        ))
    ])

    # Randomized search over `parameters`. GridSearchCV can be swapped in with
    # param_grid=parameters for an exhaustive search.
    randomcv_it = RandomizedSearchCV(estimator = pipeline,
                                     param_distributions = parameters,
                                     scoring = scorer_fn,
                                     cv = kfoldcv,
                                     n_iter = n,
                                     verbose = 2,
                                     n_jobs = 1,
                                     random_state = SEED)

    # Find the hyperparameters that minimise the cross-validated RMSE.
    randomcv_it.fit(x_train,y_train)

    # Best hyperparameters of the pipeline for this number of folds.
    print("folds: ", folds)
    print(randomcv_it.best_params_)
    # Best metric. NOTE: the printed value is the negated RMSE (see scorer_fn).
    print("root_mean_squared_error: ", randomcv_it.best_score_)
    print(" ")
    print("mean_test_score: ", randomcv_it.cv_results_['mean_test_score'])

    # Keep the search with the lowest RMSE. Scores are negated errors, so the
    # lowest error is the GREATEST best_score_ (closest to zero).
    if randomcv_best_score is None or randomcv_it.best_score_ > randomcv_best_score.best_score_:
        randomcv_best_score = randomcv_it

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[LightGBM] [Info] Total Bins 51000
[LightGBM] [Info] Number of data points in the train set: 4424, number of used features: 200
[LightGBM] [Info] Start training from score 3.857821
[LightGBM] [Info] Total Bins 51000
[LightGBM] [Info] Number of data points in the train set: 3539, number of used features: 200
[LightGBM] [Info] Start training from score 3.885278
[LightGBM] [Info] Total Bins 51000
[LightGBM] [Info] Number of data points in the train set: 3539, number of used features: 200
[LightGBM] [Info] Start training from score 3.852218
[LightGBM] [Info] Total Bins 51000
[LightGBM] [Info] Number of data points in the train set: 3539, number of used features: 200
[LightGBM] [Info] Start training from score 3.866629
[LightGBM] [Info] Total Bins 51000
[LightGBM] [Info] Number of data points in the train set: 3539, number of used features: 200
[LightGBM] [Info] Start training from score 3.839220
[LightGBM] [Info] Total Bins 51000

KeyboardInterrupt: 

In [None]:
# Hyperparameters of the search object kept in randomcv_best_score.
randomcv_best_score.best_params_

THE BEST: 3.0575858141040166

folds:  10
{'tfidf__use_idf': True,
 'tfidf__sublinear_tf': True,
 'tfidf__stop_words': ['english'],
 'tfidf__ngram_range': (1, 2),
 'tfidf__min_df': 2,
 'tfidf__max_features': 2000,
 'tfidf__max_df': 0.75,
 'tfidf__analyzer': 'word',
 'svd__random_state': 42,
 'svd__n_components': 400,
 'stacking__xgb_regressor__random_state': 42,
 'stacking__xgb_regressor__objective': 'reg:squarederror',
 'stacking__xgb_regressor__n_estimators': 300,
 'stacking__xgb_regressor__max_depth': 3,
 'stacking__xgb_regressor__learning_rate': 0.01,
 'stacking__rnd_regressor__random_state': 42,
 'stacking__rnd_regressor__n_jobs': -1,
 'stacking__rnd_regressor__n_estimators': 300,
 'stacking__rnd_regressor__min_samples_split': 2,
 'stacking__rnd_regressor__min_samples_leaf': 1,
 'stacking__rnd_regressor__max_features': 'sqrt',
 'stacking__rnd_regressor__max_depth': 7,
 'stacking__rnd_regressor__criterion': 'squared_error',
 'stacking__rnd_regressor__bootstrap': True,
 'stacking__lgbm_regressor__subsample': 0.8,
 'stacking__lgbm_regressor__random_state': 42,
 'stacking__lgbm_regressor__num_leaves': 50,
 'stacking__lgbm_regressor__n_estimators': 100,
 'stacking__lgbm_regressor__min_child_samples': 5,
 'stacking__lgbm_regressor__max_depth': 3,
 'stacking__lgbm_regressor__learning_rate': 0.2,
 'stacking__lgbm_regressor__colsample_bytree': 0.7,
 'stacking__final_estimator__random_state': 42,
 'stacking__final_estimator__l1_ratio': 0.5,
 'stacking__final_estimator__alpha': 10.0}

In [None]:
# Best mean CV score of the kept search. The scorer is built with
# greater_is_better=False, so this value is the negated RMSE.
randomcv_best_score.best_score_

In [None]:
# Mean CV score of every sampled candidate in the kept search.
randomcv_best_score.cv_results_['mean_test_score']

THE BEST

