In [1]:
import pandas as pd
import numpy as np
import joblib

#Visualización
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns

#Métricas
import sklearn as sk
from sklearn import tree
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import confusion_matrix, classification_report, make_scorer

# Warning configuration: globally ignore FutureWarning and UserWarning messages
# so they do not clutter the cell outputs.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

#Regressors
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold, KFold

#Vectorización de texto y reducción de dimensionalidad
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

In [2]:
# Read the training and test CSV files (same reader options for both).
user_stories_train, user_stories_test = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in ('data/train.csv', 'data/test.csv')
)

In [3]:
# Check class balance: share of training rows per 'storypoint' value, in percent.
user_stories_train['storypoint'].value_counts(normalize=True).mul(100)

storypoint
3     23.392405
5     21.430380
1     20.620253
2     16.151899
8     12.974684
4      2.101266
13     1.949367
10     0.405063
20     0.392405
6      0.215190
16     0.088608
12     0.075949
40     0.063291
21     0.037975
15     0.025316
7      0.025316
32     0.012658
34     0.012658
14     0.012658
24     0.012658
Name: proportion, dtype: float64

In [4]:
# Target variable: the 'storypoint' column of the training frame.
user_stories_y = user_stories_train['storypoint'].copy()

# Model input: the 'description' column, copied from the training frame and
# from the test frame so the originals are left untouched.
user_stories_x = user_stories_train['description'].copy()
user_stories_test_x = user_stories_test['description'].copy()

In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Lazily-built NLTK helpers shared by every call to lemmatizationAndStopwords.
# The function is used as a per-document preprocessor, so rebuilding the
# lemmatizer and re-reading the stopword list on each call is wasted work.
_LEMMATIZATION_RESOURCES = {}


def _lemmatization_resources():
    """Return the shared ``(lemmatizer, stopword_set)`` pair, creating it on first use."""
    if not _LEMMATIZATION_RESOURCES:
        _LEMMATIZATION_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        _LEMMATIZATION_RESOURCES['stopwords'] = frozenset(stopwords.words('english'))
    return (_LEMMATIZATION_RESOURCES['lemmatizer'],
            _LEMMATIZATION_RESOURCES['stopwords'])


def lemmatizationAndStopwords(text):
    """Tokenize ``text``, drop stopwords and non-alphabetic tokens, and lemmatize.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The lower-cased, lemmatized tokens that survived filtering, joined
        with single spaces (an empty string when nothing survives).

    Notes
    -----
    Tokens are kept only if ``str.isalpha()`` is true for them and their
    lower-cased form is not in the NLTK English stopword list.
    NOTE(review): the NLTK tokenizer, stopword and WordNet data are presumably
    installed beforehand; this notebook does not download them.
    """
    lemmatizer, stopword_set = _lemmatization_resources()
    kept_tokens = []
    for token in word_tokenize(text):
        lowered = token.lower()  # computed once, reused for the filter and the lemma
        if token.isalpha() and lowered not in stopword_set:
            kept_tokens.append(lemmatizer.lemmatize(lowered))
    return ' '.join(kept_tokens)

In [6]:
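# Best score found by the hyper-parameter search: 2.9470424378844795
# (the metric is not shown in this notebook; presumably RMSE). Parameters of that run: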
# {'tfidf__use_idf': False,
#  'tfidf__sublinear_tf': False,
#  'tfidf__stop_words': ['english'],
#  'tfidf__ngram_range': (1, 2),
#  'tfidf__min_df': 2,
#  'tfidf__max_features': 2000,
#  'tfidf__max_df': 0.8,
#  'tfidf__analyzer': 'word',
#  'svd__random_state': 42,
#  'svd__n_components': 100,
#  'stacking__xgb_regressor__random_state': 42,
#  'stacking__xgb_regressor__objective': 'reg:squarederror',
#  'stacking__xgb_regressor__n_estimators': 100,
#  'stacking__xgb_regressor__max_depth': 3,
#  'stacking__xgb_regressor__learning_rate': 0.01,
#  'stacking__rnd_regressor__random_state': 42,
#  'stacking__rnd_regressor__n_jobs': -1,
#  'stacking__rnd_regressor__n_estimators': 300,
#  'stacking__rnd_regressor__min_samples_split': 2,
#  'stacking__rnd_regressor__min_samples_leaf': 1,
#  'stacking__rnd_regressor__max_features': 'sqrt',
#  'stacking__rnd_regressor__max_depth': 7,
#  'stacking__rnd_regressor__criterion': 'poisson',
#  'stacking__rnd_regressor__bootstrap': True,
#  'stacking__final_estimator__random_state': 42,
#  'stacking__final_estimator__n_estimators': 200,
#  'stacking__final_estimator__max_depth': 4,
#  'stacking__elasticnet__random_state': 42,
#  'stacking__elasticnet__l1_ratio': 0.1,
#  'stacking__elasticnet__alpha': 1.0}

# Base learners of the stacking ensemble and the dimensionality reducer that
# feeds them. All of them are seeded with 42.

# Gradient-boosted trees (XGBoost) with squared-error objective.
xgb_regressor = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    learning_rate=0.01,
    max_depth=3,
    colsample_bytree=1,
    random_state=42,
)

# Random forest using the Poisson split criterion.
# NOTE(review): the recorded best-parameter set above lists n_estimators=300,
# while 200 is used here; confirm which one is intended.
rnd_regressor = RandomForestRegressor(
    n_estimators=200,
    criterion="poisson",
    max_depth=7,
    max_features='sqrt',
    min_samples_split=2,
    min_samples_leaf=1,
    bootstrap=True,
    n_jobs=-1,
    random_state=42,
)

# Linear model with combined L1/L2 regularization.
elasticnet = ElasticNet(alpha=1.0, l1_ratio=0.1, random_state=42)

# Truncated SVD that reduces the vectorized text to 100 components.
svd = TruncatedSVD(n_components=100, random_state=42)

# Text vectorizer: word-level term frequencies (IDF disabled) over text that
# has first been cleaned by lemmatizationAndStopwords.
# NOTE(review): stop_words=['english'] is a one-element list, so only the
# literal token "english" is filtered; the built-in scikit-learn list would be
# the string 'english'. It matches the recorded search result, so it is kept.
# NOTE(review): the recorded best-parameter set lists ngram_range=(1, 2),
# while (1, 1) is used here; confirm which one is intended.
tfidf_vectorizer = TfidfVectorizer(
    preprocessor=lemmatizationAndStopwords,
    analyzer="word",
    stop_words=['english'],
    ngram_range=(1, 1),
    max_features=2000,
    min_df=2,
    max_df=0.8,
    use_idf=False,
    sublinear_tf=False,
)

# Meta-learner trained on the base learners' predictions.
stacking_final_estimator = GradientBoostingRegressor(
    n_estimators=200,
    max_depth=4,
    random_state=42,
)

# Ensemble of the three base learners combined by the meta-learner.
stacking_regressor = StackingRegressor(
    estimators=[
        ('xgb_regressor', xgb_regressor),
        ('rnd_regressor', rnd_regressor),
        ('elasticnet', elasticnet),
    ],
    final_estimator=stacking_final_estimator,
)

# Full pipeline: vectorize text -> reduce with SVD -> stacked regression.
stacking_model = make_pipeline(tfidf_vectorizer, svd, stacking_regressor)

# Train the whole pipeline on the training descriptions and their story points.
# The result is assigned back so the fitted pipeline is not echoed as cell output.
stacking_model = stacking_model.fit(user_stories_x, user_stories_y)

In [7]:
# Predict story points for the test-set descriptions with the fitted pipeline
y_pred = stacking_model.predict(user_stories_test_x)
# Display the predicted values
y_pred

array([5.32280993, 2.63482475, 2.5608731 , ..., 2.48607716, 6.79887887,
       3.01796721])

In [8]:
# Shape of the prediction array
y_pred.shape

(1975,)

### Generamos el dataset de predicción para enviar a Kaggle

In [9]:
# Build the submission frame: the test-set 'id' column next to its predicted
# 'storypoint' value.
submition = user_stories_test[['id']].assign(storypoint=y_pred)
print(submition.shape)
submition

(1975, 2)


Unnamed: 0,id,storypoint
0,3433,5.322810
1,106,2.634825
2,7182,2.560873
3,8985,2.434808
4,2149,2.753377
...,...,...
1970,9069,3.833632
1971,3100,2.671723
1972,6648,2.486077
1973,6076,6.798879


In [10]:
# Shape of the submission frame (rows, columns)
submition.shape

(1975, 2)

In [11]:
# Write the submission to CSV without the DataFrame index column
submition.to_csv('stacking_submit.csv', index=False)