# 7506 - Organizacion de Datos - TP N°2

#  Gradient Boosting Classifier

# 1. Librerias


In [1]:
#Paquetes Clasicos
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#Paquetes para Preprocesamiento
import re
import string
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords 


#Vectorizacion
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

#Pipelines
from sklearn.pipeline import Pipeline

#Tuning de Parametros
from sklearn.model_selection import GridSearchCV

#Clasificador
from sklearn.ensemble import GradientBoostingClassifier # Classifier using Gradient Boosting

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# 2. Datos

In [2]:
#Dataset de entranamiento
df_train = pd.read_csv("train.csv")

#Dataset de test
df_test = pd.read_csv("test.csv")

## 2.1 Dataset 'train'

In [3]:
#Observo la constitucion general del Dataset de entrenamiento

df_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
df_train.shape

(7613, 5)

In [5]:
df_train['target'].value_counts()

0    4342
1    3271
Name: target, dtype: int64

## 2.2 Dataset 'test'

In [6]:
#Observo la constitucion general del Dataset de test

df_test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [7]:
df_test.shape

(3263, 4)

# 3. Preprocesamiento

Se definen funciones especificas para realizar el preprocesamiento de los datos:

In [8]:
# Transformar el texto a minuscula

def minuscula(texto):
    return texto.lower()

In [9]:
#Remover URL

def remover_url(texto):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'', texto)

In [10]:
#Remover Usuarios que aparecen (@usuario)

def remover_usuario(text):
    text = re.sub(r"\@[A-Za-z0-9]+", "", text)
    return text

In [11]:
# Remover Emoji

def remover_emoji(texto):
    emoji_patrones = re.compile(
        '['
        u'\U0001F600-\U0001F64F' 
        u'\U0001F300-\U0001F5FF' 
        u'\U0001F680-\U0001F6FF' 
        u'\U0001F1E0-\U0001F1FF' 
        u'\U00002702-\U000027B0'
        u'\U000024C2-\U0001F251'
        ']+',
        flags=re.UNICODE)
    return emoji_patrones.sub(r'', texto)

In [12]:
#Expandir las Abreviaturas

abreviaturas = {
    "$" : " dollar ",
    "€" : " euro ",
    "4ao" : "for adults only",
    "a.m" : "before midday",
    "a3" : "anytime anywhere anyplace",
    "aamof" : "as a matter of fact",
    "acct" : "account",
    "adih" : "another day in hell",
    "afaic" : "as far as i am concerned",
    "afaict" : "as far as i can tell",
    "afaik" : "as far as i know",
    "afair" : "as far as i remember",
    "afk" : "away from keyboard",
    "app" : "application",
    "approx" : "approximately",
    "apps" : "applications",
    "asap" : "as soon as possible",
    "asl" : "age, sex, location",
    "atk" : "at the keyboard",
    "ave." : "avenue",
    "aymm" : "are you my mother",
    "ayor" : "at your own risk", 
    "b&b" : "bed and breakfast",
    "b+b" : "bed and breakfast",
    "b.c" : "before christ",
    "b2b" : "business to business",
    "b2c" : "business to customer",
    "b4" : "before",
    "b4n" : "bye for now",
    "b@u" : "back at you",
    "bae" : "before anyone else",
    "bak" : "back at keyboard",
    "bbbg" : "bye bye be good",
    "bbc" : "british broadcasting corporation",
    "bbias" : "be back in a second",
    "bbl" : "be back later",
    "bbs" : "be back soon",
    "be4" : "before",
    "bfn" : "bye for now",
    "blvd" : "boulevard",
    "bout" : "about",
    "brb" : "be right back",
    "bros" : "brothers",
    "brt" : "be right there",
    "bsaaw" : "big smile and a wink",
    "btw" : "by the way",
    "bwl" : "bursting with laughter",
    "c/o" : "care of",
    "cet" : "central european time",
    "cf" : "compare",
    "cia" : "central intelligence agency",
    "csl" : "can not stop laughing",
    "cu" : "see you",
    "cul8r" : "see you later",
    "cv" : "curriculum vitae",
    "cwot" : "complete waste of time",
    "cya" : "see you",
    "cyt" : "see you tomorrow",
    "dae" : "does anyone else",
    "dbmib" : "do not bother me i am busy",
    "diy" : "do it yourself",
    "dm" : "direct message",
    "dwh" : "during work hours",
    "e123" : "easy as one two three",
    "eet" : "eastern european time",
    "eg" : "example",
    "embm" : "early morning business meeting",
    "encl" : "enclosed",
    "encl." : "enclosed",
    "etc" : "and so on",
    "faq" : "frequently asked questions",
    "fawc" : "for anyone who cares",
    "fb" : "facebook",
    "fc" : "fingers crossed",
    "fig" : "figure",
    "fimh" : "forever in my heart", 
    "ft." : "feet",
    "ft" : "featuring",
    "ftl" : "for the loss",
    "ftw" : "for the win",
    "fwiw" : "for what it is worth",
    "fyi" : "for your information",
    "g9" : "genius",
    "gahoy" : "get a hold of yourself",
    "gal" : "get a life",
    "gcse" : "general certificate of secondary education",
    "gfn" : "gone for now",
    "gg" : "good game",
    "gl" : "good luck",
    "glhf" : "good luck have fun",
    "gmt" : "greenwich mean time",
    "gmta" : "great minds think alike",
    "gn" : "good night",
    "g.o.a.t" : "greatest of all time",
    "goat" : "greatest of all time",
    "goi" : "get over it",
    "gps" : "global positioning system",
    "gr8" : "great",
    "gratz" : "congratulations",
    "gyal" : "girl",
    "h&c" : "hot and cold",
    "hp" : "horsepower",
    "hr" : "hour",
    "hrh" : "his royal highness",
    "ht" : "height",
    "ibrb" : "i will be right back",
    "ic" : "i see",
    "icq" : "i seek you",
    "icymi" : "in case you missed it",
    "idc" : "i do not care",
    "idgadf" : "i do not give a damn fuck",
    "idgaf" : "i do not give a fuck",
    "idk" : "i do not know",
    "ie" : "that is",
    "i.e" : "that is",
    "ifyp" : "i feel your pain",
    "IG" : "instagram",
    "iirc" : "if i remember correctly",
    "ilu" : "i love you",
    "ily" : "i love you",
    "imho" : "in my humble opinion",
    "imo" : "in my opinion",
    "imu" : "i miss you",
    "iow" : "in other words",
    "irl" : "in real life",
    "j4f" : "just for fun",
    "jic" : "just in case",
    "jk" : "just kidding",
    "jsyk" : "just so you know",
    "l8r" : "later",
    "lb" : "pound",
    "lbs" : "pounds",
    "ldr" : "long distance relationship",
    "lmao" : "laugh my ass off",
    "lmfao" : "laugh my fucking ass off",
    "lol" : "laughing out loud",
    "ltd" : "limited",
    "ltns" : "long time no see",
    "m8" : "mate",
    "mf" : "motherfucker",
    "mfs" : "motherfuckers",
    "mfw" : "my face when",
    "mofo" : "motherfucker",
    "mph" : "miles per hour",
    "mr" : "mister",
    "mrw" : "my reaction when",
    "ms" : "miss",
    "mte" : "my thoughts exactly",
    "nagi" : "not a good idea",
    "nbc" : "national broadcasting company",
    "nbd" : "not big deal",
    "nfs" : "not for sale",
    "ngl" : "not going to lie",
    "nhs" : "national health service",
    "nrn" : "no reply necessary",
    "nsfl" : "not safe for life",
    "nsfw" : "not safe for work",
    "nth" : "nice to have",
    "nvr" : "never",
    "nyc" : "new york city",
    "oc" : "original content",
    "og" : "original",
    "ohp" : "overhead projector",
    "oic" : "oh i see",
    "omdb" : "over my dead body",
    "omg" : "oh my god",
    "omw" : "on my way",
    "p.a" : "per annum",
    "p.m" : "after midday",
    "pm" : "prime minister",
    "poc" : "people of color",
    "pov" : "point of view",
    "pp" : "pages",
    "ppl" : "people",
    "prw" : "parents are watching",
    "ps" : "postscript",
    "pt" : "point",
    "ptb" : "please text back",
    "pto" : "please turn over",
    "qpsa" : "what happens", 
    "ratchet" : "rude",
    "rbtl" : "read between the lines",
    "rlrt" : "real life retweet", 
    "rofl" : "rolling on the floor laughing",
    "roflol" : "rolling on the floor laughing out loud",
    "rotflmao" : "rolling on the floor laughing my ass off",
    "rt" : "retweet",
    "ruok" : "are you ok",
    "sfw" : "safe for work",
    "sk8" : "skate",
    "smh" : "shake my head",
    "sq" : "square",
    "srsly" : "seriously", 
    "ssdd" : "same stuff different day",
    "tbh" : "to be honest",
    "tbs" : "tablespooful",
    "tbsp" : "tablespooful",
    "tfw" : "that feeling when",
    "thks" : "thank you",
    "tho" : "though",
    "thx" : "thank you",
    "tia" : "thanks in advance",
    "til" : "today i learned",
    "tl;dr" : "too long i did not read",
    "tldr" : "too long i did not read",
    "tmb" : "tweet me back",
    "tntl" : "trying not to laugh",
    "ttyl" : "talk to you later",
    "u" : "you",
    "u2" : "you too",
    "u4e" : "yours for ever",
    "utc" : "coordinated universal time",
    "w/" : "with",
    "w/o" : "without",
    "w8" : "wait",
    "wassup" : "what is up",
    "wb" : "welcome back",
    "wtf" : "what the fuck",
    "wtg" : "way to go",
    "wtpa" : "where the party at",
    "wuf" : "where are you from",
    "wuzup" : "what is up",
    "wywh" : "wish you were here",
    "yd" : "yard",
    "ygtr" : "you got that right",
    "ynk" : "you never know",
    "zzz" : "sleeping bored and tired"
}

def expandir_abreviatura(texto,mapping = abreviaturas):
    texto = ' '.join([mapping[t] if t in mapping else t for t in texto.split(" ")])
    return texto

In [13]:
#Expandir las Contracciones

contracciones_mapeo = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", 
                       "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", 
                       "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", 
                       "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", 
                       "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have",
                       "mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", 
                       "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", 
                       "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have",
                       "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", 
                       "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", 
                       "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", 
                       "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", 
                       "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", 
                       "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", 
                       "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", 
                       "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", 
                       "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have" }

def expandir_contraccion(texto,mapping = contracciones_mapeo):
    specials =["’", "‘", "´", "`"]
    for s in specials:
        texto = texto.replace(s,"'")
    
    texto = ' '.join([mapping[t] if t in mapping else t for t in texto.split(" ")])
    return texto

In [14]:
#Remover Caracteres Especiales

def remover_caracter_especial(texto):
    pat = r'[^a-zA-z0-9.,!?/:;\"\'\s]' 
    return re.sub(pat, '', texto)

In [15]:
# Remover los tags HTML

def remover_tag_html(texto):
    html = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    return re.sub(html, '', texto)

In [16]:
#Remover acentos

def remover_acento(texto):
    import unicodedata
    texto = unicodedata.normalize('NFKD', texto).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return texto

In [17]:
#Remover Puntos

def remover_punto(texto):
    import string
    texto = ''.join([c for c in texto if c not in string.punctuation])
    return texto

In [18]:
#Remover Numeros

def remover_numero(texto):
    texto = ''.join([i for i in texto if not i.isdigit()])
    return texto

In [19]:
#Remover espacios en Blanco (extras/tabs)

def remover_espacio_extra(texto):
    import re
    pattern = r'^\s*|\s\s*'
    return re.sub(pattern, ' ', texto).strip()

In [20]:
#Remover Stop-Word

def remover_stop_word(texto):
    return " ".join ([word for word in word_tokenize(texto) if not word in stopwords.words('english')])

In [21]:
#Lematizar

def lematizar(texto):
    lemma = WordNetLemmatizer()
    return " ".join([lemma.lemmatize(word) for word in word_tokenize(texto)])

## 3.1 Preprocesamiento del Dataset 'train' 

Realizo el preprocesamiento de la comlumna **'text'** del dataset **'train'**

In [22]:
df_train['text_preprocesado'] = df_train['text'].apply(lambda x: minuscula(x))
df_train['text_preprocesado'] = df_train['text_preprocesado'].apply(lambda x: remover_url(x))
df_train['text_preprocesado'] = df_train['text_preprocesado'].apply(lambda x: remover_usuario(x))
df_train['text_preprocesado'] = df_train['text_preprocesado'].apply(lambda x: remover_emoji(x))
df_train['text_preprocesado'] = df_train['text_preprocesado'].apply(lambda x: expandir_abreviatura(x))
df_train['text_preprocesado'] = df_train['text_preprocesado'].apply(lambda x: expandir_contraccion(x))
df_train['text_preprocesado'] = df_train['text_preprocesado'].apply(lambda x: remover_caracter_especial(x))
df_train['text_preprocesado'] = df_train['text_preprocesado'].apply(lambda x: remover_tag_html(x))
df_train['text_preprocesado'] = df_train['text_preprocesado'].apply(lambda x: remover_acento(x))
df_train['text_preprocesado'] = df_train['text_preprocesado'].apply(lambda x: remover_punto(x))
df_train['text_preprocesado'] = df_train['text_preprocesado'].apply(lambda x: remover_numero(x))
df_train['text_preprocesado'] = df_train['text_preprocesado'].apply(lambda x: remover_espacio_extra(x))
df_train['text_preprocesado'] = df_train['text_preprocesado'].apply(lambda x: remover_stop_word(x))
df_train['text_preprocesado'] = df_train['text_preprocesado'].apply(lambda x: lematizar(x))

In [23]:
#Observo las 5 primeras filas

df_train.head()

Unnamed: 0,id,keyword,location,text,target,text_preprocesado
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,deed reason earthquake may allah forgive u
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,resident asked shelter place notified officer ...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfire evacuation order calif...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,got sent photo ruby alaska smoke wildfire pour...


In [24]:
df_train.shape

(7613, 6)

In [25]:
x_train_original = df_train['text']

In [26]:
x_train_preprocesado = df_train['text_preprocesado']

In [27]:
y_train = df_train['target']

## 3.2 Preprocesamiento del Dataset 'test' 

Realizo el preprocesamiento de la comlumna **'text'** del dataset **'test'**

In [28]:
df_test['text_preprocesado'] = df_test['text'].apply(lambda x: minuscula(x))
df_test['text_preprocesado'] = df_test['text_preprocesado'].apply(lambda x: remover_url(x))
df_test['text_preprocesado'] = df_test['text_preprocesado'].apply(lambda x: remover_usuario(x))
df_test['text_preprocesado'] = df_test['text_preprocesado'].apply(lambda x: remover_emoji(x))
df_test['text_preprocesado'] = df_test['text_preprocesado'].apply(lambda x: expandir_abreviatura(x))
df_test['text_preprocesado'] = df_test['text_preprocesado'].apply(lambda x: expandir_contraccion(x))
df_test['text_preprocesado'] = df_test['text_preprocesado'].apply(lambda x: remover_caracter_especial(x))
df_test['text_preprocesado'] = df_test['text_preprocesado'].apply(lambda x: remover_tag_html(x))
df_test['text_preprocesado'] = df_test['text_preprocesado'].apply(lambda x: remover_acento(x))
df_test['text_preprocesado'] = df_test['text_preprocesado'].apply(lambda x: remover_punto(x))
df_test['text_preprocesado'] = df_test['text_preprocesado'].apply(lambda x: remover_numero(x))
df_test['text_preprocesado'] = df_test['text_preprocesado'].apply(lambda x: remover_espacio_extra(x))
df_test['text_preprocesado'] = df_test['text_preprocesado'].apply(lambda x: remover_stop_word(x))
df_test['text_preprocesado'] = df_test['text_preprocesado'].apply(lambda x: lematizar(x))

In [29]:
#Observo las 5 primeras filas

df_test.head()

Unnamed: 0,id,keyword,location,text,text_preprocesado
0,0,,,Just happened a terrible car crash,happened terrible car crash
1,2,,,"Heard about #earthquake is different cities, s...",heard earthquake different city stay safe ever...
2,3,,,"there is a forest fire at spot pond, geese are...",forest fire spot pond goose fleeing across str...
3,9,,,Apocalypse lighting. #Spokane #wildfires,apocalypse lighting spokane wildfire
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,typhoon soudelor kill china taiwan


In [30]:
df_test.shape

(3263, 5)

In [31]:
x_test_original = df_test['text']

In [32]:
x_test_preprocesado = df_test['text_preprocesado']

# 4. Vectorizacion

## 4.1 Bags of words (BOW)

### 4.1.1 Texto Original 

In [33]:
#Realizo la llamada a la clase CountVectorizer()

vectorizador_bow_original = CountVectorizer()

#### Dataset de entrenamiento

In [34]:
vect_bow_train_original = vectorizador_bow_original.fit_transform(x_train_original)

In [35]:
vect_bow_train_original.shape

(7613, 21637)

#### Dataset de test

In [36]:
vect_bow_test_original = vectorizador_bow_original.transform(x_test_original)

In [37]:
vect_bow_test_original.shape

(3263, 21637)

### 4.1.2 Texto Preprocesado

In [38]:
#Realizo la llamada a la clase CountVectorizer()

vectorizador_bow_preprocesado = CountVectorizer()

#### Dataset de entrenamiento

In [39]:
vect_bow_train_preprocesado = vectorizador_bow_preprocesado.fit_transform(x_train_preprocesado)

In [40]:
vect_bow_train_preprocesado.shape

(7613, 13423)

#### Dataset de test

In [41]:
vect_bow_test_preprocesado = vectorizador_bow_preprocesado.transform(x_test_preprocesado)

In [42]:
vect_bow_test_preprocesado.shape

(3263, 13423)

## 4.2 TF-IDF

### 4.2.1 Texto Original 

In [43]:
#Realizo la llamada a la clase CountVectorizer()

vectorizador_tfidf_original = TfidfVectorizer()

#### Dataset de entrenamiento

In [44]:
vect_tfidf_train_original = vectorizador_tfidf_original.fit_transform(x_train_original)

In [45]:
vect_tfidf_train_original.shape

(7613, 21637)

#### Dataset de test

In [46]:
vect_tfidf_test_original = vectorizador_tfidf_original.transform(x_test_original)

In [47]:
vect_tfidf_test_original.shape

(3263, 21637)

### 4.1.2 Texto Preprocesado

In [48]:
#Realizo la llamada a la clase CountVectorizer()

vectorizador_tfidf_preprocesado = TfidfVectorizer()

#### Dataset de entrenamiento

In [49]:
vect_tfidf_train_preprocesado = vectorizador_tfidf_preprocesado.fit_transform(x_train_preprocesado)

In [50]:
vect_tfidf_train_preprocesado.shape

(7613, 13423)

#### Dataset de test

In [51]:
vect_tfidf_test_preprocesado = vectorizador_tfidf_preprocesado.transform(x_test_preprocesado)

In [52]:
vect_tfidf_test_preprocesado.shape

(3263, 13423)

In [53]:
pipeline_tfidf = Pipeline([('tfidf',TfidfVectorizer())])

# 5. Pipelines

## 5.1 Pipeline BOW

In [54]:
pipeline_bow_gradient_boosting = Pipeline([('vectorizador', CountVectorizer()),('clf', GradientBoostingClassifier())])

## 5.2 Pipeline TFIDF 

In [55]:
pipeline_tfidf_gradient_boosting = Pipeline([('vectorizador',TfidfVectorizer()),('clf', GradientBoostingClassifier())])

# 6 Tuning de Parametros - GridSearch

## 6.1 BOW

### 6.1.1 BOW - Texto sin Preprocesar - Vectorizador y Modelo sin optimizar Hiperparametros

In [56]:
#No se setean variantes a los Hiperparametros

grid_parametros = {}

clf_1 = GridSearchCV(pipeline_bow_gradient_boosting, grid_parametros,cv=5, n_jobs=-1,scoring='f1')
clf_1.fit(x_train_original, y_train)

print("Mejor Score: ", clf_1.best_score_)
print("Mejores Parametros: ", clf_1.best_params_)

Mejor Score:  0.5310176774786736
Mejores Parametros:  {}


Generacion del SUBMIT

Leo el archivo .csv modelo que tenemos se utilizar para realizar el submit a Kaggle

In [57]:
submission = pd.read_csv('sample_submission.csv')

Creo una nueva columna con los valores que predice el modelo

In [58]:
submission['target'] = clf_1.predict(x_test_original)

Guardo el archivo .csv para realizar el submit a Kaggle

In [59]:
submission.to_csv("original_bow_nohiper_gradient_boosting.csv", index=False)

### 6.1.2 BOW - Texto Preprocesado - Vectorizador y Modelo sin optimizar Hiperparametros

In [60]:
#No se setean variantes a los Hiperparametros

grid_parametros = {}

clf_2 = GridSearchCV(pipeline_bow_gradient_boosting, grid_parametros,cv=5, n_jobs=-1,scoring='f1')
clf_2.fit(x_train_preprocesado, y_train)

print("Mejor Score: ", clf_2.best_score_)
print("Mejores Parametros: ", clf_2.best_params_)

Mejor Score:  0.3896248975165339
Mejores Parametros:  {}


Generacion del SUBMIT

Leo el archivo .csv modelo que tenemos se utilizar para realizar el submit a Kaggle

In [61]:
submission = pd.read_csv('sample_submission.csv')

Creo una nueva columna con los valores que predice el modelo

In [62]:
submission['target'] = clf_2.predict(x_test_preprocesado)

Guardo el archivo .csv para realizar el submit a Kaggle

In [63]:
submission.to_csv("preprocesado_bow_nohiper_gradient_boosting.csv", index=False)

### 6.1.3 BOW - Texto sin Preprocesar - Vectorizador y Modelo optimizando Hiperparametros

In [66]:
grid_parametros = {'clf__learning_rate': [0.05,0.1,0.2],
                   'vectorizador__ngram_range': [(1,1),(1,2),(1,3)]}
                   
clf_3 = GridSearchCV(pipeline_bow_gradient_boosting, grid_parametros,cv=5, n_jobs=-1,scoring='f1')
clf_3.fit(x_train_original, y_train)

print("Mejor Score: ", clf_3.best_score_)
print("Mejores Parametros: ", clf_3.best_params_)

Mejor Score:  0.5583521224945058
Mejores Parametros:  {'clf__learning_rate': 0.2, 'vectorizador__ngram_range': (1, 1)}


Generacion del SUBMIT

Leo el archivo .csv modelo que tenemos se utilizar para realizar el submit a Kaggle

In [67]:
submission = pd.read_csv('sample_submission.csv')

Creo una nueva columna con los valores que predice el modelo

In [68]:
submission['target'] = clf_3.predict(x_test_original)

Guardo el archivo .csv para realizar el submit a Kaggle

In [69]:
submission.to_csv("original_bow_hiper_gradient_boosting.csv", index=False)

### 6.1.4 BOW - Texto Preprocesado - Vectorizador y Modelo optimizando Hiperparametros

In [70]:
grid_parametros = {'clf__learning_rate': [0.05,0.1,0.2],
                   'vectorizador__ngram_range': [(1,1),(1,2),(1,3)]}
                   
clf_4 = GridSearchCV(pipeline_bow_gradient_boosting, grid_parametros,cv=5, n_jobs=-1,scoring='f1')
clf_4.fit(x_train_preprocesado, y_train)

print("Mejor Score: ", clf_4.best_score_)
print("Mejores Parametros: ", clf_4.best_params_)

Mejor Score:  0.42501127121750654
Mejores Parametros:  {'clf__learning_rate': 0.2, 'vectorizador__ngram_range': (1, 3)}


Generacion del SUBMIT

Leo el archivo .csv modelo que tenemos se utilizar para realizar el submit a Kaggle

In [71]:
submission = pd.read_csv('sample_submission.csv')

Creo una nueva columna con los valores que predice el modelo

In [72]:
submission['target'] = clf_4.predict(x_test_preprocesado)

Guardo el archivo .csv para realizar el submit a Kaggle

In [73]:
submission.to_csv("preprocesado_bow_hiper_gradient_boosting.csv", index=False)

## 6.2 TF-IDF 

### 6.2.1 TFIDF - Texto sin Preprocesar - Vectorizador y Modelo sin optimizar Hiperparametros

In [74]:
#No se setean variantes a los Hiperparametros

grid_parametros = {}

clf_5 = GridSearchCV(pipeline_tfidf_gradient_boosting, grid_parametros,cv=5, n_jobs=-1,scoring='f1')
clf_5.fit(x_train_original, y_train)

print("Mejor Score: ", clf_5.best_score_)
print("Mejores Parametros: ", clf_5.best_params_)

Mejor Score:  0.550511437147054
Mejores Parametros:  {}


Generacion del SUBMIT

Leo el archivo .csv modelo que tenemos se utilizar para realizar el submit a Kaggle

In [75]:
submission = pd.read_csv('sample_submission.csv')

Creo una nueva columna con los valores que predice el modelo

In [76]:
submission['target'] = clf_5.predict(x_test_original)

Guardo el archivo .csv para realizar el submit a Kaggle

In [77]:
submission.to_csv("original_tfidf_nohiper_gradient_boosting.csv", index=False)

### 6.2.2 TFIDF - Texto Preprocesado - Vectorizador y Modelo sin optimizar Hiperparametros

In [78]:
#No se setean variantes a los Hiperparametros

grid_parametros = {}

clf_6 = GridSearchCV(pipeline_tfidf_gradient_boosting, grid_parametros,cv=5, n_jobs=-1,scoring='f1')
clf_6.fit(x_train_preprocesado, y_train)

print("Mejor Score: ", clf_6.best_score_)
print("Mejores Parametros: ", clf_6.best_params_)

Mejor Score:  0.3797253791200449
Mejores Parametros:  {}


Generacion del SUBMIT

Leo el archivo .csv modelo que tenemos se utilizar para realizar el submit a Kaggle

In [79]:
submission = pd.read_csv('sample_submission.csv')

Creo una nueva columna con los valores que predice el modelo

In [80]:
submission['target'] = clf_6.predict(x_test_preprocesado)

Guardo el archivo .csv para realizar el submit a Kaggle

In [81]:
submission.to_csv("preprocesado_tfidf_nohiper_gradient_boosting.csv", index=False)

### 6.2.3 TFIDF - Texto sin Preprocesar - Vectorizador y Modelo optimizando Hiperparametros

In [83]:
grid_parametros = {'clf__learning_rate': [0.05,0.1,0.2],
                   'vectorizador__ngram_range': [(1,1),(1,2),(1,3)]}
                   
clf_7 = GridSearchCV(pipeline_tfidf_gradient_boosting, grid_parametros,cv=5, n_jobs=-1,scoring='f1')
clf_7.fit(x_train_original, y_train)

print("Mejor Score: ", clf_7.best_score_)
print("Mejores Parametros: ", clf_7.best_params_)

Mejor Score:  0.573083814590637
Mejores Parametros:  {'clf__learning_rate': 0.2, 'vectorizador__ngram_range': (1, 1)}


Generacion del SUBMIT

Leo el archivo .csv modelo que tenemos se utilizar para realizar el submit a Kaggle

In [84]:
submission = pd.read_csv('sample_submission.csv')

Creo una nueva columna con los valores que predice el modelo

In [85]:
submission['target'] = clf_7.predict(x_test_original)

Guardo el archivo .csv para realizar el submit a Kaggle

In [86]:
submission.to_csv("original_tfidf_hiper_gradient_boosting.csv", index=False)

### 6.2.4 TFIDF - Texto Preprocesado - Vectorizador y Modelo optimizando Hiperparametros

In [87]:
grid_parametros = {'clf__learning_rate': [0.05,0.1,0.2],
                   'vectorizador__ngram_range': [(1,1),(1,2),(1,3)]}
                   
clf_8 = GridSearchCV(pipeline_tfidf_gradient_boosting, grid_parametros,cv=5, n_jobs=-1,scoring='f1')
clf_8.fit(x_train_preprocesado, y_train)

print("Mejor Score: ", clf_8.best_score_)
print("Mejores Parametros: ", clf_8.best_params_)

Mejor Score:  0.4320185504208044
Mejores Parametros:  {'clf__learning_rate': 0.2, 'vectorizador__ngram_range': (1, 2)}


Generacion del SUBMIT

Leo el archivo .csv modelo que tenemos se utilizar para realizar el submit a Kaggle

In [88]:
submission = pd.read_csv('sample_submission.csv')

Creo una nueva columna con los valores que predice el modelo

In [89]:
submission['target'] = clf_8.predict(x_test_preprocesado)

Guardo el archivo .csv para realizar el submit a Kaggle

In [90]:
submission.to_csv("preprocesado_tfidf_hiper_gradient_boosting.csv", index=False)