In [1]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from datetime import datetime


In [2]:
# Load the training set from a CSV file located in the working directory.
train = pd.read_csv("train.csv")

In [3]:
# Candidate hyper-parameter values that the randomized search samples from.
params = dict(
    learning_rate=[0.05, 0.1, 0.2],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    max_depth=[3, 4, 5],
    n_estimators=[200, 400, 600, 800, 1000],
)

# Helper used only to measure how long the random search takes.
def timer(start_time=None):
    """Start a stopwatch, or report the time elapsed since it was started.

    Args:
        start_time: ``None`` (or any falsy value) to start timing; otherwise
            the ``datetime`` previously returned by this function.

    Returns:
        ``datetime.now()`` when no start time is given; ``None`` after
        printing the elapsed hours, minutes and seconds.
    """
    # Guard clause: no start time means the caller wants a fresh timestamp.
    if not start_time:
        return datetime.now()

    elapsed_seconds = (datetime.now() - start_time).total_seconds()
    hours, remainder = divmod(elapsed_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    print('\n Time taken: %i hours %i minutes and %s seconds.' % (hours, minutes, round(seconds, 2)))

In [4]:
def random_search_func(x, y, param_distributions=None, folds=5, param_comb=40):
    """Run a randomized hyper-parameter search for an XGBoost binary classifier.

    The search is scored with F1 under stratified, shuffled cross-validation
    and prints the elapsed time, the best score and the best parameters.

    Args:
        x: Feature matrix passed to ``RandomizedSearchCV.fit``.
        y: Binary target labels aligned with ``x``.
        param_distributions: Search space; defaults to the notebook-level
            ``params`` dictionary when omitted.
        folds: Number of cross-validation folds.
        param_comb: Number of parameter combinations to sample. The original
            author notes that 40 combinations take about 30 minutes.

    Returns:
        None. Results are reported through ``print``.
    """
    if param_distributions is None:
        # Fall back to the search space defined at notebook level.
        param_distributions = params

    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1001)

    # `verbosity` / `n_jobs` replace the deprecated `silent` / `nthread`
    # arguments of the XGBoost scikit-learn wrapper.
    xgb = XGBClassifier(objective='binary:logistic', verbosity=0, n_jobs=1)

    # Pass the splitter itself rather than the one-shot generator from
    # `skf.split(x, y)`; the fixed random_state keeps the folds identical.
    random_search = RandomizedSearchCV(
        xgb,
        param_distributions=param_distributions,
        n_iter=param_comb,
        scoring='f1',
        n_jobs=4,
        cv=skf,
        verbose=3,
        random_state=1001,
    )

    start_time = timer(None)
    random_search.fit(x, y)
    timer(start_time)

    print('\n Mejor f1 score con %d-folds y %d combinaciones de parametros:' % (folds, param_comb))
    print(random_search.best_score_)
    print('\n Best hyperparameters:')
    print(random_search.best_params_)

In [5]:
# Split the raw frame into feature columns and label columns; both keep 'id'.
x_train = train[['id', 'keyword', 'location', 'text']]
y_train = train[['id', 'target']]

# Feature ideas the author left disabled: a flag for texts containing "http:"
# and the character length of each text.

# NOTE(review): the vectorizer is fitted on every row before the split below,
# so validation rows also shape the vocabulary and IDF weights; confirm this
# is intended.
v = TfidfVectorizer(analyzer='word', stop_words='english')
x_tfidf = v.fit_transform(x_train['text'])

# Hold out 10% of the rows for validation, with a fixed seed.
xtrain, xvalid, ytrain, yvalid = train_test_split(
    x_tfidf,
    y_train,
    test_size=0.1,
    random_state=1,
)

In [None]:
# Launch the randomized hyper-parameter search on the training split.
random_search_func(x=xtrain, y=ytrain['target'])

In [6]:
# Train a classifier with fixed hyper-parameters and predict the validation split.
# NOTE(review): presumably these values came from the random search; confirm.
clf = XGBClassifier(
    learning_rate=0.2,
    n_estimators=800,
    max_depth=5,
    subsample=1,
    colsample_bytree=1,
    nthread=10,
)

xgb_train = clf.fit(xtrain, ytrain['target'])
prediction = clf.predict(xvalid)

In [7]:
# Accuracy of the predictions on the held-out validation split.
accuracy_score(y_true=yvalid['target'], y_pred=prediction)

0.7821522309711286

In [8]:
# F1 on the validation split. Arguments follow scikit-learn's
# (y_true, y_pred) order, matching the accuracy_score call; swapping them only
# exchanges precision and recall, so the binary F1 value itself is unchanged.
f1_score(yvalid['target'], prediction)

0.7186440677966102