## Random Forest

In [8]:
import os
import sys
import scipy
import scipy.sparse
import pandas as pd
import numpy as np
import xgboost as xgb
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation, metrics
from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import cross_val_score

PROJECT_DIR = '/home/michal/diplomka/code'
if PROJECT_DIR not in sys.path:
    sys.path.append(PROJECT_DIR)
from lib.utils import load_csr_matrix

In [5]:
# Names of the feature groups used by the model.  load_data() reads each one
# from '<name>.npz' inside the dataset directory and stacks them column-wise.
FEATURES = [
    'a_hypernyms',
    'a_head_form',
    'a_head_number',
    'a_non_article_det',
    'a_parent',
    'a_pos_after_head',
    'a_pos_before_head',
    'a_words_after_head',
    'a_words_after_np',
    'a_words_before_head',
    'a_words_before_np',
    'b_head_proper',
    'b_head_pos_simple',
    'b_object_form',
    'b_pos_after_head_as_list',
    'b_pos_before_head_as_list',
    'b_pp_object_form',
    'b_postmodification_type',
    'b_referent',
    'b_words_after_head_as_list',
    'b_words_after_np_as_list',
    'b_words_before_head_as_list',
    'b_words_before_np_as_list',
    'c_countability_bnc',
    'd_head_form_embeddings',
    'e_kenlm_ggl_5_lc_nbs',
]

# Root directory holding one sub-directory per dataset (see load_data()).
DATA_PATH = os.path.join(PROJECT_DIR, '../data/features/penn/postprocessed')

# Where the cross-validation results are logged.
CV_RESULTS_LOG_PATH = os.path.join(
    PROJECT_DIR, 'logs/experiments/model_results/penn/rf'
)
#NUM_CLASS = 3

# Dataset directory names under DATA_PATH.
TRAIN_SET_NAME = 'train'
TEST_SET_NAME = 'heldout'

In [6]:
def load_data(features, dataset_name, sample=None):
    """Load the target vector and the stacked feature matrix of a dataset.

    The targets are read with ``np.load`` from ``DATA_PATH/<dataset_name>/Y_article``
    and every feature in ``features`` is read with ``load_csr_matrix`` from
    ``DATA_PATH/<dataset_name>/<feature>.npz``.  The per-feature matrices are
    concatenated column-wise.

    Args:
        features: iterable of feature names (file stems without ``.npz``).
        dataset_name: name of the dataset sub-directory under ``DATA_PATH``.
        sample: optional number of rows to draw (without replacement).  A
            falsy value (``None`` or ``0``) keeps all rows.

    Returns:
        Tuple ``(x, y)`` where ``x`` is the result of ``scipy.sparse.hstack``
        over the feature matrices and ``y`` is the target array restricted to
        the same rows.

    Raises:
        AssertionError: if the number of rows in ``x`` differs from ``len(y)``.
    """
    y = np.load(os.path.join(DATA_PATH, dataset_name, 'Y_article'))

    sample_indices = None
    if sample:
        # NOTE: this reseeds NumPy's *global* RNG so the drawn sample is the
        # same on every call; kept as-is for reproducibility of old results.
        np.random.seed(seed=42)
        # Keep the indices as a plain 1-D array.  Wrapping them in a list
        # (as the code used to) makes current NumPy treat them as a 2-D
        # index, yielding y of shape (1, sample) and 3-D feature slices.
        sample_indices = np.random.choice(len(y), sample, replace=False)
        y = y[sample_indices]

    feature_matrices = []
    for feature_name in features:
        feature_matrix = load_csr_matrix(
            os.path.join(DATA_PATH, dataset_name, feature_name + '.npz')
        )
        if sample_indices is not None:
            # Select rows directly on the sparse matrix instead of
            # densifying the whole feature matrix first.
            feature_matrix = scipy.sparse.csr_matrix(feature_matrix)[sample_indices]
        feature_matrices.append(feature_matrix)

    x = scipy.sparse.hstack(feature_matrices)
    assert x.shape[0] == len(y), (
        'feature rows (%d) do not match number of targets (%d)'
        % (x.shape[0], len(y))
    )
    return x, y

### Ladění

In [7]:
# Baseline random-forest hyper-parameters shared by the experiments below.
# Presumably unpacked into RandomForestClassifier(**params) -- confirm
# against the callers.
RF_DEFAULT_PARAMS = dict(
    criterion='gini',
    max_features='sqrt',
    verbose=10,
    random_state=284,  # fixed seed
    n_jobs=-1,
)

In [None]:
def get_nb_of_trees(train_x, train_y, model_params, log_file):
    model = RandomForestClassifier(**dict(model_params, warm_start=True)