In [1]:
from sklearn import ensemble, linear_model, pipeline
from beat_the_streak.transformers import FeatureSelector
from beat_the_streak.list_subtract import subtract
from beat_the_streak.players import PlayerModel
from sklearn.ensemble import VotingClassifier
from beat_the_streak.dataset import test_train_split, train_cols
from beat_the_streak.metrics import BestPickForEachDayGotHitPercent

In [5]:
def forest_factory_factory(*args, **kargs):
    """Build a zero-argument factory for identically configured random forests.

    All positional and keyword arguments are captured and forwarded unchanged
    to ``ensemble.RandomForestClassifier`` each time the returned callable is
    invoked, so every invocation constructs a new classifier.

    NOTE(review): the returned callable is a nested function, which the
    standard ``pickle`` module cannot serialize -- confirm that is acceptable
    wherever these factories are handed to worker processes.
    """
    forest_args, forest_kwargs = args, kargs

    def factory():
        """Construct one forest from the captured configuration."""
        return ensemble.RandomForestClassifier(*forest_args, **forest_kwargs)

    return factory

def log_factory():
    """Return a new ``LogisticRegression`` built with the library defaults."""
    default_logit = linear_model.LogisticRegression()
    return default_logit

def log_factory100():
    """Return a new ``LogisticRegression`` constructed with ``C=100``."""
    weakly_regularised_logit = linear_model.LogisticRegression(C=100)
    return weakly_regularised_logit

def log_factory5():
    """Return a new ``LogisticRegression`` constructed with ``C=0.5``.

    Note that, despite the ``5`` suffix in the name, the value passed is 0.5.
    """
    half_c_logit = linear_model.LogisticRegression(C=0.5)
    return half_c_logit

# Candidate zero-argument estimator factories. The whole list is passed as a
# single grid value for 'ply__clf__model_cls' in ``tuned_parameters``.
#
# NOTE(review): the scikit-learn documentation describes max_features='auto'
# as equivalent to 'sqrt' for RandomForestClassifier; if that holds for the
# installed version, each sqrt/auto pair below differs only by the forest's
# random state -- confirm that this repetition is intended.
possible_factories = [
    # --- Logistic regressions with different regularisation settings.
    log_factory,
    log_factory100,
    log_factory5,

    # --- Forests without a depth or leaf-count limit.
    forest_factory_factory(min_samples_split=2, n_estimators=5, max_features='sqrt'),
    forest_factory_factory(min_samples_split=2, n_estimators=5, max_features='auto'),

    forest_factory_factory(min_samples_split=20, n_estimators=15, max_features='sqrt'),
    forest_factory_factory(min_samples_split=20, n_estimators=15, max_features='auto'),

    forest_factory_factory(min_samples_split=2, n_estimators=10, max_features='sqrt'),
    forest_factory_factory(min_samples_split=2, n_estimators=10, max_features='auto'),

    # --- Depth-limited forests: max_depth in {100, 10} x max_features in {auto, sqrt}.
    forest_factory_factory(min_samples_split=20, n_estimators=10, max_depth=100, max_features='auto'),
    forest_factory_factory(min_samples_split=20, n_estimators=10, max_depth=10, max_features='auto'),

    forest_factory_factory(min_samples_split=20, n_estimators=10, max_depth=100, max_features='sqrt'),
    # Fixed: this entry used to be a verbatim repeat of the max_depth=10 /
    # 'auto' entry above, leaving the max_depth=10 / 'sqrt' cell of the
    # pattern missing.
    forest_factory_factory(min_samples_split=20, n_estimators=10, max_depth=10, max_features='sqrt'),

    # --- Forests capped at 100 leaf nodes per tree.
    forest_factory_factory(max_leaf_nodes=100, min_samples_split=20, n_estimators=10, max_features='sqrt'),
    forest_factory_factory(max_leaf_nodes=100, min_samples_split=20, n_estimators=10, max_features='auto'),

    forest_factory_factory(max_leaf_nodes=100, min_samples_split=2, n_estimators=5, max_features='sqrt'),
    forest_factory_factory(max_leaf_nodes=100, min_samples_split=2, n_estimators=5, max_features='auto'),

    forest_factory_factory(max_leaf_nodes=100, min_samples_split=20, n_estimators=15, max_features='sqrt'),
    forest_factory_factory(max_leaf_nodes=100, min_samples_split=20, n_estimators=15, max_features='auto'),

    forest_factory_factory(max_leaf_nodes=100, min_samples_split=2, n_estimators=10, max_features='sqrt'),
    forest_factory_factory(max_leaf_nodes=100, min_samples_split=2, n_estimators=10, max_features='auto'),

    # NOTE(review): this pair repeats the first max_leaf_nodes=100 pair
    # exactly; kept so the number of candidates is unchanged -- confirm
    # whether the repetition is deliberate.
    forest_factory_factory(max_leaf_nodes=100, min_samples_split=20, n_estimators=10, max_features='sqrt'),
    forest_factory_factory(max_leaf_nodes=100, min_samples_split=20, n_estimators=10, max_features='auto'),

    # --- Forests capped at 20 leaf nodes per tree.
    forest_factory_factory(max_leaf_nodes=20, min_samples_split=20, n_estimators=10, max_features='sqrt'),
    forest_factory_factory(max_leaf_nodes=20, min_samples_split=20, n_estimators=10, max_features='auto'),

    forest_factory_factory(max_leaf_nodes=20, min_samples_split=2, n_estimators=5, max_features='sqrt'),
    forest_factory_factory(max_leaf_nodes=20, min_samples_split=2, n_estimators=5, max_features='auto'),

    forest_factory_factory(max_leaf_nodes=20, min_samples_split=20, n_estimators=15, max_features='sqrt'),
    forest_factory_factory(max_leaf_nodes=20, min_samples_split=20, n_estimators=15, max_features='auto'),

    forest_factory_factory(max_leaf_nodes=20, min_samples_split=2, n_estimators=10, max_features='sqrt'),
    forest_factory_factory(max_leaf_nodes=20, min_samples_split=2, n_estimators=10, max_features='auto'),

    # NOTE(review): this pair repeats the first max_leaf_nodes=20 pair (in
    # reverse order); kept so the number of candidates is unchanged --
    # confirm whether the repetition is deliberate.
    forest_factory_factory(max_leaf_nodes=20, min_samples_split=20, n_estimators=10, max_features='auto'),
    forest_factory_factory(max_leaf_nodes=20, min_samples_split=20, n_estimators=10, max_features='sqrt'),
]


# Parameter grid for the grid search over the voting ensemble. Keys use
# scikit-learn's ``<estimator>__<step>__<param>`` nesting, where the estimator
# names ('rf', 'lg', 'ply') and step name ('clf') are the ones given when the
# pipelines and the VotingClassifier are built.
tuned_parameters = {
    # One weight per voting estimator; assumed to follow the estimator order
    # used when the VotingClassifier is constructed (rf, lg, ply).
    'weights': [[1, 1, .8], [1, 1, 1], [1, 1, 1.2], [1, 1.2, 1]],
    'voting': ['soft', 'hard'],
    'lg__clf__C': [1.0],

    # A single candidate whose value is the whole list of factories (note the
    # extra brackets), so the list itself is handed to PlayerModel rather than
    # being expanded into one grid point per factory.
    'ply__clf__model_cls': [possible_factories],

    'rf__clf__n_estimators': [25],
    'rf__clf__max_features': ['auto'],
    # NOTE(review): a binary tree with at most 10 leaves cannot be deeper than
    # 9, so with max_leaf_nodes=10 both max_depth values likely yield the same
    # trees and double the search cost -- confirm whether both are needed.
    'rf__clf__max_depth': [15, 12],
    # Fixed: was [1]. scikit-learn documents 2 as the smallest valid integer
    # for min_samples_split, and newer releases reject 1 with an error.
    'rf__clf__min_samples_split': [2],
    'rf__clf__max_leaf_nodes': [10],
}

In [None]:
from beat_the_streak import dataset
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import LabelKFold

# Load the data through the project helper, starting at 2015-05-30.
choices = dataset.load_dataset_starting_at_day('2015-05-30')

# Random-forest branch of the ensemble. The selector is given
# subtract(train_cols, ['player_hash']) -- presumably every training column
# except 'player_hash'; verify against the helper.
rf = pipeline.Pipeline(steps=[
    ('sel', FeatureSelector(subtract(train_cols, ['player_hash']))),
    ('clf', ensemble.RandomForestClassifier()),
])

# Logistic-regression branch, built with the same column selection as `rf`.
lg = pipeline.Pipeline(steps=[
    ('sel', FeatureSelector(subtract(train_cols, ['player_hash']))),
    ('clf', linear_model.LogisticRegression()),
])

# Player-model branch: here 'hitting_average' is what gets subtracted, so
# 'player_hash' is not removed from this branch's columns.
ply = pipeline.Pipeline(steps=[
    ('sel', FeatureSelector(subtract(train_cols, ['hitting_average']))),
    ('clf', PlayerModel()),
])

# The estimator names below are the prefixes used by the keys of
# `tuned_parameters`, so they must stay in sync with that grid.
voting_model = VotingClassifier(
    estimators=[
        ('rf', rf),
        ('lg', lg),
        ('ply', ply),
    ],
)

# Folds are built from game_date labels, and the scorer is constructed with the
# same game_date series plus the number of picks per day.
grid = GridSearchCV(
    estimator=voting_model,
    param_grid=tuned_parameters,
    cv=LabelKFold(choices.game_date, n_folds=10),
    n_jobs=6,
    scoring=BestPickForEachDayGotHitPercent(
        choices.game_date,
        number_of_choices_per_day=10,
    ),
)

# Run the full search; the target is the got_hit column.
grid.fit(choices[dataset.train_cols], choices.got_hit)