In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

In [34]:
# 导入训练集数据
monster = pd.read_csv("..//input/train.csv")

# 输出统计信息
print(monster.describe())
print("------")
print(monster.info())
print("------")
print(monster.head())

               id  bone_length  rotting_flesh  hair_length    has_soul
count  371.000000   371.000000     371.000000   371.000000  371.000000
mean   443.676550     0.434160       0.506848     0.529114    0.471392
std    263.222489     0.132833       0.146358     0.169902    0.176129
min      0.000000     0.061032       0.095687     0.134600    0.009402
25%    205.500000     0.340006       0.414812     0.407428    0.348002
50%    458.000000     0.434891       0.501552     0.538642    0.466372
75%    678.500000     0.517223       0.603977     0.647244    0.600610
max    897.000000     0.817001       0.932466     1.000000    0.935721
------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 371 entries, 0 to 370
Data columns (total 7 columns):
id               371 non-null int64
bone_length      371 non-null float64
rotting_flesh    371 non-null float64
hair_length      371 non-null float64
has_soul         371 non-null float64
color            371 non-null object
type             371 non-

In [35]:
# 类型转换为序列号
color_label = LabelEncoder()
monster['color'] = color_label.fit(monster['color']).transform(monster['color'])

type_label = LabelEncoder()
monster['type'] = type_label.fit(monster['type']).transform(monster['type'])

In [36]:
# feature select
def FeatureSelect(X, y, feature_name):
    
    
    rf = RandomForestClassifier()
    rf.fit(X, y)
    return sorted(zip(map(lambda x: round(x, 4), rf.feature_importances_), feature_names), reverse=True)

In [37]:
def create_features(dataframe):
    #Create some new variables.
    dataframe['hair_soul'] = dataframe['hair_length'] * dataframe['has_soul']
    dataframe['bone_soul'] = dataframe['bone_length'] * dataframe['has_soul']
    dataframe['hair_bone'] = dataframe['hair_length'] * dataframe['bone_length']
    dataframe['rotting_hair'] = dataframe['rotting_flesh'] * dataframe['hair_length']
    dataframe['rotting_soul'] = dataframe['rotting_flesh'] * dataframe['has_soul']
create_features(monster)

feature_names = ["bone_length", "rotting_flesh", "hair_length", "has_soul", "color", "hair_soul","bone_soul", "hair_bone", "rotting_hair", "rotting_soul",]

X = monster[feature_names]
y = monster['type']

feature_importance = FeatureSelect(X, y, feature_names)
print feature_importance
feature_select = []
for i in feature_importance:
    if i[0] > 0.03:
        feature_select.append(i[1])
print feature_select
X = monster[feature_select]

[(0.1813, 'hair_soul'), (0.1639, 'bone_soul'), (0.1269, 'hair_bone'), (0.1199, 'hair_length'), (0.0914, 'rotting_flesh'), (0.089, 'has_soul'), (0.0883, 'bone_length'), (0.0629, 'rotting_soul'), (0.0611, 'rotting_hair'), (0.0153, 'color')]
['hair_soul', 'bone_soul', 'hair_bone', 'hair_length', 'rotting_flesh', 'has_soul', 'bone_length', 'rotting_soul', 'rotting_hair']


In [38]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=2016)

In [39]:
param_grid = {
    'solver' : ['newton-cg', 'lbfgs'],
                  'multi_class' : ['ovr', 'multinomial'],
                  'C' : [0.005, 0.01, 1, 10, 100, 1000],
                  'tol': [0.0001, 0.001, 0.005]
}

from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression()
grid_search = GridSearchCV(log_reg, param_grid, cv=5)
grid_search.fit(X_train, y_train)
print("Best Score:%s" % grid_search.best_score_)
print("Best param:%s" % grid_search.best_params_)

Best Score:0.760617760618
Best param:{'multi_class': 'multinomial', 'C': 1, 'tol': 0.0001, 'solver': 'newton-cg'}


In [40]:
log_reg = LogisticRegression(multi_class="multinomial", C=1, tol=0.0001, solver='newton-cg')
log_reg.fit(X_train, y_train)
log_reg.score(X_test, y_test)

0.7053571428571429

In [51]:
param_grid = {
                'n_estimators' : [100, 150],
                'criterion' : ['gini'],
                'max_features' : ['auto'],
                'max_depth' : [5, 20, 100],
                'min_samples_split' : [2, 5, 7],
                'min_weight_fraction_leaf' : [0.0, 0.1],
                'max_leaf_nodes' : [40, 60, 80],
}

rfc = RandomForestClassifier(criterion='entropy', n_jobs=-1)
grid_search = GridSearchCV(rfc, param_grid, cv=5)
grid_search.fit(X_train, y_train)
print("Best Score:%s" % grid_search.best_score_)
print("Best param:%s" % grid_search.best_params_)

Best Score:0.772200772201
Best param:{'max_leaf_nodes': 60, 'n_estimators': 10, 'min_samples_split': 5, 'min_weight_fraction_leaf': 0.1, 'criterion': 'gini', 'max_features': 'auto', 'max_depth': 20}


In [49]:
from sklearn import svm
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import GaussianNB


gnb = GaussianNB()

svc = svm.SVC(kernel='linear')
svc.fit(X_train, y_train)
print ("svc score:%s " %svc.score(X_test, y_test))

Vclf1 = VotingClassifier(estimators=[('LR', log_reg), 
                                     ('GNB', gnb), ('SVC', svc)], voting='hard')
Vclf = VotingClassifier(estimators=[('LR', log_reg),
                                     ('GNB', gnb), ('SVC', svc)], voting='soft', weights=[1,1,1])

Vclf1.fit(X_train, y_train)


svc score:0.669642857143 


AttributeError: predict_proba is not available when  probability=False