In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the training CSV and show summary statistics of the frame
data = pd.read_csv(filepath_or_buffer='./data/otto/train.csv')
data.describe()

In [None]:
# (rows, columns) of the full training frame
data.shape

In [None]:
# Preview the first rows to inspect the column layout
data.head()

In [None]:
# Data visualisation: number of samples per target class.
# The series is passed through the `x` keyword because newer seaborn releases no
# longer accept the plotted variable as the first positional argument.
sns.countplot(x=data.target)
plt.show()

In [None]:
# 数据基本处理

In [None]:
# Take the first 10,000 rows (positional slice) and show the resulting shape
new_data = data.iloc[:10000]
new_data.shape

In [None]:
# Class counts within the 10,000-row subset.
# `x=` keyword: newer seaborn releases no longer accept the plotted variable
# as the first positional argument.
sns.countplot(x=new_data.target)
plt.show()

In [None]:
# Random under-sampling needs the features and the labels as separate objects.
# Features: every column except the row `id` and the `target` label.
x = data.drop(columns=['id', 'target'])
# Labels: the `target` column.
y = data['target']

In [None]:
# Preview the feature columns (id and target removed)
x.head()

In [None]:
# Preview the label column
y.head()

In [None]:
# Resample the data with random under-sampling
from imblearn.under_sampling import RandomUnderSampler

# Fixed seed so the same rows are selected on every run
rus = RandomUnderSampler(random_state=0)
x_resampled, y_resampled = rus.fit_resample(X=x, y=y)

In [None]:
# Shape of the feature matrix after under-sampling
x_resampled.shape

In [None]:
# Shape of the label vector after under-sampling (row count should match x_resampled)
y_resampled.shape

In [None]:
 # Class counts after under-sampling. `x=` keyword: newer seaborn releases no
 # longer accept the plotted variable as the first positional argument.
 sns.countplot(x=y_resampled)

In [None]:
# Convert the class labels to integer codes
from sklearn.preprocessing import LabelEncoder

# Learn the label -> code mapping first, then apply it
le = LabelEncoder().fit(y_resampled)
y_resampled = le.transform(y_resampled)

In [None]:
# Split the resampled data: 80% training, 20% test
from sklearn.model_selection import train_test_split

# A fixed seed keeps the split, and every score computed from it, reproducible
# across Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(
    x_resampled, y_resampled, test_size=0.2, random_state=0
)

In [None]:
# Model training: baseline random forest with default hyper-parameters
from sklearn.ensemble import RandomForestClassifier

# oob_score=True makes the out-of-bag accuracy available as rf.oob_score_;
# random_state fixes the bootstrap / feature sampling so the scores are reproducible.
rf = RandomForestClassifier(oob_score=True, random_state=0)
rf.fit(x_train, y_train)

In [None]:
# Hard class predictions (integer-encoded labels) of the baseline forest on the test set
y_pre=rf.predict(x_test)
y_pre

In [None]:
# Accuracy of the baseline forest on the test set
rf.score(x_test, y_test)

In [None]:
# Out-of-bag score recorded during training (available because oob_score=True)
rf.oob_score_

In [None]:
# Distribution of the predicted classes.
# `x=` keyword: newer seaborn releases no longer accept the plotted variable
# as the first positional argument.
sns.countplot(x=y_pre)
plt.show()

In [None]:
# Model evaluation with log loss
from sklearn.metrics import log_loss

# Passing hard class labels is expected to fail here: log_loss needs one column
# per class. The error is caught and shown so the demonstration is preserved
# while the notebook can still be run top to bottom.
try:
    log_loss(y_test, y_pre, eps=1e-15, normalize=True)
except ValueError as err:
    print("log_loss failed:", err)

上面报错原因是logloss使用过程要求输出用one-hot表示
需要将多类别问题的输出结果通过OneHotEncoder修改为如下：

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Fit the encoder ONCE on the model's full label set and only transform afterwards.
# Re-fitting separately on y_test and y_pre would give matrices with different
# widths / column meanings whenever a class is missing from either array.
# NOTE(review): `sparse` was renamed `sparse_output` in newer scikit-learn -- confirm
# against the installed version.
one_hot = OneHotEncoder(sparse=False)
one_hot.fit(rf.classes_.reshape(-1, 1))

y_test1 = one_hot.transform(y_test.reshape(-1, 1))
y_pre1 = one_hot.transform(y_pre.reshape(-1, 1))

In [None]:
# Log loss on the one-hot matrices; y_pre1 encodes hard predictions, not probabilities
log_loss(y_test1, y_pre1, eps=1e-15, normalize=True)

In [None]:
# Switch the prediction output to per-class probabilities to lower the log loss
y_pre_proba = rf.predict_proba(x_test)
y_pre_proba

# 模型调优
n_estimators, max_features, max_depth, min_samples_leaf

In [None]:
# Find the best n_estimators
tuned_parameters = range(10, 200, 10)
# One OOB accuracy slot per candidate value
accuracy_t = np.zeros(len(tuned_parameters))
# One test-set log-loss slot per candidate value
error_t = np.zeros(len(tuned_parameters))

# Sweep: fit one forest per candidate and record both metrics
for j, n_trees in enumerate(tuned_parameters):
    rf2 = RandomForestClassifier(n_estimators=n_trees,
                                 max_depth=10,
                                 max_features=10,
                                 min_samples_split=10,
                                 oob_score=True,
                                 random_state=0,
                                 n_jobs=-1)
    rf2.fit(x_train, y_train)

    # Accuracy: out-of-bag score
    accuracy_t[j] = rf2.oob_score_

    # Log loss on the test set. A dedicated name is used so the hard-label `y_pre`
    # of the baseline model is not overwritten with a probability matrix.
    proba_rf2 = rf2.predict_proba(x_test)
    error_t[j] = log_loss(y_test, proba_rf2, eps=1e-5, normalize=True)

    # Progress: report this iteration only, not the whole mostly-zero array
    print('n_estimators=%d  logloss=%.5f  oob=%.5f' % (n_trees, error_t[j], accuracy_t[j]))
    

In [None]:
# Plot test log loss (left) and OOB accuracy (right) against n_estimators
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20,4), dpi=100)

for ax, values, ylabel in zip(axes, (error_t, accuracy_t), ('error_t', 'accuracy_t')):
    ax.plot(tuned_parameters, values)
    ax.set_xlabel('n_estimators')
    ax.set_ylabel(ylabel)
    ax.grid(True)

plt.show()

In [None]:
# Find the best range for max_features
tuned_parameters = range(5, 40, 5)
# One OOB accuracy slot per candidate value
accuracy_t = np.zeros(len(tuned_parameters))
# One test-set log-loss slot per candidate value
error_t = np.zeros(len(tuned_parameters))

# Sweep: fit one forest per candidate and record both metrics
for j, n_features in enumerate(tuned_parameters):
    rf2 = RandomForestClassifier(n_estimators=175,
                                 max_depth=10,
                                 max_features=n_features,
                                 min_samples_split=10,
                                 oob_score=True,
                                 random_state=0,
                                 n_jobs=-1)
    rf2.fit(x_train, y_train)

    # Accuracy: out-of-bag score
    accuracy_t[j] = rf2.oob_score_

    # Log loss on the test set. A dedicated name is used so the hard-label `y_pre`
    # of the baseline model is not overwritten with a probability matrix.
    proba_rf2 = rf2.predict_proba(x_test)
    error_t[j] = log_loss(y_test, proba_rf2, eps=1e-5, normalize=True)

    # Progress: report this iteration only, not the whole mostly-zero array
    print('max_features=%d  logloss=%.5f  oob=%.5f' % (n_features, error_t[j], accuracy_t[j]))

# Test log loss (left) and OOB accuracy (right) against max_features
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 4), dpi=100)
for ax, values, ylabel in zip(axes, (error_t, accuracy_t), ('error_t', 'accuracy_t')):
    ax.plot(tuned_parameters, values)
    ax.set_xlabel('max_features')
    ax.set_ylabel(ylabel)
    ax.grid(True)

plt.show()

In [None]:
# Find the best range for max_depth
tuned_parameters = range(10, 100, 10)
# One OOB accuracy slot per candidate value
accuracy_t = np.zeros(len(tuned_parameters))
# One test-set log-loss slot per candidate value
error_t = np.zeros(len(tuned_parameters))

# Sweep: fit one forest per candidate and record both metrics
for j, depth in enumerate(tuned_parameters):
    rf2 = RandomForestClassifier(n_estimators=175,
                                 max_depth=depth,
                                 max_features=15,
                                 min_samples_split=10,
                                 oob_score=True,
                                 random_state=0,
                                 n_jobs=-1)
    rf2.fit(x_train, y_train)

    # Accuracy: out-of-bag score
    accuracy_t[j] = rf2.oob_score_

    # Log loss on the test set. A dedicated name is used so the hard-label `y_pre`
    # of the baseline model is not overwritten with a probability matrix.
    proba_rf2 = rf2.predict_proba(x_test)
    error_t[j] = log_loss(y_test, proba_rf2, eps=1e-5, normalize=True)

    # Progress: report this iteration only, not the whole mostly-zero array
    print('max_depth=%d  logloss=%.5f  oob=%.5f' % (depth, error_t[j], accuracy_t[j]))

# Test log loss (left) and OOB accuracy (right) against max_depth
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 4), dpi=100)
for ax, values, ylabel in zip(axes, (error_t, accuracy_t), ('error_t', 'accuracy_t')):
    ax.plot(tuned_parameters, values)
    ax.set_xlabel('max_depth')
    ax.set_ylabel(ylabel)
    ax.grid(True)

plt.show()

In [None]:
# Find the best range for min_samples_leaf
tuned_parameters = range(1, 10, 2)
# One OOB accuracy slot per candidate value
accuracy_t = np.zeros(len(tuned_parameters))
# One test-set log-loss slot per candidate value
error_t = np.zeros(len(tuned_parameters))

# Sweep: fit one forest per candidate and record both metrics
for j, leaf_size in enumerate(tuned_parameters):
    rf2 = RandomForestClassifier(n_estimators=175,
                                 max_depth=10,
                                 max_features=30,
                                 min_samples_leaf=leaf_size,
                                 oob_score=True,
                                 random_state=0,
                                 n_jobs=-1)
    rf2.fit(x_train, y_train)

    # Accuracy: out-of-bag score
    accuracy_t[j] = rf2.oob_score_

    # Log loss on the test set. A dedicated name is used so the hard-label `y_pre`
    # of the baseline model is not overwritten with a probability matrix.
    proba_rf2 = rf2.predict_proba(x_test)
    error_t[j] = log_loss(y_test, proba_rf2, eps=1e-5, normalize=True)

    # Progress: report this iteration only, not the whole mostly-zero array
    print('min_samples_leaf=%d  logloss=%.5f  oob=%.5f' % (leaf_size, error_t[j], accuracy_t[j]))

# Test log loss (left) and OOB accuracy (right) against min_samples_leaf
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 4), dpi=100)
for ax, values, ylabel in zip(axes, (error_t, accuracy_t), ('error_t', 'accuracy_t')):
    ax.plot(tuned_parameters, values)
    ax.set_xlabel('min_samples_leaf')
    ax.set_ylabel(ylabel)
    ax.grid(True)

plt.show()

In [None]:
# Best model: hyper-parameters collected in one place, then passed to the forest
best_params = dict(
    n_estimators=175,
    max_depth=30,
    max_features=15,
    min_samples_leaf=1,
)
rf3 = RandomForestClassifier(oob_score=True, random_state=40, **best_params)

rf3.fit(x_train, y_train)
# Accuracy on the test set
rf3.score(x_test, y_test)

In [None]:
# Out-of-bag score of the tuned forest
rf3.oob_score_

In [None]:
# Log loss of the tuned forest, computed from its class probabilities on the test set
y_pre_proba1 = rf3.predict_proba(x_test)
log_loss(y_test, y_pre_proba1)

In [None]:
 # 生成提交数据
# Read the test CSV and preview its first rows
test_data = pd.read_csv(filepath_or_buffer='./data/otto/test.csv')
test_data.head()

In [None]:
# Build the feature matrix under its own name instead of overwriting `test_data`:
# the `id` column is still needed when the submission frame is assembled, and
# re-assigning `test_data` made this cell fail with KeyError when run a second time.
test_features = test_data.drop(['id'], axis=1)

# Per-class probabilities of the tuned forest for every test row
y_pre_test = rf3.predict_proba(test_features)
y_pre_test

In [None]:
# One probability column per class, named Class_1 .. Class_9
class_columns = ['Class_%d' % class_no for class_no in range(1, 10)]
result_data = pd.DataFrame(y_pre_test, columns=class_columns)
result_data.head()

In [None]:
# Prepend the `id` column for the submission file.
# `test_data` may no longer carry `id` (an earlier cell can drop it), in which case
# `test_data.id` raises AttributeError -- fall back to re-reading just that column.
if 'id' in test_data.columns:
    submission_ids = test_data['id']
else:
    submission_ids = pd.read_csv('./data/otto/test.csv', usecols=['id'])['id']

# Guard keeps the cell re-runnable: inserting an already-present column raises ValueError.
if 'id' not in result_data.columns:
    result_data.insert(loc=0, column='id', value=submission_ids)