In [1]:
from sklearn.model_selection import train_test_split,cross_val_score,KFold
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn import metrics
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
train_df=pd.read_csv('./titanic/train.csv')
test_df=pd.read_csv('./titanic/test.csv')
import warnings
warnings.filterwarnings('ignore')

#重复上一节的操作...
train_df['Cabin'].fillna('missing',inplace=True)
test_df['Cabin'].fillna('missing',inplace=True)
train_df['Embarked'].fillna(train_df['Embarked'].mode()[0],inplace=True)
train_df['Age'].fillna(train_df['Age'].mean(),inplace=True)
test_df['Age'].fillna(train_df['Age'].mean(),inplace=True)
test_df['Fare'].fillna(train_df['Fare'].mean(),inplace=True)
import category_encoders as ce
del train_df['Name']
del train_df['Ticket']
del test_df['Name']
del test_df['Ticket']
del train_df['PassengerId']
del test_df['PassengerId']
label=train_df["Survived"]
del train_df["Survived"]
# target 
target_encoder = ce.TargetEncoder(cols=['Embarked','Cabin']).fit(train_df,label)
train_df=target_encoder.transform(train_df)
test_df=target_encoder.transform(test_df)

# one hot
onehot_encoder = ce.OneHotEncoder(cols=['Sex']).fit(train_df)
train_df=onehot_encoder.transform(train_df)
test_df=onehot_encoder.transform(test_df)

from sklearn.preprocessing import StandardScaler,MinMaxScaler,Normalizer
#z-score归一化为例
standard_scaler=StandardScaler()
standard_scaler.fit(train_df)
new_train_df=pd.DataFrame(standard_scaler.transform(train_df),columns=train_df.columns)
new_test_df=pd.DataFrame(standard_scaler.transform(test_df),columns=train_df.columns)

In [2]:
from sklearn.preprocessing import PolynomialFeatures
poly=PolynomialFeatures(degree=2,include_bias=False,interaction_only=False)#
poly_fea_np=poly.fit_transform(train_df)
poly_fea_df=pd.DataFrame(poly_fea_np,columns=poly.get_feature_names())

In [3]:
poly_fea_test_np=poly.transform(test_df)
poly_fea_test_df=pd.DataFrame(poly_fea_test_np,columns=poly.get_feature_names())


主要考虑提供更多数据给模型训练，包括两方面：  

（1）利用其余的未标记数据进行无监督学习，在我们的标记数据进行监督学习（半监督学习），比如nlp任务中收集海量的文本数据训练embedding，然后再在其他nlp任务上做fine tuning；  

（2）在当前数据的基础上造出相似的数据，比如nlp任务中删除某一个词、替换同义词...，cv任务中缩放、旋转、翻转图片、gan...

### 一.半监督学习
这里没有多余的feature数据，我们假设test部分就是多出来的部分；   
在pca上做对比...

In [4]:
#增强前
from sklearn.decomposition import PCA
X_pca=PCA(n_components=20).fit_transform(poly_fea_df)
#gbdt
classifier=GradientBoostingClassifier()
scores = cross_val_score(classifier,X_pca, label, scoring='f1', cv = 5)
np.mean(scores),np.std(scores)

(0.7653568059407474, 0.02975655731469058)

In [5]:
#增强后
X_pca=PCA(n_components=20).fit_transform(np.concatenate([poly_fea_df,poly_fea_test_df]))
#gbdt
classifier=GradientBoostingClassifier()
scores = cross_val_score(classifier,X_pca[:891], label, scoring='f1', cv = 5)
np.mean(scores),np.std(scores)

(0.7748675479764392, 0.023371128453025514)

其他ae,kmeans都可以尝试...

### 二.过采样

In [6]:
from imblearn.over_sampling import SMOTE
kfold= KFold(n_splits=5,random_state=42,shuffle=True)
scores=[]
for train_index,test_index in kfold.split(poly_fea_df,label):
    X_train=poly_fea_df.loc[train_index]
    y_train=label[train_index]
    X_test=poly_fea_df.loc[test_index]
    y_test=label[test_index]
    
    X_resampled,y_resampled=SMOTE(k_neighbors=5).fit_sample(X_train,y_train)
    
    gbdt=GradientBoostingClassifier()
    gbdt.fit(X_resampled,y_resampled)
    y_predict=gbdt.predict(X_test)
    f1_score=metrics.f1_score(y_test,y_predict)
    scores.append(f1_score)
np.mean(scores),np.std(scores)

Using TensorFlow backend.


(0.7764333124477861, 0.061403220779861586)

### 三.自定义规则
对每条训练数据做如下操作：  
（1）随机删掉某个特征（0替换）；  
（2）随机交换同class的某个特征的值；  
（3）随机交换非class的某个特征的值； 

In [7]:
import copy
import random
def extend_data(train_df,train_y):
    #删除操作
    rows,cols=train_df.shape
    delete_df=copy.deepcopy(train_df)
    for i in range(0,rows):
        j=random.choice(range(0,cols))
        delete_df.iloc[i,j]=0#注意：要用iloc[i,j]的方式才能成功赋值，loc[i,j],iloc[i][j],iloc[i,j]的方式都不行
    #替换操作
    replace_df=copy.deepcopy(train_df)
    zero_class_df=train_df[train_y==0]
    one_class_df=train_df[train_y==1]
    zero_rows,_=zero_class_df.shape
    one_rows,_=one_class_df.shape
    for i in range(0,rows):
        j=random.choice(range(0,cols))
        if train_y.tolist()[i]==0:
            new_i=random.choice(range(0,zero_rows))
            replace_df.iloc[i,j]=zero_class_df.iloc[new_i,j]
        else:
            new_i=random.choice(range(0,one_rows))
            replace_df.iloc[i,j]=one_class_df.iloc[new_i,j]
    #替换操作
    replace_df2=copy.deepcopy(train_df)
    for i in range(0,rows):
        j=random.choice(range(0,cols))
        if train_y.tolist()[i]==0:
            new_i=random.choice(range(0,one_rows))
            replace_df2.iloc[i,j]=one_class_df.iloc[new_i,j]
        else:
            new_i=random.choice(range(0,zero_rows))
            replace_df2.iloc[i,j]=zero_class_df.iloc[new_i,j]
    #合并
    return pd.concat([train_df,delete_df,replace_df,replace_df2]),train_y.tolist()*4

In [8]:
kfold= KFold(n_splits=5,random_state=42,shuffle=True)
scores=[]
for train_index,test_index in kfold.split(poly_fea_df,label):
    X_train=poly_fea_df.loc[train_index]
    y_train=label[train_index]
    X_test=poly_fea_df.loc[test_index]
    y_test=label[test_index]
    
    X_extended,y_extended=extend_data(X_train,y_train)
    X_extended2,y_extended2=extend_data(X_train,y_train)
    X_extended3,y_extended3=extend_data(X_train,y_train)
    
    gbdt=GradientBoostingClassifier()
    gbdt.fit(pd.concat([X_train,X_extended,X_extended2,X_extended3]),y_train.tolist()+y_extended+y_extended2+y_extended3)
    y_predict=gbdt.predict(X_test)
    f1_score=metrics.f1_score(y_test,y_predict)
    scores.append(f1_score)
np.mean(scores),np.std(scores)

(0.772241111484664, 0.052028637093507364)