In [1]:
import numpy as np
import pandas as pd
import os
import time
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Directory holding the Kaggle Titanic CSV files. Set the TITANIC_DATA_DIR
# environment variable to read them from another location; when it is unset the
# original local download folder is used, so existing behaviour is unchanged.
path_folder=os.path.normcase(
    os.environ.get('TITANIC_DATA_DIR',
                   r'C:\Users\dell\Downloads\Titanic-maching-learning-from-disaster'))
# train.csv is the labelled set; test.csv is kept under a separate name as loaded.
train_data=pd.read_csv(os.path.join(path_folder,'train.csv'),sep=',')
test_data_origin=pd.read_csv(os.path.join(path_folder,'test.csv'),sep=',')

In [3]:
def deal_ticket_fare(df):
    '''
    Count how many rows share each ticket and derive the fare per person.

    Parameters:
        df -- DataFrame to process; it is indexed by the 'Ticket' and 'Fare'
              columns, so both must be present.

    Returns:
        A new DataFrame holding every column of ``df`` plus two extra columns:
            num_of_tickets  -- number of rows of ``df`` carrying the same 'Ticket'
            fare_per_ticket -- 'Fare' divided by ``num_of_tickets``
    '''
    # One row per distinct ticket value; the cell is how many rows carry it.
    ticket_counts = (
        df.groupby('Ticket')['Ticket']
        .count()
        .rename('num_of_tickets')
        .to_frame()
    )
    # Left join on the ticket value so that every input row is kept.
    enriched = df.merge(ticket_counts, how='left', left_on='Ticket', right_index=True)
    enriched['fare_per_ticket'] = enriched['Fare'] / enriched['num_of_tickets']
    return enriched

In [4]:
# Build the data-preprocessing pipeline
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator,TransformerMixin

class DataFrameSelector(BaseEstimator,TransformerMixin):
    '''Transformer step that picks the configured columns out of a DataFrame.'''

    def __init__(self,attrs_name_list):
        # Stored under the same name as the constructor argument.
        self.attrs_name_list=attrs_name_list

    def fit(self,X,y=None):
        '''Nothing is learned from the data; return this instance unchanged.'''
        return self

    def transform(self,X):
        '''Index ``X`` with ``attrs_name_list`` and return the result's ``.values``.'''
        chosen_columns=X[self.attrs_name_list]
        return chosen_columns.values

def data_preparation(cat_attributes,interval_num_attributes,ratio_num_attributes):
    '''
    Build the preprocessing transformer that is applied to a DataFrame.

    Arguments:
        cat_attributes          -- nominal (categorical) columns: imputed with the
                                   most frequent value, then one-hot encoded
        interval_num_attributes -- numeric columns that are NOT standardised:
                                   median-imputed only
        ratio_num_attributes    -- numeric columns that ARE standardised:
                                   median-imputed, then passed through StandardScaler

        A group that is empty or None is skipped; at least one group must be
        non-empty.

    Return:
        full_pipeline -- an unfitted FeatureUnion whose sub-pipelines are added in
                         the order categorical, interval, ratio

    Raises:
        ValueError -- if all three column groups are empty, because a
                      FeatureUnion without transformers cannot be fitted.
    '''
    transformer_list=[]

    if cat_attributes:
        cat_pipeline=Pipeline([
            ('cat_dfs',DataFrameSelector(cat_attributes)),
            ('impute',SimpleImputer(strategy='most_frequent')),
            # handle_unknown='ignore': a category that was absent from the data
            # used for fit() is encoded as all zeros at transform() time instead
            # of raising, so held-out data cannot crash the pipeline.
            ('onehotencoder',OneHotEncoder(handle_unknown='ignore')),
        ])
        transformer_list.append(('cat_pipeline',cat_pipeline))

    if interval_num_attributes:
        interval_num_pipeline=Pipeline([
            ('dfs',DataFrameSelector(interval_num_attributes)),
            ('impute',SimpleImputer(strategy='median')),
        ])
        transformer_list.append(('interval_num_pipeline',interval_num_pipeline))

    if ratio_num_attributes:
        ratio_num_pipeline=Pipeline([
            ('dfs',DataFrameSelector(ratio_num_attributes)),
            ('impute',SimpleImputer(strategy='median')),
            ('std_scaler',StandardScaler()),
        ])
        transformer_list.append(('ratio_num_pipeline',ratio_num_pipeline))

    if not transformer_list:
        # Fail here with a clear message rather than later inside fit().
        raise ValueError('data_preparation needs at least one non-empty list of column names')

    full_pipeline=FeatureUnion(transformer_list=transformer_list)
    return full_pipeline

In [5]:
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,n_jobs=None, train_sizes=np.linspace(0.1, 1.0, 10)):
    '''
    Plot training and cross-validation score curves for ``estimator``.

    Arguments:
        estimator   -- estimator handed to sklearn's ``learning_curve``
        title       -- figure title
        X, y        -- data and targets handed to ``learning_curve``
        ylim        -- optional (low, high) pair unpacked into ``plt.ylim``
        cv, n_jobs, train_sizes -- forwarded unchanged to ``learning_curve``

    Return:
        The ``matplotlib.pyplot`` module, with the new figure as current figure.
    '''
    from sklearn.model_selection import learning_curve

    # Set up the figure before the (potentially slow) scoring call.
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    # Each score array is averaged along axis 1; per the original note its shape
    # is (n_ticks, n_cv_folds), i.e. one row per training-set size.
    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # (mean, std, line format, colour, legend label) for each curve.
    curves = [
        (np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1),
         'o-', "r", "Training score"),
        (np.mean(cv_scores, axis=1), np.std(cv_scores, axis=1),
         's-', "g", "Cross-validation score"),
    ]
    plt.grid()

    # Shaded band of one standard deviation around each mean curve ...
    for mean, std, _, colour, _ in curves:
        plt.fill_between(sizes, mean - std, mean + std, alpha=0.1, color=colour)
    # ... then the mean curves themselves, training first so the legend order holds.
    for mean, _, line_format, colour, label in curves:
        plt.plot(sizes, mean, line_format, color=colour, label=label)

    plt.legend(loc="best")
    return plt

In [6]:
# Compute prediction accuracy
def compute_acc(y,y_pred):
    '''
    Accuracy of thresholded predictions against the true labels.

    Arguments:
        y      -- true labels (array-like)
        y_pred -- predicted scores; values strictly greater than 0.5 count as
                  class 1, everything else as class 0

    Return:
        Number of positions where label and predicted class agree, divided by
        the number of labels.
    '''
    y_true=np.asarray(y)
    y_pred_class=np.where(np.asarray(y_pred)>0.5,1,0)
    # Labels of shape (n,) compared with predictions of shape (n,1) (or the other
    # way round) would broadcast to an (n,n) matrix and give a meaningless score,
    # so flatten both whenever they hold the same number of elements but differ
    # in shape.
    if y_true.shape!=y_pred_class.shape and y_true.size==y_pred_class.size:
        y_true=y_true.ravel()
        y_pred_class=y_pred_class.ravel()
    pred_accuracy=(y_true==y_pred_class).sum()/len(y_true)
    return pred_accuracy

In [7]:
# Data preprocessing (numeric columns are scaled inside the pipeline)
from sklearn.model_selection import train_test_split

# Ticket-sharing features are computed on the whole labelled table, i.e. before
# the split below.
# NOTE(review): the per-ticket counts for training rows therefore also include
# dev rows -- confirm this is intended.
train_data_count=deal_ticket_fare(train_data)

# 70/30 train/dev split with a fixed seed.
train_df_data,dev_df_data=train_test_split(
    train_data_count,
    train_size=0.7,
    test_size=0.3,
    random_state=42,
)
train_data_index=train_df_data.index.tolist()
dev_data_index=dev_df_data.index.tolist()

# Column groups handed to data_preparation.
cat_attrs=['Sex','Embarked']
interval_num_attrs=['Pclass','SibSp','Parch','num_of_tickets']
ratio_num_attrs=['Age','Fare','fare_per_ticket']
full_pipeline_std=data_preparation(cat_attrs,interval_num_attrs,ratio_num_attrs)

# The pipeline is fitted on the training split only.
train_data_X=full_pipeline_std.fit_transform(train_df_data).toarray()
train_data_y=train_df_data['Survived'].values.ravel()
train_data_X[0]

array([ 0.        ,  1.        ,  0.        ,  0.        ,  1.        ,
        1.        ,  0.        ,  2.        ,  1.        , -1.91971935,
        0.98099823,  3.53619915])

In [8]:
# Apply the already-fitted pipeline to the dev split: transform() only, no refit.
dev_data_X=(
    full_pipeline_std
    .transform(dev_df_data)
    .toarray()
)
dev_data_y=dev_df_data['Survived'].values.ravel()
dev_data_X[0]

array([ 0.        ,  1.        ,  1.        ,  0.        ,  0.        ,
        3.        ,  1.        ,  1.        ,  2.        , -0.0772525 ,
       -0.32547755, -0.49449441])