In [1]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
class BinaryTransformator( BaseEstimator, TransformerMixin ):
    #Class Constructor 
    def __init__( self ):
        self._binary_mapping = {'bin_3': {'F':0, 'T':1}, 'bin_4': {'N':0, 'Y':1}}
    
    #Return self nothing else to do here    
    def fit( self, X, y = None ):
        self._bin_columns = [ column for column in X.columns if column.find('bin_') != -1 ]
        return self
    
    #Method that describes what we need this transformer to do
    def transform( self, X, y = None ):
        self._df = pd.DataFrame(index=X.index)
        for column in self._bin_columns:
            if X[column].dtype == np.object:
                X[column] = X[column].replace(self._binary_mapping[column])
#            X[column] = X[column].fillna(-1)
            self._df[column+'_0'] = (X[column] == 0).astype(np.int)
            self._df[column+'_1'] = (X[column] == 1).astype(np.int)
#            self._df[column+'_0'] = X[column].apply(lambda x: -1 if x == -1 else int(x==0))
#            self._df[column+'_1'] = X[column].apply(lambda x: -1 if x == -1 else int(x==1))

        #self._df = self._df.reset_index(drop=True)
        print('Binary transform shape is {}'.format(self._df.values.shape))
        return self._df.values

In [3]:
from sklearn.feature_extraction import FeatureHasher
from sklearn.preprocessing import LabelEncoder

class NominalTransformator( BaseEstimator, TransformerMixin ):
    #Class Constructor 
    def __init__( self ):
        #self._le = LabelEncoder()
        return
    
    #Return self nothing else to do here    
    def fit( self, X, y = None ):
        #self._nom_columns = [ column for column in X.columns if column.find('nom_') != -1 ]
        self._n_features = np.ceil(np.log(len(X['nom_9'].value_counts().index))).astype(np.int)
        #self._n_features = 15
        #self._le.fit(X[self._nom_columns])
        #for i in range(5,10):
        #    self._nom_hash_features['nom_{}'.format(i)] = np.log(len(X['nom_{}'.format(i)].value_counts().index))
        #    print('nom_{} - {}'.format(i, self._nom_hash_features['nom_{}'.format(i)]))
        return self
    
    #Method that describes what we need this transformer to do
    def transform( self, X, y = None ):
        hex_symbols = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f']

        self._df = pd.DataFrame(index=X.index)
        # from nom_0 to nom_4
        for i in range(5):
            categories = X['nom_{}'.format(i)].value_counts().index
            for category in categories:
                self._df['nom_{}_{}'.format(i, category)] = (X['nom_{}'.format(i)] == category).astype(np.int)
        
# spresd characters
#        X['noms59'] = X[['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9']].fillna('').agg(''.join, axis=1)
#        for s in hex_symbols:
#            self._df['nom_hex_'+s] = X['noms59'].apply(lambda x : x.count(s))
        result = self._df.values
        # feature hash
        fh = FeatureHasher(n_features=self._n_features, input_type='string')
        for i in range(5,10):
            hashed_features = fh.fit_transform(X['nom_{}'.format(i)].fillna('0000000'))
            result = np.hstack((result,hashed_features.toarray()))
            #self._df['nom_{}'.format(i)] = X['nom_{}'.format(i)].fillna('0').apply(lambda x : int(x,16))
        
        #self._df = self._df.reset_index(drop=True)
        print('Nominal transform shape is {}'.format(result.shape))
        return result

In [4]:
class OrdinaryTransformator( BaseEstimator, TransformerMixin ):
    #Class Constructor 
    def __init__( self ):
        self._ordinary_mapping = {
                                      'ord_1': {'Novice':1, 'Expert':2, 'Contributor':3, 'Master':4, 'Grandmaster':5}, 
                                      'ord_2': {'Freezing':1, 'Cold':2, 'Warm':3, 'Hot':4, 'Boiling Hot':5, 'Lava Hot':6}
                                 }
    
    #Return self nothing else to do here    
    def fit( self, X, y = None ):
        return self
    
    #Method that describes what we need this transformer to do
    def transform( self, X, y = None ):
        self._df = pd.DataFrame(index=X.index)
        self._df['missing_data'] = X.isna().sum(axis=1)
        self._df['ord_0'] = X['ord_0'].fillna(0)
        self._df['ord_1'] = X['ord_1'].replace({'Novice':1, 'Expert':2, 'Contributor':3, 'Master':4, 'Grandmaster':5}).fillna(0)
        self._df['ord_2'] = X['ord_2'].replace({'Freezing':1, 'Cold':2, 'Warm':3, 'Hot':4, 'Boiling Hot':5, 'Lava Hot':6}).fillna(0)

        self._df['ord_3'] = X['ord_3'].fillna('0').apply(lambda x: ord(x))
        self._df['ord_4'] = X['ord_4'].fillna('0').apply(lambda x: ord(x))
        self._df['ord_5'] = X['ord_5'].fillna('0').apply(lambda x: 0 if x=='0' else ord(x[0])*10+ord(x[1]))
#        self._df['ord_5_1'] = X['ord_5'].fillna('00').apply(lambda x: 0 if x=='00' else ord(x[0]))
#        self._df['ord_5_2'] = X['ord_5'].fillna('00').apply(lambda x: 0 if x=='00' else ord(x[1]))

        '''
        side coding
        self._df['ord_3'] = X['ord_3'].fillna('0').apply(lambda x: -1 if x=='0' else ord(x))
        self._df['ord_3'] = self._df['ord_3'].apply(lambda x: -1 if x == 0 else x )
        self._df['ord_4'] = X['ord_4'].fillna('0').apply(lambda x:  -1 if x=='0' else ord(x))
        self._df['ord_4'] = self._df['ord_4'].apply(lambda x: -1 if x == 0 else x )
        self._df['ord_5_1'] = X['ord_5'].fillna('00').apply(lambda x: -1 if x=='00' else ord(x[0]))
        self._df['ord_5_2'] = X['ord_5'].fillna('00').apply(lambda x: -1 if x=='00' else ord(x[1]))
        '''

        #self._df = self._df.reset_index(drop=True)
        print('Ordinary transform shape is {}'.format(self._df.values.shape))
        return self._df.values

In [5]:
class CycleTransformator( BaseEstimator, TransformerMixin ):
    #Class Constructor 
    def __init__( self ):
        self._cycle_columns = ['day', 'month']
        self._cycle_stats = {}
    
    #Return self nothing else to do here    
    def fit( self, X, y = None ):
        
        for column in self._cycle_columns:
            self._cycle_stats[column] = {'max': X[column].max(), 'min': X[column].min()}
        return self
    
    #Method that describes what we need this transformer to do
    def transform( self, X, y = None ):
        self._df = pd.DataFrame(index=X.index)
        for column in self._cycle_columns:
            self._df[column+'_sin'] = np.sin(2*np.pi/(self._cycle_stats[column]['max']+1)*X[column].fillna(self._cycle_stats[column]['min']-1))
            self._df[column+'_cos'] = np.cos(2*np.pi/(self._cycle_stats[column]['max']+1)*X[column].fillna(self._cycle_stats[column]['min']-1))

        #self._df = self._df.reset_index(drop=True)
        print('Cycle transform shape is {}'.format(self._df.values.shape))
        return self._df.values

In [None]:
%%time
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Std Deviation:", scores.std())

pd_train = pd.read_csv('train.csv', index_col='id')
pd_test = pd.read_csv('test.csv', index_col='id')
X_train = pd_train.drop('target', axis=1)
X_test = pd_test
y_train = pd_train['target'].values
print(X_train.shape)
print(y_train.shape)

data_pipeline = FeatureUnion( transformer_list = [ 
                                                   ( 'binary', BinaryTransformator() ), 
                                                   ( 'ordinary', OrdinaryTransformator() ), 
                                                   ( 'nominal', NominalTransformator() ), 
                                                   ( 'cycle', CycleTransformator() ), 
                                                 ]
                            )

#model_pipeline = FeatureUnion( steps
#                         )
cat2_pipeline = Pipeline( 
                          steps = [ ( 'data', data_pipeline ),                          
                                    ( 'std_scaler', StandardScaler() ), 
                                    ( 'bagging', BaggingClassifier(LinearSVC(), n_estimators=200, max_samples=0.3, max_features=0.3, n_jobs=2,random_state=579, verbose=2)  )
#                                    ( 'forest', RandomForestClassifier(n_estimators=500, max_samples=0.2 ,max_features=0.5, n_jobs=2,random_state=579, verbose=1)  )
                                  ] 
                        )
#cat2_pipeline.fit( X_train, y_train )
scores = cross_val_score(cat2_pipeline, X_train, y_train, scoring="roc_auc", cv=5, verbose=1)

#X_test.to_csv('prediction_rf03.csv', columns=['target'])

(600000, 23)
(600000,)


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


Binary transform shape is (480000, 10)
Ordinary transform shape is (480000, 7)
Nominal transform shape is (480000, 65)
Cycle transform shape is (480000, 4)


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


In [None]:
display_scores(scores)

In [None]:
cat2_pipeline.fit( X_train, y_train )
X_test['target'] = cat2_pipeline.predict_proba( X_test )[:,1]
X_test.to_csv('prediction_bag03.csv', columns=['target'])#, float_format='%.1f')