In [3]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

In [17]:
class Cat2Transformator( BaseEstimator, TransformerMixin ):
    """Encode the binary and cyclic columns of a frame.

    ``fit`` groups the columns of ``X`` by name (substrings 'bin_', 'ord_',
    'nom_'), learns a value -> frequency-rank mapping for every binary and
    ordinal column, and records the min/max of 'day' and 'month'.
    ``transform`` emits two indicator columns per binary column and a
    sin/cos pair per cyclic column. The ordinal mappings are learned but not
    applied by ``transform``.
    """
    #Class Constructor 
    def __init__( self ):
        self.column_mapping_dicts = {}

    @staticmethod
    def _rank_mapping( column ):
        """Map each distinct non-null value to its rank in ``value_counts()`` order (most frequent -> 0)."""
        return { value: rank for rank, value in enumerate(column.value_counts().index) }

    def fit( self, X, y = None ):
        """Learn column groups, rank mappings and cyclic min/max from ``X``.

        Parameters: ``X`` is a DataFrame holding the 'bin_*', 'ord_*', 'nom_*',
        'day' and 'month' columns; ``y`` is ignored.
        Returns: ``self``.
        """
        #columns subsets
        self.bin_columns = [ column for column in X.columns if 'bin_' in column ]
        self.ord_columns = [ column for column in X.columns if 'ord_' in column ]
        self.nom_columns = [ column for column in X.columns if 'nom_' in column ]
        self.dum_columns = self.nom_columns[:5]
        self.has_columns = self.nom_columns[5:]
        self.cyc_columns = ['day', 'month']

        self.column_mapping_dicts = { k: self._rank_mapping(X[k]) for k in self.bin_columns + self.ord_columns }
        for column in self.cyc_columns:
            self.column_mapping_dicts[column] = {'min': X[column].min(), 'max': X[column].max()}
        return self 

    def transform( self, X, y = None ):
        """Return a DataFrame with the encoded binary and cyclic features of ``X``.

        The output is built from scratch on every call and indexed like ``X``,
        so the transformer can be applied to frames other than the one it was
        fitted on. The result is also kept in ``self.df_transformed``.
        """
        encoded = pd.DataFrame(index=X.index)

        for column in self.bin_columns:
            values = X[column]
            if values.dtype == object:
                # Values without a learned rank (missing or unseen) become NaN
                # and therefore match neither indicator below.
                values = values.map(self.column_mapping_dicts[column])
            encoded[column+'_0'] = (values == 0).astype(int)
            encoded[column+'_1'] = (values == 1).astype(int)

        for column in self.cyc_columns:
            stats = self.column_mapping_dicts[column]
            # Missing values are filled with min-1 before being placed on the circle.
            angle = 2*np.pi/(stats['max']+1)*X[column].fillna(stats['min']-1)
            encoded[column+'_sin'] = np.sin(angle)
            encoded[column+'_cos'] = np.cos(angle)

        # NOTE(review): reset_index() without drop=True turns the row index into
        # the first output column; kept for compatibility - confirm it is wanted.
        encoded = encoded.reset_index()
        self.df_transformed = encoded
        print('Transform shape is {}'.format(encoded.values.shape))
        return encoded

In [40]:
class BinTransformator( BaseEstimator, TransformerMixin ):
    """Expand binary columns into two 0/1 indicator columns each.

    Parameters
    ----------
    binary_columns : list of str, optional
        Columns to encode. When omitted, every column of the fitted frame
        whose name contains 'bin_' is used.
    """
    #Class Constructor 
    def __init__( self, binary_columns = None ):
        # Stored under the parameter's own name so get_params()/clone() work.
        self.binary_columns = binary_columns
        self._binary_mapping = {}
    
    def fit( self, X, y = None ):
        """Resolve the binary columns and learn value -> frequency-rank mappings.

        The most frequent non-null value of a column maps to 0, the next to 1.
        Returns ``self``.
        """
        if self.binary_columns is None:
            self._binary_columns = [ column for column in X.columns if 'bin_' in column ]
        else:
            self._binary_columns = list(self.binary_columns)

        print('Binary columns are {}'.format(self._binary_columns))
        self._binary_mapping = {
            k: { value: rank for rank, value in enumerate(X[k].value_counts().index) }
            for k in self._binary_columns
        }
        return self 
    
    def transform( self, X, y = None ):
        """Return a 2-D array with ``<col>_0`` / ``<col>_1`` indicators for every binary column.

        The output frame is created per call with the index of ``X``; ``X``
        itself is left unmodified. The frame is also kept in ``self._df``.
        """
        encoded = pd.DataFrame(index=X.index)
        for column in self._binary_columns:
            values = X[column]
            if values.dtype == object:
                # Missing or unseen values become NaN and match neither indicator.
                values = values.map(self._binary_mapping[column])
            encoded[column+'_0'] = (values == 0).astype(int)
            encoded[column+'_1'] = (values == 1).astype(int)

        self._df = encoded
        print('Transform shape is {}'.format(encoded.values.shape))
        return encoded.values

In [10]:
# Load the training set and preview the columns nom_5 .. nom_9.
pd_train = pd.read_csv('train.csv')
pd_train[['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9']].head()

Unnamed: 0,nom_5,nom_6,nom_7,nom_8,nom_9
0,de4c57ee2,a64bc7ddf,598080a91,0256c7a4b,02e7c8990
1,2bb3c3e5c,3a3a936e8,1dddb8473,52ead350c,f37df64af
2,b574c9841,708248125,5ddc9a726,745b909d1,
3,673bdf1f6,23edb8da3,3a33ef960,bdaa56dd1,f9d456e57
4,777d1ac2c,3a7975e46,bc9cc2a94,,c5361037c


In [4]:
# Summarise dtypes and non-null counts of every column in the training frame.
pd_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600000 entries, 0 to 599999
Data columns (total 25 columns):
id        600000 non-null int64
bin_0     582106 non-null float64
bin_1     581997 non-null float64
bin_2     582070 non-null float64
bin_3     581986 non-null object
bin_4     581953 non-null object
nom_0     581748 non-null object
nom_1     581844 non-null object
nom_2     581965 non-null object
nom_3     581879 non-null object
nom_4     581965 non-null object
nom_5     582222 non-null object
nom_6     581869 non-null object
nom_7     581997 non-null object
nom_8     582245 non-null object
nom_9     581927 non-null object
ord_0     581712 non-null float64
ord_1     581959 non-null object
ord_2     581925 non-null object
ord_3     582084 non-null object
ord_4     582070 non-null object
ord_5     582287 non-null object
day       582048 non-null float64
month     582012 non-null float64
target    600000 non-null int64
dtypes: float64(6), int64(2), object(17)
memory usage: 114.4

In [180]:
# Fit Cat2Transformator on the full training frame and display the encoded result.
tr2 = Cat2Transformator()
df_1 = tr2.fit_transform(pd_train)
df_1#.describe()

array([[ 1.00000000e+00,  0.00000000e+00,  1.00000000e+00, ...,
        -1.83697020e-16,  9.92708874e-01,  1.20536680e-01],
       [ 0.00000000e+00,  1.00000000e+00,  0.00000000e+00, ...,
         7.07106781e-01, -2.39315664e-01, -9.70941817e-01],
       [ 1.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
        -7.07106781e-01, -9.35016243e-01, -3.54604887e-01],
       ...,
       [ 1.00000000e+00,  0.00000000e+00,  1.00000000e+00, ...,
         7.07106781e-01,  6.63122658e-01, -7.48510748e-01],
       [ 0.00000000e+00,  1.00000000e+00,  0.00000000e+00, ...,
         7.07106781e-01,  6.63122658e-01, -7.48510748e-01],
       [ 1.00000000e+00,  0.00000000e+00,  1.00000000e+00, ...,
        -7.07106781e-01, -6.63122658e-01, -7.48510748e-01]])

In [182]:
# Check whether the encoded output df_1 contains any NaN.
#df_1.columns[df_1.isna().any()].tolist()
np.any(np.isnan(df_1))
#df_1.info()

False

In [169]:
# Class balance of the target; dropna=False would list missing values as their own row.
pd_train['target'].value_counts(dropna=False)

0    487677
1    112323
Name: target, dtype: int64

In [160]:
# Tabulate sin/cos for the eight positions of a period-8 cycle.
for step in range(8):
    angle = 2*np.pi/8*step
    print('{} \t {:0.3f} \t {:0.3f}'.format(step, np.sin(angle), np.cos(angle)))

0 	 0.000 	 1.000
1 	 0.707 	 0.707
2 	 1.000 	 0.000
3 	 0.707 	 -0.707
4 	 0.000 	 -1.000
5 	 -0.707 	 -0.707
6 	 -1.000 	 -0.000
7 	 -0.707 	 0.707


In [30]:
# Prototype of the ordinal encoding on the first 50 rows: single characters are
# converted to their code point, missing values to 0 via chr(0).
# .copy() makes df_ord35 an independent frame, so the column assignments below
# do not write into a slice of pd_train (avoids SettingWithCopyWarning).
df_ord35 = pd_train[['ord_3', 'ord_4', 'ord_5']].head(50).copy()
df_ord35['ord_3_int'] = df_ord35['ord_3'].fillna(chr(0)).apply(ord)
df_ord35['ord_5_1'] = df_ord35['ord_5'].fillna(chr(0)+chr(0)).apply(lambda x: ord(x[0]))
df_ord35['ord_5_2'] = df_ord35['ord_5'].fillna(chr(0)+chr(0)).apply(lambda x: ord(x[1]))
df_ord35

Unnamed: 0,ord_3,ord_4,ord_5,ord_3_int,ord_5_1,ord_5_2
0,c,U,Pw,99,80,119
1,e,X,pE,101,112,69
2,n,P,eN,110,101,78
3,a,C,,97,0,0
4,h,C,OZ,104,79,90
5,b,Q,wa,98,119,97
6,c,R,rg,99,114,103
7,b,Y,PS,98,80,83
8,c,N,mX,99,109,88
9,n,I,OZ,110,79,90


In [15]:
# Prototype: concatenate nom_5 .. nom_9 into one string per row (missing -> '').
# .copy() makes df_nom59 an independent frame, so adding columns to it does not
# write into a slice of pd_train (avoids SettingWithCopyWarning).
df_nom59 = pd_train[['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9']].head().copy()
df_nom59['all_noms'] = df_nom59[['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9']].fillna('').agg(''.join, axis=1)
df_nom59

Unnamed: 0,nom_5,nom_6,nom_7,nom_8,nom_9,all_noms
0,de4c57ee2,a64bc7ddf,598080a91,0256c7a4b,02e7c8990,de4c57ee2a64bc7ddf598080a910256c7a4b02e7c8990
1,2bb3c3e5c,3a3a936e8,1dddb8473,52ead350c,f37df64af,2bb3c3e5c3a3a936e81dddb847352ead350cf37df64af
2,b574c9841,708248125,5ddc9a726,745b909d1,,b574c98417082481255ddc9a726745b909d1
3,673bdf1f6,23edb8da3,3a33ef960,bdaa56dd1,f9d456e57,673bdf1f623edb8da33a33ef960bdaa56dd1f9d456e57
4,777d1ac2c,3a7975e46,bc9cc2a94,,c5361037c,777d1ac2c3a7975e46bc9cc2a94c5361037c


In [19]:
# Count occurrences of the character 'x' in one concatenated nominal string.
'de4c57ee2a64bc7ddf598080a910256c7a4b02e7c8990'.count('x')

0

In [20]:
# One count column per hexadecimal digit found in the concatenated nominal string.
hex_symbols = list('0123456789abcdef')

for symbol in hex_symbols:
    df_nom59['nom_hex_'+symbol] = df_nom59['all_noms'].str.count(symbol)

df_nom59

Unnamed: 0,nom_5,nom_6,nom_7,nom_8,nom_9,all_noms,nom_hex_0,nom_hex_1,nom_hex_2,nom_hex_3,...,nom_hex_6,nom_hex_7,nom_hex_8,nom_hex_9,nom_hex_a,nom_hex_b,nom_hex_c,nom_hex_d,nom_hex_e,nom_hex_f
0,de4c57ee2,a64bc7ddf,598080a91,0256c7a4b,02e7c8990,de4c57ee2a64bc7ddf598080a910256c7a4b02e7c8990,5,1,3,0,...,2,4,3,4,3,2,4,3,4,1
1,2bb3c3e5c,3a3a936e8,1dddb8473,52ead350c,f37df64af,2bb3c3e5c3a3a936e81dddb847352ead350cf37df64af,1,1,2,8,...,2,2,2,1,4,3,3,5,3,3
2,b574c9841,708248125,5ddc9a726,745b909d1,,b574c98417082481255ddc9a726745b909d1,2,3,3,0,...,1,4,3,4,1,2,2,3,0,0
3,673bdf1f6,23edb8da3,3a33ef960,bdaa56dd1,f9d456e57,673bdf1f623edb8da33a33ef960bdaa56dd1f9d456e57,1,2,1,6,...,5,2,1,2,4,3,0,7,3,4
4,777d1ac2c,3a7975e46,bc9cc2a94,,c5361037c,777d1ac2c3a7975e46bc9cc2a94c5361037c,1,2,2,3,...,2,6,0,3,3,1,7,1,1,0


In [41]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

# Names of the binary feature columns (every column whose name contains 'bin_').
bin_columns = [ column for column in pd_train.columns if 'bin_' in column ]
# The argument-less BinTransformator() constructor has relied on a notebook-level
# `binary_columns`; define it here so the cell does not depend on a variable
# left over from an earlier kernel session.
binary_columns = bin_columns

X_train = pd_train.drop('target', axis=1)
y_train = pd_train['target'].values
print(X_train.shape)
print(y_train.shape)
cat2_pipeline = Pipeline( 
                          steps = [ ( 'cat2_transform', BinTransformator() ),                          
                                    ( 'std_scaler', StandardScaler() ), 
                                    ( 'forest', RandomForestClassifier() ) 
                                  ] 
                        )

# NOTE(review): the model is a classifier but is scored with (root) mean squared
# error on the predicted labels - confirm this is the intended metric.
scores = cross_val_score(cat2_pipeline, X_train, y_train, scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)

def display_scores(scores):
    """Print the individual scores followed by their mean and standard deviation."""
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Std Deviation:", scores.std())
display_scores(tree_rmse_scores)



(600000, 24)
(600000,)
Binary columns are ['bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Transform columns is Index(['bin_0_0', 'bin_0_1', 'bin_1_0', 'bin_1_1', 'bin_2_0', 'bin_2_1',
       'bin_3_0', 'bin_3_1', 'bin_4_0', 'bin_4_1'],
      dtype='object')
Transform shape is (540000, 10)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Transform columns is Index(['bin_0_0', 'bin_0_1', 'bin_1_0', 'bin_1_1', 'bin_2_0', 'bin_2_1',
       'bin_3_0', 'bin_3_1', 'bin_4_0', 'bin_4_1'],
      dtype='object')
Transform shape is (540000, 10)


ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [None]:
# Disabled experiment: the grid search below is wrapped in a string literal, so
# none of it is executed when this cell runs.
'''
# Parameters of pipelines can be set using ‘__’ separated parameter names:
param_grid = {
    'forest__bootstrap': [True, False],
    'forest__n_estimators': [10, 20, 50, 100],
}

grid = GridSearchCV(cat2_pipeline, cv=5, param_grid=param_grid)
grid.fit(X_train,y_train)

# summarize results
print("Best: %f using %s" % (grid.best_score_, 
    grid.best_params_))
means = grid.cv_results_['mean_test_score']
stds = grid.cv_results_['std_test_score']
params = grid.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
'''

In [34]:
class BinaryTransformator( BaseEstimator, TransformerMixin ):
    """Expand every 'bin_*' column into two 0/1 indicator columns.

    Text-valued columns are first translated with the fixed mapping set in the
    constructor ('F'/'T' for bin_3, 'N'/'Y' for bin_4). Missing values match
    neither indicator, so they are encoded as (0, 0).
    """
    #Class Constructor 
    def __init__( self ):
        self._binary_mapping = {'bin_3': {'F':0, 'T':1}, 'bin_4': {'N':0, 'Y':1}}
    
    def fit( self, X, y = None ):
        """Record the columns of ``X`` whose name contains 'bin_'. Returns ``self``."""
        self._bin_columns = [ column for column in X.columns if 'bin_' in column ]
        return self
    
    def transform( self, X, y = None ):
        """Return a 2-D array with ``<col>_0`` / ``<col>_1`` indicators per binary column.

        ``X`` is not modified; the encoded frame is also kept in ``self._df``.
        Raises ``KeyError`` for an object-dtype binary column that has no entry
        in the fixed mapping.
        """
        encoded = pd.DataFrame(index=X.index)
        for column in self._bin_columns:
            values = X[column]
            if values.dtype == object:
                # Work on a mapped copy instead of overwriting the caller's column.
                values = values.map(self._binary_mapping[column])
            encoded[column+'_0'] = (values == 0).astype(int)
            encoded[column+'_1'] = (values == 1).astype(int)

        self._df = encoded
        print('Binary transform shape is {}'.format(encoded.values.shape))
        return encoded.values

In [35]:
class NominalTransformator( BaseEstimator, TransformerMixin ):
    """Encode the nominal columns nom_0 .. nom_9.

    nom_0 .. nom_4 are one-hot encoded using the categories seen in ``fit``,
    so every transformed frame gets the same columns in the same order.
    nom_5 .. nom_9 are concatenated per row and replaced by sixteen counts,
    one per hexadecimal digit.
    """
    #Class Constructor 
    def __init__( self ):
        return

    def _learn_categories( self, X ):
        """Return {column: categories in value_counts() order} for nom_0 .. nom_4."""
        return {
            'nom_{}'.format(i): list(X['nom_{}'.format(i)].value_counts().index)
            for i in range(5)
        }
    
    def fit( self, X, y = None ):
        """Remember the categories of nom_0 .. nom_4 found in ``X``. Returns ``self``."""
        self._categories = self._learn_categories(X)
        return self
    
    def transform( self, X, y = None ):
        """Return a 2-D array of one-hot indicators followed by the hex-digit counts.

        ``X`` is not modified; the encoded frame is also kept in ``self._df``.
        If ``fit`` has not been called, the categories are taken from ``X``.
        """
        hex_symbols = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f']

        categories = getattr(self, '_categories', None)
        if categories is None:
            categories = self._learn_categories(X)

        encoded = pd.DataFrame(index=X.index)
        # from nom_0 to nom_4
        for i in range(5):
            column = 'nom_{}'.format(i)
            for category in categories[column]:
                encoded['nom_{}_{}'.format(i, category)] = (X[column] == category).astype(int)
        
        # Kept in a local Series rather than written back to X as a new column.
        noms59 = X[['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9']].fillna('').agg(''.join, axis=1)
        for s in hex_symbols:
            encoded['nom_hex_'+s] = noms59.str.count(s)

        self._df = encoded
        print('Nominal transform shape is {}'.format(encoded.values.shape))
        return encoded.values

In [36]:
class OrdinaryTransformator( BaseEstimator, TransformerMixin ):
    """Encode the ordinal columns ord_0 .. ord_5 as numbers.

    ord_0 is passed through, ord_1/ord_2 use the label -> rank mapping set in
    the constructor, ord_3/ord_4 become the code point of their single
    character, and ord_5 is split into the code points of its two characters.
    Missing values are encoded as 0 everywhere.
    """
    #Class Constructor 
    def __init__( self ):
        self._ordinary_mapping = {
                                      'ord_1': {'Novice':1, 'Expert':2, 'Contributor':3, 'Master':4, 'Grandmaster':5}, 
                                      'ord_2': {'Freezing':1, 'Cold':2, 'Warm':3, 'Hot':4, 'Boiling Hot':5, 'Lava Hot':6}
                                 }
    
    #Return self nothing else to do here    
    def fit( self, X, y = None ):
        """Nothing is learned from the data. Returns ``self``."""
        return self
    
    def transform( self, X, y = None ):
        """Return a 2-D array with columns ord_0, ord_1, ord_2, ord_3, ord_4, ord_5_1, ord_5_2.

        The encoded frame is also kept in ``self._df``.
        """
        encoded = pd.DataFrame(index=X.index)
        encoded['ord_0'] = X['ord_0'].fillna(0)
        # Single source of truth: the mapping defined in __init__. Labels that
        # are missing or not in the mapping become NaN and are then set to 0.
        for column, mapping in self._ordinary_mapping.items():
            encoded[column] = X[column].map(mapping).fillna(0)
        for column in ('ord_3', 'ord_4'):
            encoded[column] = X[column].fillna(chr(0)).apply(ord)
        ord_5 = X['ord_5'].fillna(chr(0)+chr(0))
        encoded['ord_5_1'] = ord_5.apply(lambda x: ord(x[0]))
        encoded['ord_5_2'] = ord_5.apply(lambda x: ord(x[1]))

        self._df = encoded
        print('Ordinary transform shape is {}'.format(encoded.values.shape))
        return encoded.values

In [47]:
class CycleTransformator( BaseEstimator, TransformerMixin ):
    """Project the 'day' and 'month' columns onto a circle as sin/cos pairs."""
    #Class Constructor 
    def __init__( self ):
        self._cycle_columns = ['day', 'month']
        self._cycle_stats = {}
    
    def fit( self, X, y = None ):
        """Record the max and min of every cyclic column of ``X``. Returns ``self``."""
        for name in self._cycle_columns:
            series = X[name]
            self._cycle_stats[name] = {'max': series.max(), 'min': series.min()}
        return self
    
    def transform( self, X, y = None ):
        """Return a 2-D array with ``<col>_sin`` and ``<col>_cos`` for every cyclic column.

        Missing values are filled with (fitted min - 1) before the angle is
        computed; the angular step is 2*pi / (fitted max + 1).
        """
        self._df = pd.DataFrame(index=X.index)
        for name in self._cycle_columns:
            stats = self._cycle_stats[name]
            step = 2*np.pi/(stats['max']+1)
            filled = X[name].fillna(stats['min']-1)
            self._df[name+'_sin'] = np.sin(step*filled)
            self._df[name+'_cos'] = np.cos(step*filled)

        print('Cycle transform shape is {}'.format(self._df.values.shape))
        return self._df.values

In [52]:
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

# Reload both data sets with 'id' as the index so it is not treated as a feature.
pd_train = pd.read_csv('train.csv', index_col='id')
pd_test = pd.read_csv('test.csv', index_col='id')
X_train = pd_train.drop('target', axis=1)
X_test = pd_test
y_train = pd_train['target'].values
print(X_train.shape)
print(y_train.shape)

# Each transformer encodes one family of columns; FeatureUnion concatenates
# their outputs side by side.
data_pipeline = FeatureUnion( transformer_list = [ 
                                                   ( 'binary', BinaryTransformator() ), 
                                                   ( 'ordinary', OrdinaryTransformator() ), 
                                                   ( 'nominal', NominalTransformator() ), 
                                                   ( 'cycle', CycleTransformator() ), 
                                                 ]
                            )

cat2_pipeline = Pipeline( 
                          steps = [ ( 'data', data_pipeline ),                          
                                    ( 'std_scaler', StandardScaler() ), 
                                    ( 'logreg', LogisticRegressionCV(random_state=579,max_iter=500) ) 
                                  ] 
                        )
cat2_pipeline.fit( X_train, y_train )
# Column 1 of predict_proba holds the probability of the positive class.
X_test['target'] = cat2_pipeline.predict_proba( X_test )[:,1]

(600000, 23)
(600000,)
Binary transform shape is (600000, 10)
Ordinary transform shape is (600000, 7)
Nominal transform shape is (600000, 41)
Cycle transform shape is (600000, 4)
Binary transform shape is (400000, 10)
Ordinary transform shape is (400000, 7)
Nominal transform shape is (400000, 41)
Cycle transform shape is (400000, 4)


In [53]:
# Write the predicted 'target' column (indexed like X_test) as the submission file.
# NOTE(review): float_format='%.1f' writes each probability with a single
# decimal place - confirm this loss of precision is intended.
X_test.to_csv('prediction_rf05.csv', columns=['target'], float_format='%.1f')