In [1]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin
import lightgbm as lgb
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import f1_score, make_scorer

In [2]:
train_data = pd.read_csv("../input/train.csv")
test_data = pd.read_csv("../input/test.csv")
sub_orig = pd.read_csv("../input/sample_submission.csv", index_col = 0)

In [3]:
target = train_data.pop('Target')

In [4]:
train_data.drop(['Id'], axis=1, inplace=True)
test_id = test_data.pop('Id')

In [5]:
train_data.shape

(9557, 141)

In [6]:
test_data.shape

(23856, 141)

In [7]:
class MissingValuesImputer(BaseEstimator, TransformerMixin):
    
    def __init__(self, impute_zero_columns):
        self.impute_zero_columns = impute_zero_columns
        
    def fit(self, X, y = None):
        print("Mean Values Imputer")
        return self
    
    def transform(self, X, y = None):
        
        # Fill missing values for v18q1, v2a1 and rez_esc
        for column in self.impute_zero_columns:
            X[column] = X[column].fillna(0)

        # For meaneduc we use the average schooling of household adults
        self.X_with_meaneduc_na = X[pd.isnull(X['meaneduc'])]
        self.mean_escolari_dict = dict(self.X_with_meaneduc_na.groupby('idhogar')['escolari'].apply(np.mean))
        for row_index in self.X_with_meaneduc_na.index:
            row_idhogar = X.at[row_index, 'idhogar']
            X.at[row_index, 'meaneduc'] = self.mean_escolari_dict[row_idhogar]
            X.at[row_index, 'SQBmeaned'] = np.square(self.mean_escolari_dict[row_idhogar])
        return X

In [8]:
class RemoveObjectTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        self.target = ['dependency']
        self.source = ['SQBdependency']
        
    def fit(self, X, y = None):
        print("Remove Object Imputer")
        return self
    
    def transform(self, X, y = None):
        for i in range(0, len(self.target)):
            X[self.target[i]] = np.sqrt(X[self.source[i]])
            X.drop(self.source, axis=1, inplace=True)
        return X

In [9]:
def calculate_edu(row):
    if (row['edjefe'] == 'yes' and row['edjefa'] == 'no') or (row['edjefe'] == 'no' and row['edjefa'] == 'yes'):
        return 1
    if row['edjefe'] == 'no' and row['edjefa'] == 'no':
        return 0
    if row['edjefe'] == 'yes' or row['edjefe'] == 'no':
        return pd.to_numeric(row['edjefa'])
    return pd.to_numeric(row['edjefe'])


class CategoricalVariableTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        pass
        
    def fit(self, X, y = None):
        print("Categorical Variables Transformer")
        return self
    
    def transform(self, X, y = None):
        X['house_holder_edu'] = X.apply(calculate_edu, axis=1).values
        X.drop(['edjefe', 'edjefa'], axis=1, inplace=True)
        return X

In [10]:
class UnnecessaryColumnsRemoverTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self, axis = 1):
        print("Unnecessary Columns Remover Transformer")
        self.axis = axis
        self.unnecessary_columns = [
            'r4t3', 'tamhog', 'tamviv', 'hogar_total', 'v18q', 'v14a', 'agesq',
            'mobilephone', 'energcocinar1', 'sanitario6',
            'estadocivil7', 'lugar1', 'area1', 'female'
        ]
        
    def fit(self, X, y = None):
        unnecessary_columns_to_extend = [
            [col for col in X.columns.tolist() if 'SQB' in col],
            [col for col in X.columns.tolist() if 'epared' in col],
            [col for col in X.columns.tolist() if 'etecho' in col],
            [col for col in X.columns.tolist() if 'eviv' in col],
            [col for col in X.columns.tolist() if 'instlevel' in col],
            [col for col in X.columns.tolist() if 'pared' in col],
            [col for col in X.columns.tolist() if 'piso' in col],
            [col for col in X.columns.tolist() if 'techo' in col],
            [col for col in X.columns.tolist() if 'abastagua' in col],
            [col for col in X.columns.tolist() if 'elimbasu' in col],
            [col for col in X.columns.tolist() if 'tipoviv' in col]
        ]
        
        for col_list in unnecessary_columns_to_extend:
            self.unnecessary_columns.extend(col_list)
        return self
    
    def transform(self, X, y = None):
        X = X.drop(self.unnecessary_columns, axis = self.axis)
        return X

In [11]:
class FeatureEngineeringTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self, axis = 1):
        self.axis = axis
        
        # individual level boolean features
        self.individual_boolean_features = [
            'dis', 'male', 'estadocivil1', 'estadocivil2',
            'estadocivil3', 'estadocivil4', 'estadocivil5', 'estadocivil6', 
            'parentesco1', 'parentesco2',  'parentesco3', 'parentesco4', 
            'parentesco5', 'parentesco6', 'parentesco7', 'parentesco8',  
            'parentesco9', 'parentesco10', 'parentesco11'
        ]

        # individual level ordered features
        self.individual_ordered_features = ['escolari', 'age']
        
    def fit(self, X, y = None):
        print("Feature Engineering Transformer")
        self.more_columns_to_drop = [
            [col for col in X.columns.tolist() if 'parentesco' in col and 'parentesco1' not in col],
            ['idhogar']
        ]
        
        f = lambda x: x.std(ddof = 0)
        f.__name__ = 'std_0'
        self.aggregate_features = (['mean', 'max', 'min', 'sum', f])
        return self
    
    def transform(self, X, y = None):
        # Rooms
        X['rent_per_room'] = X['v2a1' ] / X['rooms']
        X['adults_per_room'] = X['hogar_adul'] / X['rooms']
        X['males_per_room'] = X['r4h3'] / X['rooms']
        X['females_per_room'] = X['r4m3'] / X['rooms']
        X['children_per_room'] = X['hogar_nin'] / X['rooms']
        X['humans_per_room'] = X['hhsize'] / X['rooms']
        X['beds_per_room'] = X['bedrooms'] / X['rooms']
        
        # Bedroom
        X['adults_per_bedroom'] = X['hogar_adul'] / X['bedrooms']
        X['males_per_bedroom'] = X['r4h3'] / X['bedrooms']
        X['females_per_bedroom'] = X['r4m3'] / X['bedrooms']
        X['children_per_bedroom'] = X['hogar_nin'] / X['bedrooms']
        X['humans_per_bedroom'] = X['hhsize'] / X['bedrooms']
        
        X['persons12less_fraction'] = (X['r4h1'] + X['r4m1']) / X['hhsize']
        X['males12plus_fraction'] = X['r4h2'] / X['hhsize']
        X['total_males_fraction'] = X['r4h3'] / X['hhsize']
        X['females12plus_fraction'] = X['r4m2'] / X['hhsize']
        X['all_females_fraction'] = X['r4m3'] / X['hhsize']
        X['rent_per_person'] = X['v2a1'] / X['hhsize']
        X['mobiles_per_person'] = X['qmobilephone'] / X['hhsize']
        X['tablets_per_person'] = X['v18q1'] / X['hhsize']
        X['mobiles_per_male'] = X['qmobilephone'] / X['r4h3']
        X['tablets_per_male'] = X['v18q1'] / X['r4h3']
        
        # Create individual-level features
        grouped_df = X.groupby('idhogar')[self.individual_boolean_features + self.individual_ordered_features]
        grouped_df = grouped_df.agg(self.aggregate_features)
        X = X.join(grouped_df, on = 'idhogar')
        
        # Finally remove the other parentesco columns since we are only going to use only heads of
        # households for our scoring
        for col in self.more_columns_to_drop:
            X = X.drop(col, axis = self.axis) 
        
        return X

In [12]:
class LGBClassifierCV(BaseEstimator, RegressorMixin):
    
    def __init__(self, axis = 0, lgb_params = None, fit_params = None, cv = 3, perform_random_search = False, use_train_test_split = False, use_kfold_split = True):
        self.axis = axis
        self.lgb_params = lgb_params
        self.fit_params = fit_params
        self.cv = cv
        self.perform_random_search = perform_random_search
        self.use_train_test_split = use_train_test_split
        self.use_kfold_split = use_kfold_split
    
    @property
    def feature_importances_(self):
        feature_importances = []
        for estimator in self.estimators_:
            feature_importances.append(
                estimator.feature_importances_
            )
        return np.mean(feature_importances, axis = 0)
    
    @property
    def evals_result_(self):
        evals_result = []
        for estimator in self.estimators_:
            evals_result.append(
                estimator.evals_result_
            )
        return np.array(evals_result)
    
    @property
    def best_scores_(self):
        best_scores = []
        for estimator in self.estimators_:
            best_scores.append(
                estimator.best_score_['validation']['macroF1']
            )
        return np.array(best_scores)
    
    @property
    def cv_scores_(self):
        return self.best_scores_ 
    
    @property
    def cv_score_(self):
        return np.mean(self.best_scores_)
    
    @property
    def best_iterations_(self):
        best_iterations = []
        for estimator in self.estimators_:
            best_iterations.append(
                estimator.best_iteration_
            )
        return np.array(best_iterations)
    
    @property
    def best_iteration_(self):
        return np.round(np.mean(self.best_iterations_))

    def find_best_params_(self, X, y):
        
        # Define a search space for the parameters
        lgb_search_params = {
                  'num_leaves': sp_randint(20, 100), 
                  'min_child_samples': sp_randint(40, 100), 
                  'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
                  'subsample': sp_uniform(loc = 0.75, scale = 0.25), 
                  'colsample_bytree': sp_uniform(loc = 0.8, scale = 0.15),
                  'reg_alpha': [0, 1e-3, 1e-1, 1, 10, 50, 100],
                  'reg_lambda': [0, 1e-3, 1e-1, 1, 10, 50, 100]
            }

        x_train, x_val, y_train, y_val = train_test_split(X, y, test_size = 0.10, random_state = 42, stratify = y)
        F1_scorer = make_scorer(f1_score, greater_is_better = True, average = 'macro')

        lgb_model = lgb.LGBMClassifier(**self.lgb_params)
        self.fit_params["eval_set"] = [(x_train, y_train), (x_val, y_val)]
        self.fit_params["verbose"] = 200

        rs = RandomizedSearchCV(estimator = lgb_model, 
                                param_distributions = lgb_search_params, 
                                n_iter = 100,
                                scoring = F1_scorer,
                                cv = 5,
                                refit = True,
                                random_state = 314,
                                verbose = False,
                                fit_params = self.fit_params)
        
        # Fit the random search
        _ = rs.fit(x_train, y_train)
        
        print("Optimal LGB parameters:")
        print(rs.best_params_)
        with open("lgb_best_params.pickle", "wb") as lgb_best_params:
            pickle.dump(rs.best_params_, lgb_best_params)
        
        return rs.best_params_
    
    def fit(self, X, y, **fit_params):
        print("LGBClassifierCV")
        
        # Use only heads of households for scoring
        X.insert(0, 'Target', y)
        X = X.query('parentesco1 == 1')
        y = X['Target'] - 1
        X = X.drop(['Target', 'parentesco1'], 1)
        print("Number of columns in train - " + str(X.shape[1]))
        
        self.estimators_ = []
        
        # Use the best parameters to fit a model to whole data
        if self.perform_random_search:
            self.lgb_optimal_params = self.find_best_params_(X, y)
            
        # Use a simple train-test split. I have found that this gives a better local CV score than
        # K folds.
        if self.use_train_test_split:
            x_train, x_val, y_train, y_val = train_test_split(X, y, test_size = 0.1, random_state = 0)
            
            lgb_model = lgb.LGBMClassifier(**self.lgb_params)
            if self.perform_random_search:
                lgb_model.set_params(**self.lgb_optimal_params)
            
            lgb_model.fit(
                    x_train, y_train,
                    eval_set = [(x_train, y_train), (x_val, y_val)],
                    **self.fit_params
            )
            print("Train F1 - " + str(lgb_model.best_score_['train']['macroF1']) + "   " + "Validation F1 - " + str(lgb_model.best_score_['validation']['macroF1']))
            self.estimators_.append(lgb_model)
            
        # When not using random search to tune parameters, proceed with a simple Stratified Kfold CV
        if self.use_kfold_split:
            kf = StratifiedKFold(n_splits = self.cv, shuffle = True)
            for fold_index, (train, valid) in enumerate(kf.split(X, y)):
                print("Train Fold Index - " + str(fold_index))

                lgb_model = lgb.LGBMClassifier(**self.lgb_params)
                if self.perform_random_search:
                    lgb_model.set_params(**self.lgb_optimal_params)

                lgb_model.fit(
                        X.iloc[train], y.iloc[train],
                        eval_set = [(X.iloc[train], y.iloc[train]), (X.iloc[valid], y.iloc[valid])],
                        **self.fit_params
                )
                print("Train F1 - " + str(lgb_model.best_score_['train']['macroF1']) + "   " + "Validation F1 - " + str(lgb_model.best_score_['validation']['macroF1']))

                self.estimators_.append(lgb_model)
        return self
    
    def predict(self, X):
        # Remove this column since we are using only heads of households for scoring
        X = X.drop('parentesco1', 1)
        
        # When not using random search, use voting to get predictions from all CV estimators.
        y_pred = []
        for estimator_index, estimator in enumerate(self.estimators_):
            print("Estimator Index - " + str(estimator_index))
            y_pred.append(estimator.predict(X))
        return np.mean(y_pred, axis = self.axis).astype(int)

In [13]:
def get_lgb_params():
    
    def evaluate_macroF1_lgb(truth, predictions):  
        pred_labels = predictions.reshape(len(np.unique(truth)), -1).argmax(axis = 0)
        f1 = f1_score(truth, pred_labels, average = 'macro')
        return ('macroF1', f1, True)

    def learning_rate_power_0997(current_iter):
            base_learning_rate = 0.1
            min_learning_rate = 0.02
            lr = base_learning_rate  * np.power(.995, current_iter)
            return max(lr, min_learning_rate)

    lgb_params = {'boosting_type': 'dart',
                  'class_weight': 'balanced',
                  "objective": 'multiclassova',
                  'metric': None,
                  'silent': True,
                  'random_state': 0,
                  'n_jobs': -1}

    fit_params={"early_stopping_rounds": 400, 
                "eval_metric" : evaluate_macroF1_lgb, 
                'eval_names': ['train', 'validation'],
                'verbose': False,
                'categorical_feature': 'auto'}
    
    return lgb_params, fit_params

In [14]:
lgb_params, lgb_fit_params = get_lgb_params()

pipeline = Pipeline([
    ('na_imputer', MissingValuesImputer(impute_zero_columns = ['v18q1', 'v2a1', 'rez_esc'])),
    ('remove_imputer', RemoveObjectTransformer()),
    ('cat_transformer', CategoricalVariableTransformer()),
    ('unnecessary_columns_remover_transformer', UnnecessaryColumnsRemoverTransformer()),
    ('feature_engineering_transformer', FeatureEngineeringTransformer()),
    ('lgb', LGBClassifierCV(lgb_params = lgb_params,
                            fit_params = lgb_fit_params,
                            cv = 5,
                            perform_random_search = False,
                            use_train_test_split = True,
                            use_kfold_split = False)
    )
])


pipeline.fit(train_data.copy(), target.copy())
pred = pipeline.predict(test_data.copy())
print("Local CV Score - " + str(pipeline.named_steps['lgb'].cv_score_))
sub_orig['Target'] = pred + 1
sub_orig.to_csv('Pipeline_Base_LGB_'+ str(pipeline.named_steps['lgb'].cv_score_) + '.csv')
print(sub_orig.head())

Unnecessary Columns Remover Transformer
Mean Values Imputer
Remove Object Imputer
Categorical Variables Transformer
Feature Engineering Transformer




LGBClassifierCV
Number of columns in train - 185
Train F1 - 0.866419406663   Validation F1 - 0.45010643404




Estimator Index - 0
Local CV Score - 0.45010643404
              Target
Id                  
ID_2f6873615       4
ID_1c78846d2       4
ID_e5442cf6a       4
ID_a8db26a79       4
ID_a62966799       4
