### Importing Libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import operator
import numpy as np
import regex as re
import joblib
import os
import copy

from tqdm import tqdm

### Declaring methods

In [2]:
def feature_engineering_perm(data, fe_imp):
    '''
    This method builds the features named in the index of `fe_imp` from the columns of `data`.
    Each index label is interpreted as one of:
      - a plain column name made of digits (Eg: '12')    -> the column is copied as it is,
      - a binary arithmetic expression (Eg: '12+32')     -> +, -, * or / applied to two columns,
      - a unary function of one column (Eg: 'cos_21')    -> cos, sin, cosh, sinh or exp of that column.

    Any inf, -inf or nan produced in a feature is replaced with the mean of the remaining values
    of that feature.

    ----------------------
    Parameter
    data    : The input dataset. Its column labels must be the strings used inside the feature names.

    fe_imp  : A pandas series whose index holds the names of the features to build
              (only the index is read, the values are ignored).

    ----------------------
    Returns
    data_fe : A pandas DataFrame containing only the requested features, in the order of `fe_imp.index`.

    ----------------------
    Raises
    KeyError : If a feature name contains an operation that is not supported, or refers to a column
               that is missing from `data`.
    '''

    #defining a dict of operation name and the operations
    op_dict = {'+': operator.add, '-': operator.sub, '*': operator.mul, '/': operator.truediv,
               'cos': np.cos, 'sin': np.sin, 'cosh': np.cosh, 'sinh': np.sinh, 'exp': np.exp}

    #features are collected here and the DataFrame is built once at the end; inserting hundreds of
    #columns one at a time into a DataFrame fragments it.
    columns = {}

    for name in fe_imp.index:
        #whatever is left after dropping digits and '_' is the operation. Eg: '12+32' -> '+', 'cos_21' -> 'cos'
        oper = ''.join(re.findall(r'[^0-9_]', name))
        if not oper:
            col = data[name].copy() #no operation involved, the feature is taken as it is
        else:
            op = op_dict[oper]
            if '_' in name: #unary (trig/exp) operation. Eg: 'cos_12' -> ['', '12']
                cols = name.split(oper + '_')
                col = op(data[cols[1]])
            else: #binary arithmetic operation. Eg: '12+13' -> ['12', '13']
                cols = name.split(oper)
                col = op(data[cols[0]], data[cols[1]])

        if col.isin([np.inf, -np.inf, np.nan]).any():
            #The cleaning is done on the local Series and re-assigned. Calling
            #`frame[col].replace(..., inplace=True)` works on a temporary object and leaves the
            #frame untouched when pandas Copy-on-Write is active.
            col = col.replace([np.inf, -np.inf], np.nan) #inf -> nan first, since the mean of a column with inf is not finite
            col = col.fillna(col.mean()) #nan -> mean of the remaining values

        columns[name] = col

    return pd.DataFrame(columns)

In [3]:
def save_test_pred(model_name, feature_imp, test):
    '''
    This method loads one saved model, predicts the class-1 probability of every row of the test
    dataset and saves the result as 'submission/csv/<model name without extension>.csv'.

    The model file name is expected to look like '<model type>_<feature selection>_...':
      - a model type containing 'EngFe' means the model uses engineered features,
      - a model type containing 'OrgFe' followed by a number means the model uses that many of the
        original features,
      - digits inside the model type give the number of features used (default is 300).

    ----------------------
    Parameter

    model_name  : The file name (inside 'saved_models/') of the saved model to run the test dataset with.

    feature_imp : Pandas DataFrame indexed by feature name, with the importance columns
                  'chi2_score', 'abs_correlation' and 'harmonic_mean'.

    test        : Test dataset.


    ----------------------
    Returns
    None
    '''

    print('Model: ', model_name)
    path = 'saved_models/'
    output_path = 'submission/csv/'
    #makedirs instead of mkdir: the parent 'submission/' folder may not exist yet either
    os.makedirs(output_path, exist_ok=True)

    #maps the feature selection tag in the file name to the importance column of feature_imp
    feat_select = {
        'Chi': 'chi2_score',
        'Cor': 'abs_correlation',
        'Hmean': 'harmonic_mean'
    }

    #NOTE: joblib.load unpickles the file, so only model files from a trusted source must be loaded.
    clf = joblib.load(path + model_name)

    output_file = model_name.split('.')[0] + '.csv'
    model_type, fi_method = output_file.split('_')[:2]

    #checking how many features are used in training the model from the model name(default is 300)
    digits = re.sub(r'[^0-9]', '', model_type)
    top = int(digits) if digits else 300

    if 'EngFe' in model_type: #the model used engineered features, so they are engineered for the test data too
        method = feat_select[fi_method]
        top_features = feature_imp[method].sort_values(ascending=False)[:top]
        test_fe = feature_engineering_perm(test, top_features)

    elif 'OrgFe' in model_type and top != 300:
        #the model used a reduced set of original features (same handling as in saveAll_test_pred);
        #original features are the ones whose name holds no operation
        method = feat_select[fi_method]
        original_col = [i for i in feature_imp.index if not ''.join(re.findall(r'[^0-9_]', i))]
        top_features = feature_imp.loc[original_col][method].sort_values(ascending=False)[:top]
        test_fe = test.loc[:, top_features.index].copy() #getting only the imp columns

    else:
        test_fe = test

    pred = clf.predict_proba(test_fe)[:, 1] #predicting the probability of data point belonging to class 1.

    #ids are generated starting from 250 - presumably the first id of the test dataset, TODO confirm
    sub = pd.DataFrame({'id': range(250, len(pred) + 250), 'target': pred})
    sub.to_csv(output_path + output_file, index=False) #saving the result

In [4]:
def saveAll_test_pred(feature_imp, test, cat = ''):
    '''
    This method runs the test dataset through every saved model in 'saved_models/' whose file name
    contains `cat` and saves the predicted class-1 probabilities of each model as
    'submission/<model name without extension>.csv'.

    The model file name is expected to look like '<model type>_<feature selection>_...':
      - a model type containing 'EngFe' means the model uses engineered features,
      - a model type containing 'OrgFe' followed by a number means the model uses that many of the
        original features,
      - digits inside the model type give the number of features used (default is 300).

    ----------------------
    Parameter

    feature_imp : Pandas DataFrame indexed by feature name, with the importance columns
                  'chi2_score', 'abs_correlation' and 'harmonic_mean'.

    test        : Test dataset.

    cat         : The Category of the saved model. Only file names containing this text are used,
                  the default '' selects every file.


    ----------------------
    Returns
    None
    '''

    path = 'saved_models/'
    csv_path = 'submission/'
    os.makedirs(csv_path, exist_ok=True) #to_csv does not create a missing folder

    #maps the feature selection tag in the file name to the importance column of feature_imp
    feat_select = {
        'Chi': 'chi2_score',
        'Cor': 'abs_correlation',
        'Hmean': 'harmonic_mean'
    }

    models = sorted(os.listdir(path), key=lambda x: os.path.getmtime(os.path.join(path, x))) #files sorted by time
    models = [i for i in models if cat in i] #taking only the file that belongs to the given category

    for model_name in tqdm(models):
        #NOTE: joblib.load unpickles the file, so only model files from a trusted source must be loaded.
        clf = joblib.load(path + model_name)

        output_file = model_name.split('.')[0] + '.csv'
        model_type, fi_method = output_file.split('_')[:2]

        #checking how many features are used in training the model from the model name(default is 300)
        digits = re.sub(r'[^0-9]', '', model_type)
        top = int(digits) if digits else 300

        if 'EngFe' in model_type: #the model used engineered features, so they are engineered for the test data too
            method = feat_select[fi_method]
            top_features = feature_imp[method].sort_values(ascending=False)[:top]
            test_fe = feature_engineering_perm(test, top_features)

        elif 'OrgFe' in model_type and top != 300:
            #the model used a reduced set of original features;
            #original features are the ones whose name holds no operation
            method = feat_select[fi_method]
            original_col = [i for i in feature_imp.index if not ''.join(re.findall(r'[^0-9_]', i))]
            top_features = feature_imp.loc[original_col][method].sort_values(ascending=False)[:top]
            test_fe = test.loc[:, top_features.index].copy() #getting only the imp columns

        else:
            test_fe = test.copy()

        pred = clf.predict_proba(test_fe)[:, 1] #predicting the probability of data point belonging to class 1.

        #ids are generated starting from 250 - presumably the first id of the test dataset, TODO confirm
        sub = pd.DataFrame({'id': range(250, len(pred) + 250), 'target': pred})
        sub.to_csv(csv_path + output_file, index=False) #saving the result

In [5]:
def submit_csv_kaggle(cat='', top=1, msg='', submit=False):
    '''
    This method submits the prediction csv files stored in 'submission/' to the kaggle competition
    'dont-overfit-ii' through the kaggle command line tool. With submit = False nothing is sent and
    the selected file names are only printed.

    A file is selected when its name contains `cat` and the part of its name after the last '_'
    starts with a rank between 1 and `top`. Eg: 'EngFe100_Cor_LogisticRegression_1.csv' has rank 1.
    Entries without such a rank are skipped.

    ----------------------
    Parameter

    cat   : The Category of the saved model. Only file names containing this text are used.

    top   : Rank upto which we have to submit the test data predictions.

    msg   : Custom message when submitting the test data. The file name without its extension is
            always appended to it.

    submit: A flag to enable or disable submit functionality.


    ----------------------
    Returns
    count: The no of files submitted to kaggle (with submit = False, the no of files selected).
    '''
    import subprocess #imported here so that the method does not depend on a change of the import cell

    dir_name = 'submission/'
    csv_files = sorted(os.listdir(dir_name), key=lambda x: os.path.getmtime(os.path.join(dir_name, x)))

    #filtering the files with the given rank and the category
    selected = []
    for file_name in csv_files:
        if cat not in file_name:
            continue
        #the whole leading number is read (so rank 10 is not mistaken for rank 1); entries without
        #a rank, like a sub folder, are skipped instead of raising
        rank = re.match(r'[0-9]+', file_name.split('_')[-1])
        if rank and 1 <= int(rank.group()) <= top:
            selected.append(file_name)

    count = 0
    for csv_file in tqdm(selected):
        stem = csv_file.split('.')[0]
        cmd_msg = msg + ' ' + stem if msg else stem
        cmd = f'kaggle competitions submit -c dont-overfit-ii -f {dir_name+csv_file} -m "{cmd_msg}"' #only used for reporting
        if submit:
            #the arguments are passed as a list, so quotes or spaces in the message or in the file
            #name can not break (or inject into) the command
            args = ['kaggle', 'competitions', 'submit', '-c', 'dont-overfit-ii',
                    '-f', dir_name + csv_file, '-m', cmd_msg]
            try:
                status_code = subprocess.run(args).returncode #runs the cmd and submits the csv to kaggle
            except OSError:
                status_code = 1 #the kaggle command could not be started at all
        else:
            status_code = 0
            print(csv_file)

        if status_code:
            print('Following cmd failed:\n', cmd)
        else:
            count += 1

    return count

### Importing Data

In [6]:
# Feature importance table: the unnamed first column of the csv becomes the index.
feature_imp = pd.read_csv(
    'dont-overfit-ii/processed_data/feature_importance.csv',
    index_col='Unnamed: 0',
)
# Test dataset without its 'id' column.
test = pd.read_csv('dont-overfit-ii/test.csv').drop(columns=['id'])

### Predicting the target value and submitting it to Kaggle

In [None]:
# Runs the test dataset through one saved model and saves the predictions in a csv file.
model_name = 'EngFe50_Hmean_1_LogisticRegression.sav'
save_test_pred(model_name=model_name, feature_imp=feature_imp, test=test)

In [48]:
# Runs the test dataset through every saved model of the given category, saving one csv file per model.
saveAll_test_pred(
    feature_imp=feature_imp,
    test=test,
    cat='OrgFerkf_MinMaxScaler',
)

100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:06<00:00,  1.30it/s]


In [11]:
# Selects the csv files of the given category for submission to kaggle;
# submit=False only prints the selected file names.
submit_csv_kaggle(
    cat='EngFe100_Cor_LogisticRegression_1',
    top=3,
    msg='',
    submit=False,
)

100%|██████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 1007.76it/s]

EngFe100_Cor_LogisticRegression_1.csv





1