### Importing Libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import operator
import numpy as np
import regex as re
import joblib
import os
import copy
from sklearn.metrics import  roc_auc_score

from tqdm import tqdm

### Declaring methods

In [2]:
def feature_engineering_perm(data, fe_imp):
    '''
    This method builds the engineered features named in fe_imp from the columns of the given dataset.
    
    Every name in fe_imp is decoded by removing digits and underscores; whatever is left is the operation:
        '12'     -> no operation, column '12' is taken as it is
        '12+32'  -> arithmetic operation (+, -, *, /) between column '12' and column '32'
        'cos_21' -> unary function (cos, sin, cosh, sinh, exp) applied to column '21'
    Because of this decoding, the column names of data must consist of digits only.
    
    After a feature is computed, inf and -inf are turned into NaN and every NaN is replaced by the mean
    of that feature computed over the rows of data. The input dataset is not modified.
    
    ----------------------
    Parameter
    data    : The input dataset (pandas DataFrame whose column names are digit strings).
    
    fe_imp  : An iterable containing the names of the features / feature combinations to build.
              
    ----------------------                       
    Returns
    data_fe : A pandas DataFrame containing only the requested features, in the order given by fe_imp.
    
    ----------------------
    Raises
    KeyError : If a name contains an operation that is not supported, or refers to a column missing in data.
    '''
    
    #defining a dict of operation name and the operations
    op_dict = {'+': operator.add, '-': operator.sub, '*': operator.mul, '/': operator.truediv,
              'cos': np.cos, 'sin': np.sin, 'cosh': np.cosh, 'sinh': np.sinh, 'exp': np.exp}
    
    #the columns are collected in a dict and the DataFrame is built once at the end; inserting columns one
    #by one into an existing DataFrame fragments it when there are many features.
    engineered = {}
    
    for name in fe_imp:
        oper = ''.join(re.findall('[^0-9_]', name)) #extracting the arith or trig operation from the name of col.Eg:12+32 or cos_21
        if(not oper):
            col = data[name] #if there is no operation involved then the same feature is taken as it is.
        else:
            if(oper not in op_dict):
                raise KeyError(f"Unsupported operation {oper!r} in feature name {name!r}")
            op = op_dict[oper]
            if('_' in name): #checking if the operation is trig
                cols = name.split(oper+'_') #splitting based on the '_' Eg: cos_12 -> '', 12
                col = op(data[cols[1]]) #applying the trig operation
            else: 
                cols = name.split(oper) #splitting based on the operation Eg: 12+13 -> splitting on '+' -> 12,13
                col = op(data[cols[0]], data[cols[1]]) #applying the arith operation
                
        if(col.isin([np.inf, -np.inf, np.nan]).any()):
            #The cleaned Series is assigned back explicitly. Calling replace(..., inplace=True) on a column
            #selected from a DataFrame only changes a temporary object under pandas Copy-on-Write, which
            #would leave the inf/NaN values in the result.
            col = col.replace([np.inf, -np.inf], np.nan) #inf -> nan first (coz mean of r.v with inf is nan)
            col = col.fillna(col.mean()) #replaces nan with mean val of the remaining finite values
        
        engineered[name] = col

    #the DataFrame constructor copies dict data by default, so the result does not share memory with data
    data_fe = pd.DataFrame(engineered)
    return data_fe

### Importing Data

In [3]:
# Labelled training set: separate the target column from the predictors.
train = pd.read_csv('dont-overfit-ii/train.csv')
y_train = train['target']
x_train = train.drop(columns=['id', 'target'])

# Feature ranking table; the CSV's unnamed first column is used as the index.
feature_imp = pd.read_csv(
    'dont-overfit-ii/processed_data/feature_importance.csv',
    index_col='Unnamed: 0',
)

# Unlabelled test set with the identifier column removed.
test = pd.read_csv('dont-overfit-ii/test.csv').drop(columns=['id'])

# Fitted classifier used for scoring.
# Alternative model file (disabled): saved_models/EngFe100_Cor_LogisticRegression_1.sav
clf = joblib.load('BestModel.sav')

# Report the shapes of what was loaded.
print('Train Data:', x_train.shape, ' ', y_train.shape, sep='')
print('Test Data(Without Target Variable):', test.shape, sep='')

Train Data:(250, 300) (250,)
Test Data(Without Target Variable):(19750, 300)


### Final Function 1

In [4]:
def final_fun_1(X, n_features=100, model_path='BestModel.sav'):
    '''
    This method will do data preprocessing and predict prob of the class of the datapoints using the best model.
    
    The features are the n_features entries of feature_imp with the highest 'abs_correlation', built from X
    with feature_engineering_perm and passed to the model in that order.
    
    Note: feature_engineering_perm fills inf/NaN values of a feature with the mean of that feature over
    the rows of X, so in that case the prediction for a row can depend on the other rows passed with it.
    
    ----------------------
    Parameter
     
    X          : Input data (pandas DataFrame with the raw feature columns).
    n_features : Number of top ranked features to use. Must match what the model was trained with
                 (default 100).
    model_path : Path of the saved model. The file is unpickled by joblib, so only load files from a
                 trusted source (default 'BestModel.sav').
    
    ----------------------              
    Returns
    pred : Predicted probabilities (second column of predict_proba) of the given input.
    
    '''
    #Feature engineering
    top_features = feature_imp['abs_correlation'].sort_values(ascending = False)[:n_features].index
    X_fe = feature_engineering_perm(X, top_features)
    
    #predicting
    clf = joblib.load(model_path)
    pred = clf.predict_proba(X_fe)[:,1]
    
    return pred

# Smoke test on a single datapoint: the row labelled 0 of x_train is turned back into a one-row DataFrame.
final_fun_1(pd.DataFrame(x_train.loc[0]).T)

array([0.94877582])

### Final Function 2

In [5]:
def final_fun_2(X,Y):
    '''
    This method will do data preprocessing, predict prob of the class of the datapoints using the best model and finds the AUC ROC score.
    
    The preprocessing and prediction are delegated to final_fun_1, so both methods always use the same
    features and the same model.
    
    ----------------------
    Parameter
    
    X : Input data.
    Y : Target class of the input data.
    
    ----------------------              
    Returns
    score : AUCROC Score of the best model.
    
    '''
    #feature engineering + predicting (same pipeline as final_fun_1)
    Y_pred = final_fun_1(X)
    
    #performance metric of predictions
    score = roc_auc_score(Y, Y_pred)
    
    return score

# AUC on the training data itself. NOTE(review): if BestModel.sav was fitted on these rows (not shown here), this score is optimistic.
final_fun_2(x_train, y_train)

0.9845833333333333