## Elo Merchant Category Recommendation

### Import libraries

In [0]:
# Import the necessary libraries
import numpy as np
import pandas as pd
import os
import time
import warnings
import gc
gc.collect()
import pickle
import os
from six.moves import urllib
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, date
warnings.filterwarnings('ignore')
%matplotlib inline
plt.style.use('seaborn')
from scipy.stats import norm, skew
from sklearn.preprocessing import StandardScaler
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from collections import Counter

In [0]:
#Add All the Models Libraries

# Scalers
from sklearn.utils import shuffle
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

# Models

from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_log_error,mean_squared_error, r2_score,mean_absolute_error
import lightgbm as lgb
from sklearn.model_selection import train_test_split #training and testing data split
from sklearn import metrics #accuracy measure
from sklearn.metrics import confusion_matrix #for confusion matrix
from scipy.stats import reciprocal, uniform

from sklearn.model_selection import StratifiedKFold, RepeatedKFold

# Cross-validation
from sklearn.model_selection import KFold #for K-fold cross validation
from sklearn.model_selection import cross_val_score #score evaluation
from sklearn.model_selection import cross_val_predict #prediction
from sklearn.model_selection import cross_validate

# GridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

#Common data processors
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array
from scipy import sparse
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

### Loading data from kaggle

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
#Using Kaggle datasets in Colab
# Run this cell and select the kaggle.json file downloaded
# from the Kaggle account settings page.
from google.colab import files
files.upload()
# Let's make sure the kaggle.json file is present.
!ls -lha kaggle.json
# Next, install the Kaggle API client.
!pip install -q kaggle
# The Kaggle API client expects this file to be in ~/.kaggle,
# so move it there.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

# This permissions change avoids a warning on Kaggle tool startup.
!chmod 600 ~/.kaggle/kaggle.json

In [5]:
# Copy the stackoverflow data set locally.
!kaggle competitions download -c elo-merchant-category-recommendation

Downloading sample_submission.csv.zip to /content
  0% 0.00/846k [00:00<?, ?B/s]
100% 846k/846k [00:00<00:00, 58.0MB/s]
Downloading new_merchant_transactions.csv.zip to /content
 83% 41.0M/49.4M [00:00<00:00, 82.0MB/s]
100% 49.4M/49.4M [00:00<00:00, 111MB/s] 
Downloading train.csv.zip to /content
  0% 0.00/3.02M [00:00<?, ?B/s]
100% 3.02M/3.02M [00:00<00:00, 100MB/s]
Downloading Data%20Dictionary.xlsx to /content
  0% 0.00/17.2k [00:00<?, ?B/s]
100% 17.2k/17.2k [00:00<00:00, 17.8MB/s]
Downloading historical_transactions.csv.zip to /content
 98% 539M/548M [00:04<00:00, 115MB/s]
100% 548M/548M [00:04<00:00, 127MB/s]
Downloading test.csv.zip to /content
  0% 0.00/1.13M [00:00<?, ?B/s]
100% 1.13M/1.13M [00:00<00:00, 165MB/s]
Downloading Data_Dictionary.xlsx to /content
  0% 0.00/17.2k [00:00<?, ?B/s]
100% 17.2k/17.2k [00:00<00:00, 17.9MB/s]
Downloading merchants.csv.zip to /content
 39% 5.00M/12.7M [00:00<00:00, 40.6MB/s]
100% 12.7M/12.7M [00:00<00:00, 61.8MB/s]


In [0]:
#Extract zip files
import zipfile
for file in ['/content/sample_submission.csv.zip','/content/historical_transactions.csv.zip','/content/train.csv.zip','/content/test.csv.zip', '/content/new_merchant_transactions.csv.zip', '/content/merchants.csv.zip' ]:
    zip_ref = zipfile.ZipFile(file, 'r')
    zip_ref.extractall()
    zip_ref.close()

## Feature engineering

In [0]:
#Reduce the memory usage - Inspired by Panchajanya Banerjee
#determine and apply the smallest data type that can fit the range of values
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    '''
    Reduce the memory usage by applying the smallest data type that can fit the range of values
    '''
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [0]:
def aggregate_transaction_hist(trans, prefix):  
    '''
    Create aggrgate features from all hitorical transaction features
    '''
    agg_func = {
          'purchase_amount' : ['sum','max','min','mean','var','skew'],
          'installments' : ['sum','max','mean','var','skew'],
          'purchase_date' : ['max','min'],
          'month_lag' : ['max','min','mean','var','skew'],
          'month_diff' : ['max','min','mean','var','skew'],
          'weekend' : ['sum', 'mean'],
          'weekday' : ['sum', 'mean'],
          'authorized_flag': ['sum', 'mean'],
          'category_1': ['sum','mean', 'max','min'],
          'card_id' : ['size','count'],
          'month': ['nunique', 'mean', 'min', 'max'],
          'hour': ['nunique', 'mean', 'min', 'max'],
          'weekofyear': ['nunique', 'mean', 'min', 'max'],
          'day': ['nunique', 'mean', 'min', 'max'],
          'subsector_id': ['nunique'],
          'merchant_id': ['nunique'],
          'merchant_category_id' : ['nunique'],
          'price' :['sum','mean','max','min','var'],
          'duration' : ['mean','min','max','var','skew'],
          'amount_month_ratio':['mean','min','max','var','skew']
        
      }
    
    agg_trans = trans.groupby(['card_id']).agg(agg_func)
    agg_trans.columns = [prefix + '_'.join(col).strip() 
                           for col in agg_trans.columns.values]
    agg_trans.reset_index(inplace=True)
    
    df = (trans.groupby('card_id')
            .size()
            .reset_index(name='{}transactions_count'.format(prefix)))
    
    agg_trans = pd.merge(df, agg_trans, on='card_id', how='left')
    
    return agg_trans

In [0]:
def aggregate_transaction_new(trans, prefix):  
    '''
    Create aggrgate features from all new_transactions features
    '''        
    agg_func = {
          'purchase_amount' : ['sum','max','min','mean','var','skew'],
          'installments' : ['sum','max','mean','var','skew'],
          'purchase_date' : ['max','min'],
          'month_lag' : ['max','min','mean','var','skew'],
          'month_diff' : ['max','min','mean','var','skew'],
          'weekend' : ['sum', 'mean'],
          'weekday' : ['sum', 'mean'],
          'authorized_flag': ['sum', 'mean'],
          'category_1': ['sum','mean', 'max','min'],
          'card_id' : ['size','count'],
          'month': ['nunique', 'mean', 'min', 'max'],
          'hour': ['nunique', 'mean', 'min', 'max'],
          'weekofyear': ['nunique', 'mean', 'min', 'max'],
          'day': ['nunique', 'mean', 'min', 'max'],
          'subsector_id': ['nunique'],
          'merchant_category_id' : ['nunique'],
          'price' :['sum','mean','max','min','var'],
          'duration' : ['mean','min','max','var','skew'],
          'amount_month_ratio':['mean','min','max','var','skew']
    }
    
    agg_trans = trans.groupby(['card_id']).agg(agg_func)
    agg_trans.columns = [prefix + '_'.join(col).strip() 
                           for col in agg_trans.columns.values]
    agg_trans.reset_index(inplace=True)
    
    df = (trans.groupby('card_id')
            .size()
            .reset_index(name='{}transactions_count'.format(prefix)))
    
    agg_trans = pd.merge(df, agg_trans, on='card_id', how='left')
    
    return agg_trans

In [0]:
def merchants_agg(merchants):
    '''
    Aggrgating with Mean and mode for various features
    '''
    mode2 = lambda x: Counter(x).most_common(1)[0][0]
    merchants = merchants.groupby("merchant_id",as_index=False).agg({
        "merchant_group_id": mode2,
        "merchant_category_id": mode2,
        "subsector_id": mode2,
        "numerical_1": "mean",
        "numerical_2": "mean",
        "category_1": mode2,
        "most_recent_sales_range": mode2,
        "most_recent_purchases_range": mode2,
        "avg_sales_lag3": "mean",
        "avg_purchases_lag3": "mean",
        "active_months_lag3": mode2,
        "avg_sales_lag6": "mean",
        "avg_purchases_lag6": "mean",
        "active_months_lag6": mode2,
        "avg_sales_lag12": "mean",
        "avg_purchases_lag12": "mean",
        "active_months_lag12": mode2,
        "category_4": mode2,
        "city_id": mode2,
        "state_id": mode2,
        "category_2": mode2
    })

    return merchants

In [0]:
#Aggregating by card_id since there are duplicated card_id's:
#Numerical features: mean (without missings)
#Non-numerical features: mode
def new_transactions_agg(new_transactions):
    '''
    Aggrgating with Mean and mode for various features
    '''
    mode2 = lambda x: Counter(x).most_common(1)[0][0]
    new_transactions = new_transactions.groupby("card_id",as_index=False).agg({
        "authorized_flag": mode2,
        "merchant_category_id": mode2,
        "subsector_id": mode2,
        "merchant_id":  mode2,
        "category_1": mode2,
        'month_lag': mode2,
        "installments": mode2,
        "purchase_amount": "mean",
        "city_id": mode2,
        "state_id": mode2,
        "category_2": mode2,
        "purchase_date": mode2
    })
    return new_transactions

In [0]:
#Aggregating by card_id since there are duplicated card_id's:
def mean_encode(train, new_transactions, merchants, suffix):
    '''
    target Mean encoding of categorical features
    '''
    new_transactions = new_transactions_agg(new_transactions)
    new_transactions.columns = [col+suffix if col not in ["card_id", "merchant_id"] else col for col in new_transactions.columns]

    # Select only train card_id's
    train_new_trans = pd.merge(train['card_id'], new_transactions, on='card_id',how='left')
    del new_transactions

    # Similarly Aggregating by merchant_id since there are some duplicated merchants_id with different features values:
    merchants = merchants_agg(merchants)

    #Adding Merchants features to train data
    merchants.columns = [col+"_merchants"+suffix if col!="merchant_id" else col for col in merchants.columns]
    all_transactions_merchants = pd.merge(train_new_trans, merchants, on='merchant_id',how='left')
    del merchants
    del train_new_trans
    del all_transactions_merchants['merchant_id']

    card_ids_train = train["card_id"].unique()
    idxs = all_transactions_merchants["card_id"].isin(list(card_ids_train))
    all_transactions_merchants_filter = all_transactions_merchants.loc[idxs]
    all_transactions_merchants_filter_target = all_transactions_merchants_filter.merge(train[["card_id",\
                                           "target"]],how="left",left_on="card_id",right_on="card_id")
    del all_transactions_merchants_filter
    gc.collect()

    all_transactions_merchants.fillna(-9999,inplace=True)


    # Mean encoding
    features_mean_encoding = ['authorized_flag'+suffix, 'category_1'+suffix, 'month_lag'+suffix, 'installments'+suffix,'category_2'+suffix,
        'numerical_1_merchants'+suffix, 'category_1_merchants'+suffix, 'most_recent_sales_range_merchants'+suffix,
        'most_recent_purchases_range_merchants'+suffix, 'active_months_lag3_merchants'+suffix, 
        'active_months_lag6_merchants'+suffix, 'active_months_lag12_merchants'+suffix, 'category_4_merchants'+suffix,
        'category_2_merchants'+suffix]

    for col in features_mean_encoding:
        mean_encoding = all_transactions_merchants_filter_target.groupby(col).target.mean()
        all_transactions_merchants[col+"_mean_encoded"] = all_transactions_merchants[col].\
                                                                 map(mean_encoding)
    del mean_encoding
    gc.collect()   
    return all_transactions_merchants

#### Train function

In [0]:
def getfeatures_train(train, transactions = None, new_transactions = None, merchants = None):
  # create Outliers feature 
  '''
  create train features from historical, new and merchants trancactions
  '''
  train['outliers'] = 0
  train.loc[train['target'] < -30, 'outliers'] = 1
  train['outliers'].value_counts()
  # Mean encoding features with outlier feature
  for features in ['feature_1','feature_2','feature_3']:
      order_label = train.groupby([features])['outliers'].mean()
      train[features+"_mean"] = train[features].map(order_label)
      
  # Now extract the days and Qtr
  train['days'] = (date(2018, 2, 1) - train['first_active_month'].dt.date).dt.days
  train['quarter'] = train['first_active_month'].dt.quarter

  feature_cols = ['feature_1', 'feature_2', 'feature_3']
  for f in feature_cols:
      train['days_' + f] = train['days'] * train[f]
      train['days_' + f + '_ratio'] = train[f] / train['days']

  gc.collect()
  
  # Loading historical_transactions data
  transactions = reduce_mem_usage(pd.read_csv('../content/historical_transactions.csv'))
  transactions.replace([-np.inf, np.inf], np.nan, inplace=True)
  print('Preprocessing hist_trans...')
  # Feature extraction historical transaction
  #impute missing values.
  transactions['category_2'] = transactions['category_2'].fillna(-9999,inplace=True)
  transactions['category_3'] = transactions['category_3'].fillna('-9999',inplace=True)
  transactions['merchant_id'] = transactions['merchant_id'].fillna('-9999',inplace=True)
  transactions['installments'].replace(-1, np.nan,inplace=True)
  transactions['installments'].replace(999, np.nan,inplace=True)
  transactions['purchase_amount'] = transactions['purchase_amount'].apply(lambda x: min(x, 0.8))
  # Feature engineering
  transactions['authorized_flag'] = transactions['authorized_flag'].map({'Y': 1, 'N': 0})
  transactions['category_1'] = transactions['category_1'].map({'Y': 1, 'N': 0})
  transactions['category_3'] = transactions['category_3'].map({'A':0, 'B':1, 'C':2})
  transactions['purchase_date'] = pd.to_datetime(transactions['purchase_date'])
  transactions['weekofyear'] = transactions['purchase_date'].dt.weekofyear
  transactions['month'] = transactions['purchase_date'].dt.month
  transactions['day'] = transactions['purchase_date'].dt.day
  transactions['weekday'] = transactions.purchase_date.dt.weekday
  transactions['weekend'] = (transactions.purchase_date.dt.weekday >=5).astype(int)
  transactions['hour'] = transactions['purchase_date'].dt.hour 
  transactions['month_diff'] = ((datetime.today() - transactions['purchase_date']).dt.days)//30
  transactions['month_diff'] += transactions['month_lag']
  transactions['duration'] = transactions['purchase_amount']*transactions['month_diff']
  transactions['amount_month_ratio'] = transactions['purchase_amount']/transactions['month_diff']
  transactions['price'] = transactions['purchase_amount'] / transactions['installments']

  gc.collect()

  agg_func = {
          'mean': ['mean'],
     }
  for col in ['category_2','category_3']:
      transactions[col+'_mean'] = transactions['purchase_amount'].groupby(transactions[col]).agg('mean')
      transactions[col+'_max'] = transactions['purchase_amount'].groupby(transactions[col]).agg('max')
      transactions[col+'_min'] = transactions['purchase_amount'].groupby(transactions[col]).agg('min')
      transactions[col+'_sum'] = transactions['purchase_amount'].groupby(transactions[col]).agg('sum')
      agg_func[col+'_mean'] = ['mean']
    
  gc.collect()

  # Create Aggregate features on historical transaction
  merge_hist = aggregate_transaction_hist(transactions, prefix='hist_')
  del transactions
  gc.collect()
  train = pd.merge(train, merge_hist, on='card_id',how='left')
  del merge_hist
  gc.collect()
      
  #Feature Engineering - Adding new features inspired by Chau's first kernel
  train['hist_purchase_date_max'] = pd.to_datetime(train['hist_purchase_date_max'])
  train['hist_purchase_date_min'] = pd.to_datetime(train['hist_purchase_date_min'])
  train['hist_purchase_date_diff'] = (train['hist_purchase_date_max'] - train['hist_purchase_date_min']).dt.days
  train['hist_purchase_date_average'] = train['hist_purchase_date_diff']/train['hist_card_id_size']
  train['hist_purchase_date_uptonow'] = (datetime.today() - train['hist_purchase_date_max']).dt.days
  train['hist_purchase_date_uptomin'] = (datetime.today() - train['hist_purchase_date_min']).dt.days
  train['hist_first_buy'] = (train['hist_purchase_date_min'] - train['first_active_month']).dt.days
  train['hist_last_buy'] = (train['hist_purchase_date_max'] - train['first_active_month']).dt.days

  for feature in ['hist_purchase_date_max','hist_purchase_date_min']:
      train[feature] = train[feature].astype(np.int64) * 1e-9
  gc.collect()

  #Loading data new_merchant_transactions
  new_transactions = reduce_mem_usage(pd.read_csv('/content/new_merchant_transactions.csv').sort_values('card_id',ascending=False))
  new_transactions.replace([-np.inf, np.inf], np.nan, inplace=True)
  print('Preprocessing new_trans...')

  # Feature extraction new transaction
  #impute missing values
  new_transactions['category_2'] = new_transactions['category_2'].fillna(-9999,inplace=True)
  new_transactions['category_3'] = new_transactions['category_3'].fillna('-9999',inplace=True)
  new_transactions['merchant_id'] = new_transactions['merchant_id'].fillna('-9999',inplace=True)
  new_transactions['installments'].replace(-1, np.nan,inplace=True)
  new_transactions['installments'].replace(999, np.nan,inplace=True)
  new_transactions['purchase_amount'] = new_transactions['purchase_amount'].apply(lambda x: min(x, 0.8))

  #Feature Engineering - Adding new features inspired by Chau's first kernel
  new_transactions['authorized_flag'] = new_transactions['authorized_flag'].map({'Y': 1, 'N': 0})
  new_transactions['category_1'] = new_transactions['category_1'].map({'Y': 1, 'N': 0})
  new_transactions['category_3'] = new_transactions['category_3'].map({'A':0, 'B':1, 'C':2}) 

  new_transactions['purchase_date'] = pd.to_datetime(new_transactions['purchase_date'])
  new_transactions['month'] = new_transactions['purchase_date'].dt.month
  new_transactions['weekofyear'] = new_transactions['purchase_date'].dt.weekofyear
  new_transactions['day'] = new_transactions['purchase_date'].dt.day
  new_transactions['weekday'] = new_transactions.purchase_date.dt.weekday
  new_transactions['weekend'] = (new_transactions.purchase_date.dt.weekday >=5).astype(int)
  new_transactions['hour'] = new_transactions['purchase_date'].dt.hour 
  new_transactions['month_diff'] = ((datetime.today() - new_transactions['purchase_date']).dt.days)//30
  new_transactions['month_diff'] += new_transactions['month_lag']

  # additional features
  new_transactions['duration'] = new_transactions['purchase_amount']*new_transactions['month_diff']
  new_transactions['amount_month_ratio'] = new_transactions['purchase_amount']/new_transactions['month_diff']
  new_transactions['price'] = new_transactions['purchase_amount'] / new_transactions['installments']

  aggs = {
          'mean': ['mean'],
      }

  for col in ['category_2','category_3']:
      new_transactions[col+'_mean'] = new_transactions['purchase_amount'].groupby(new_transactions[col]).agg('mean')
      new_transactions[col+'_max'] = new_transactions['purchase_amount'].groupby(new_transactions[col]).agg('max')
      new_transactions[col+'_min'] = new_transactions['purchase_amount'].groupby(new_transactions[col]).agg('min')
      new_transactions[col+'_var'] = new_transactions['purchase_amount'].groupby(new_transactions[col]).agg('var')
      aggs[col+'_mean'] = ['mean']

  gc.collect()

  # Create Aggregate features on new transaction
  merge_new = aggregate_transaction_new(new_transactions, prefix='new_')
  del new_transactions

  train = pd.merge(train, merge_new, on='card_id',how='left')
  del merge_new
  gc.collect()

  #Feature Engineering - Adding new features inspired by Chau's first kernel
  train['new_purchase_date_max'] = pd.to_datetime(train['new_purchase_date_max'])
  train['new_purchase_date_min'] = pd.to_datetime(train['new_purchase_date_min'])
  train['new_purchase_date_diff'] = (train['new_purchase_date_max'] - train['new_purchase_date_min']).dt.days
  train['new_purchase_date_average'] = train['new_purchase_date_diff']/train['new_card_id_size']
  train['new_purchase_date_uptonow'] = (datetime.today() - train['new_purchase_date_max']).dt.days
  train['new_purchase_date_uptomin'] = (datetime.today() - train['new_purchase_date_min']).dt.days
  train['new_first_buy'] = (train['new_purchase_date_min'] - train['first_active_month']).dt.days
  train['new_last_buy'] = (train['new_purchase_date_max'] - train['first_active_month']).dt.days
  for feature in ['new_purchase_date_max','new_purchase_date_min']:
      train[feature] = train[feature].astype(np.int64) * 1e-9
  gc.collect()

  #NEW Features referred from https://www.kaggle.com/mfjwr1/simple-lightgbm-without-blending
  train['card_id_total'] = train['new_card_id_size']+train['hist_card_id_size']
  train['card_id_cnt_total'] = train['new_card_id_count']+train['hist_card_id_count']
  train['card_id_cnt_ratio'] = train['new_card_id_count']/train['hist_card_id_count']
  train['purchase_amount_total'] = train['new_purchase_amount_sum']+train['hist_purchase_amount_sum']
  train['purchase_amount_mean'] = train['new_purchase_amount_mean']+train['hist_purchase_amount_mean']
  train['purchase_amount_max'] = train['new_purchase_amount_max']+train['hist_purchase_amount_max']
  train['purchase_amount_min'] = train['new_purchase_amount_min']+train['hist_purchase_amount_min']
  train['purchase_amount_ratio'] = train['new_purchase_amount_sum']/train['hist_purchase_amount_sum']
  train['month_diff_mean'] = train['new_month_diff_mean']+train['hist_month_diff_mean']
  train['month_diff_ratio'] = train['new_month_diff_mean']/train['hist_month_diff_mean']
  train['month_lag_mean'] = train['new_month_lag_mean']+train['hist_month_lag_mean']
  train['month_lag_max'] = train['new_month_lag_max']+train['hist_month_lag_max']
  train['month_lag_min'] = train['new_month_lag_min']+train['hist_month_lag_min']
  train['category_1_mean'] = train['new_category_1_mean']+train['hist_category_1_mean']
  train['installments_total'] = train['new_installments_sum']+train['hist_installments_sum']
  train['installments_mean'] = train['new_installments_mean']+train['hist_installments_mean']
  train['installments_max'] = train['new_installments_max']+train['hist_installments_max']
  train['installments_ratio'] = train['new_installments_sum']/train['hist_installments_sum']
  train['price_total'] = train['purchase_amount_total'] / train['installments_total']
  train['price_mean'] = train['purchase_amount_mean'] / train['installments_mean']
  train['price_max'] = train['purchase_amount_max'] / train['installments_max']
  train['duration_mean'] = train['new_duration_mean']+train['hist_duration_mean']
  train['duration_min'] = train['new_duration_min']+train['hist_duration_min']
  train['duration_max'] = train['new_duration_max']+train['hist_duration_max']
  train['amount_month_ratio_mean']=train['new_amount_month_ratio_mean']+train['hist_amount_month_ratio_mean']
  train['amount_month_ratio_min']=train['new_amount_month_ratio_min']+train['hist_amount_month_ratio_min']
  train['amount_month_ratio_max']=train['new_amount_month_ratio_max']+train['hist_amount_month_ratio_max']
  train['new_CLV'] = train['new_card_id_count'] * train['new_purchase_amount_sum'] / train['new_month_diff_mean']
  train['hist_CLV'] = train['hist_card_id_count'] * train['hist_purchase_amount_sum'] / train['hist_month_diff_mean']
  train['CLV_ratio'] = train['new_CLV'] / train['hist_CLV']
  gc.collect()

  # Mean encoding
  print("train", train.shape)
  print('Mean encoding...')
  #Mean encoding new_merchants_transactions
  # Loading data
  merchants = reduce_mem_usage(pd.read_csv('/content/merchants.csv'))
  merchants.replace([-np.inf, np.inf], np.nan, inplace=True)
  
  new_transactions = reduce_mem_usage(pd.read_csv('/content/new_merchant_transactions.csv'))
  new_transactions.replace([-np.inf, np.inf], np.nan, inplace=True)

  # Mean encoding
  new_trans_mean = mean_encode(train, new_transactions, merchants, suffix='_new')
  print('new mean encoding Done')
  train = pd.merge(train, new_trans_mean, on='card_id',how='left')
  del new_trans_mean
  print("train", train.shape)
  # Mean encoding hist_merchants_transactions
  # Loading data
  merchants = reduce_mem_usage(pd.read_csv('/content/merchants.csv'))
  merchants.replace([-np.inf, np.inf], np.nan, inplace=True)
  hist_transactions = reduce_mem_usage(pd.read_csv('../content/historical_transactions.csv'))
  hist_transactions.replace([-np.inf, np.inf], np.nan, inplace=True)
  
  # Mean encoding
  hist_trans_mean = mean_encode(train, hist_transactions, merchants, suffix='_hist')
  print('hist mean encoding Done')
  train = pd.merge(train, hist_trans_mean, on='card_id',how='left')
  del hist_trans_mean

  train_col = ['authorized_flag_new', 'category_1_new', 'month_lag_new', 'installments_new','category_2_new',
          'numerical_1_merchants_new', 'category_1_merchants_new', 'most_recent_sales_range_merchants_new',
          'most_recent_purchases_range_merchants_new', 'active_months_lag3_merchants_new', 
          'active_months_lag6_merchants_new', 'active_months_lag12_merchants_new', 'category_4_merchants_new',
          'category_2_merchants_new', 
          'authorized_flag_hist', 'category_1_hist', 'month_lag_hist', 'installments_hist', 'category_2_hist',
          'numerical_1_merchants_hist', 'category_1_merchants_hist', 'most_recent_sales_range_merchants_hist',
          'most_recent_purchases_range_merchants_hist', 'active_months_lag3_merchants_hist', 
          'active_months_lag6_merchants_hist', 'active_months_lag12_merchants_hist', 'category_4_merchants_hist',
          'category_2_merchants_hist']

  for col in train.columns:
      if train[col].isna().any():
          train[col].replace([-np.inf, np.inf, np.nan], train[col].min(), inplace=True)
    
  #deleting basic columns
  del_col = ['feature_1','feature_2','feature_3', 'authorized_flag_new', 'merchant_category_id_new',
       'subsector_id_new', 'category_1_new', 'month_lag_new',
       'installments_new', 'purchase_amount_new', 'city_id_new',
       'state_id_new', 'category_2_new', 'purchase_date_new',
       'merchant_group_id_merchants_new', 'merchant_category_id_merchants_new',
       'subsector_id_merchants_new', 'numerical_1_merchants_new',
       'numerical_2_merchants_new', 'category_1_merchants_new',
       'most_recent_sales_range_merchants_new',
       'most_recent_purchases_range_merchants_new',
       'avg_sales_lag3_merchants_new', 'avg_purchases_lag3_merchants_new',
       'active_months_lag3_merchants_new', 'avg_sales_lag6_merchants_new',
       'avg_purchases_lag6_merchants_new', 'active_months_lag6_merchants_new',
       'avg_sales_lag12_merchants_new', 'avg_purchases_lag12_merchants_new',
       'active_months_lag12_merchants_new', 'category_4_merchants_new',
       'city_id_merchants_new', 'state_id_merchants_new',
       'category_2_merchants_new', 'authorized_flag_hist',
       'merchant_category_id_hist', 'subsector_id_hist', 'category_1_hist',
       'month_lag_hist', 'installments_hist', 'purchase_amount_hist',
       'city_id_hist', 'state_id_hist', 'category_2_hist',
       'purchase_date_hist', 'merchant_group_id_merchants_hist',
       'merchant_category_id_merchants_hist', 'subsector_id_merchants_hist',
       'numerical_1_merchants_hist', 'numerical_2_merchants_hist',
       'category_1_merchants_hist', 'most_recent_sales_range_merchants_hist',
       'most_recent_purchases_range_merchants_hist',
       'avg_sales_lag3_merchants_hist', 'avg_purchases_lag3_merchants_hist',
       'active_months_lag3_merchants_hist', 'avg_sales_lag6_merchants_hist',
       'avg_purchases_lag6_merchants_hist',
       'active_months_lag6_merchants_hist', 'avg_sales_lag12_merchants_hist',
       'avg_purchases_lag12_merchants_hist',
       'active_months_lag12_merchants_hist', 'category_4_merchants_hist',
       'city_id_merchants_hist', 'state_id_merchants_hist',
       'category_2_merchants_hist']
  train = train.drop(del_col, axis = 1)
  train = train.loc[:,train.columns[0:200]]
  Exclude = ['first_active_month', 'target','merchant_id', 'card_id', 'outliers',
                  'hist_purchase_date_max', 'hist_purchase_date_min', 'hist_card_id_size',
                  'new_purchase_date_max', 'new_purchase_date_min', 'new_card_id_size']
  df_train_columns = [c for c in train.columns if c not in Exclude] 
  train = train[df_train_columns]
  train.head()
  #train.to_pickle("/content/train_men.pkl")
  return train


## final_function_1

In [0]:
def final_fun_1(train, transactions = None, new_transactions = None, merchants = None):
    #Preprocessing train and test data
    train= getfeatures_train(train, transactions = None, new_transactions = None, merchants = None)
    filename = '/content/drive/My Drive/Colab Notebooks/CS1/pkl_old/pkl/lgbm1.sav'
    lgbm = pd.read_pickle(filename)
    train_predictions = lgbm.predict(train)
    
    return train_predictions

## final_function_2

In [0]:
def final_fun_2(train, target, transactions = None, new_transactions = None, merchants = None):
    train_predictions = final_fun_1(train, transactions = None, new_transactions = None, merchants = None)
    print("\n Actual value:", target)
    print("\n predicted value:", train_predictions)
    print("\n Root mean squared error: {:<8.5f}".format(mean_squared_error(train_predictions, target)**0.5))

In [50]:
train = reduce_mem_usage(pd.read_csv('/content/train.csv',parse_dates=["first_active_month"]))

Mem. usage decreased to  4.04 Mb (56.2% reduction)


### Metrics: RMSE of single data point 

In [51]:
%%time
final_fun_2(train[0:1], train['target'][0:1])

Mem. usage decreased to 1749.11 Mb (43.7% reduction)
Preprocessing hist_trans...
Mem. usage decreased to 129.17 Mb (42.5% reduction)
Preprocessing new_trans...
train (1, 199)
Mean encoding...
Mem. usage decreased to 30.32 Mb (46.0% reduction)
Mem. usage decreased to 114.20 Mb (45.5% reduction)
new mean encoding Done
train (1, 245)
Mem. usage decreased to 30.32 Mb (46.0% reduction)
Mem. usage decreased to 1749.11 Mb (43.7% reduction)
hist mean encoding Done

 Actual value: 0   -0.820312
Name: target, dtype: float16

 predicted value: [-0.25214436]

 Root mean squared error: 0.56817 
CPU times: user 35min 22s, sys: 52 s, total: 36min 14s
Wall time: 36min 10s
