In [5]:
import pandas as pd 
import numpy as np
import pickle
import datetime
import eda_fe_module as eda_fe
from scipy.stats import gmean,hmean
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import gc

In [2]:
hist_train_preprocessed_obj = pickle.load(open("D:\Elo Merchant Category Recommendation\Train_Test_generated_features\hist_train_preprocessed_obj.pkl","rb"))

In [3]:
hist_train_obj = hist_train_preprocessed_obj[0]
hist_train_obj.columns

Index(['authorized_flag', 'card_id', 'city_id', 'category_1', 'installments',
       'category_3', 'merchant_category_id', 'merchant_id', 'month_lag',
       'purchase_amount', 'purchase_date', 'category_2', 'state_id',
       'subsector_id', 'new_purchase_amount', 'day', 'wday', 'month', 'year',
       'is_purchase_month_end', 'is_purchase_month_start',
       'is_purchase_quarter_start', 'is_purchase_quarter_end',
       'is_purchase_year_end', 'is_purchase_year_start', 'is_christmas',
       'is_mothers_day', 'is_childrens_day', 'is_valentines_day',
       'is_fathers_day'],
      dtype='object')

In [4]:
weekend_indices = (hist_train_obj.wday >= 5)
weekday_indices = (hist_train_obj.wday < 5)

In [5]:
hist_train_obj['weekend'] = np.full(hist_train_obj.shape[0],False)
hist_train_obj['weekday'] = np.full(hist_train_obj.shape[0],False)

hist_train_obj.loc[weekend_indices,'weekend'] = True
hist_train_obj.loc[weekday_indices,'weekday'] = True

hist_train_obj[hist_train_obj['weekend'] == True].head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,...,is_purchase_quarter_end,is_purchase_year_end,is_purchase_year_start,is_christmas,is_mothers_day,is_childrens_day,is_valentines_day,is_fathers_day,weekend,weekday
402,Y,C_ID_5037ff576e,138,N,1,B,705,M_ID_efc106141c,-9,-0.640069,...,False,False,False,False,False,False,False,False,True,False
404,Y,C_ID_5037ff576e,330,N,1,B,705,M_ID_393b4b8cec,-9,-0.67421,...,False,False,False,False,False,False,False,False,True,False
412,Y,C_ID_5037ff576e,193,N,1,B,278,M_ID_f187d5d98c,-10,-0.700326,...,False,False,False,False,False,False,False,False,True,False
417,Y,C_ID_5037ff576e,330,N,1,B,278,M_ID_80235aa624,-11,-0.677786,...,False,False,False,False,False,False,False,False,True,False
418,Y,C_ID_5037ff576e,-1,N,1,B,307,M_ID_d8281a0ff9,-11,-0.303581,...,False,False,False,False,False,False,False,False,True,False


In [6]:
def addPurchaseDateAndMonthDiffCols(s_inp_df): 
    print("Sorting columns by card_id and purchase date...")
    sorted_df = s_inp_df.sort_values(by=['card_id','purchase_date'])
    sorted_df['purchase_date'] = pd.to_datetime(sorted_df['purchase_date']) 
    sorted_df['purchase_date'] = sorted_df['purchase_date'].dt.date

    print("Adding date difference column...")
    sorted_df['date_diff'] = sorted_df.groupby(['card_id'])['purchase_date'].apply(lambda x : x.diff()).astype(str).apply(lambda x : x.split(' ')[0])
    sorted_df['date_diff'].replace(['NaT','nan'],'0',inplace = True)
    sorted_df['date_diff'].astype(int)

    print("Adding purchase amount difference column...")
    sorted_df['purch_amt_diff'] = sorted_df.groupby(['card_id'])['new_purchase_amount'].apply(lambda x : x.diff())
    sorted_df['purch_amt_diff'].replace(np.nan,0,inplace=True)

    reference_date = datetime.date(2018,2,1)
    print("Adding a column which shows difference from reference date...")
    sorted_df['ref_diff'] = sorted_df.groupby(['card_id'])['purchase_date'].apply(lambda x : reference_date-x)

    print("Obtaining days from above reference date difference...")
    sorted_df['ref_days_diff'] = sorted_df['ref_diff']/np.timedelta64(1,'D')
  
    print("Obtaining weeks from above reference date difference...")
    sorted_df['ref_weeks_diff'] = sorted_df['ref_diff']/np.timedelta64(1,'W')

    print("Obtaining months from above reference date difference...")
    sorted_df['ref_month_diff'] = sorted_df['ref_diff']/np.timedelta64(1,'M')

    print("Applying some sort of smoothing since we are going to obtain inf values if we divide by 0...")
    sorted_df['ref_month_diff'].replace(0,0.01,inplace=True)

    print("Creating a ratio of purchase amounts and reference month difference...")
    ## giving more importance to recent months as compared to previous months
    sorted_df['pamount_refmonth_ratio'] = sorted_df['new_purchase_amount']/sorted_df['ref_month_diff']
    
    return sorted_df

In [8]:
s_inp_df = hist_train_obj[['card_id','purchase_date','new_purchase_amount']]
sorted_df = addPurchaseDateAndMonthDiffCols(s_inp_df)
sorted_df.head()

Sorting columns by card_id and purchase date...
Adding date difference column...
Adding purchase amount difference column...
Adding a column which shows difference from reference date...
Obtaining days from above reference date difference...
Obtaining weeks from above reference date difference...
Obtaining months from above reference date difference...
Applying some sort of smoothing since we are going to obtain inf values if we divide by 0...
Creating a ratio of purchase amounts and reference month difference...


Unnamed: 0,card_id,purchase_date,new_purchase_amount,date_diff,purch_amt_diff,ref_diff,ref_days_diff,ref_weeks_diff,ref_month_diff,pamount_refmonth_ratio
19095896,C_ID_00007093c1,2017-02-14,521.799988,0,0.0,352 days,352.0,50.285714,11.564919,45.119207
19095775,C_ID_00007093c1,2017-02-14,186.0,0,-335.799988,352 days,352.0,50.285714,11.564919,16.083121
19095845,C_ID_00007093c1,2017-02-16,116.0,2,-70.0,350 days,350.0,50.0,11.499209,10.08765
19095866,C_ID_00007093c1,2017-02-20,276.399994,4,160.399994,346 days,346.0,49.428571,11.36779,24.314312
19095808,C_ID_00007093c1,2017-03-03,175.0,11,-101.399994,335 days,335.0,47.857143,11.006386,15.89986


In [10]:
sorted_df.shape

(18030009, 10)

In [11]:
hist_train_obj.shape

(18030009, 32)

In [13]:
pickle.dump(sorted_df,open("4th_Iteration_features/duration_diff_features.pkl","wb"))
pickle.dump(hist_train_obj,open("4th_Iteration_features/hist_train_obj.pkl","wb"))

In [14]:
del(hist_train_obj)

In [16]:
import gc
gc.collect()

14068

In [17]:
def createObservationDate(s_inp_df,trans_type):
    ## if new_transactions then substract month lag
    ## if hiistorical transactions then keep it as it is
    if trans_type == 'new':
        temp_df = s_inp_df.groupby('card_id').agg({'month_lag' : 'min', 'purchase_date' : 'min'}).reset_index()
        temp_df.columns = ['card_id', 'new_month_lag', 'new_purchase_date']
        temp_df['new_purchase_date'] = pd.to_datetime(temp_df['new_purchase_date'])
        temp_df['observation_date'] = temp_df.apply(lambda x: x['new_purchase_date']  - pd.DateOffset(months=x['new_month_lag']-1), axis=1)
        return temp_df[['card_id','observation_date']]
    elif trans_type == 'hist':
        temp_df = s_inp_df.groupby('card_id').agg({'month_lag' : 'max', 'purchase_date' : 'max'}).reset_index()
        temp_df.columns = ['card_id', 'new_month_lag', 'new_purchase_date']
        temp_df['new_purchase_date'] = pd.to_datetime(temp_df['new_purchase_date'])
        temp_df['observation_date'] = temp_df.apply(lambda x: x['new_purchase_date']  - pd.DateOffset(months=x['new_month_lag']), axis=1)
        return temp_df[['card_id','observation_date']]

In [52]:
def create4thIterationFeatures(trans_type,trans_df):
    
    print("Creating features in 1st round...")
    aggr_df = pd.DataFrame({'card_id':trans_df.card_id.unique()})
    aggr_df['purchase_date_max'] = trans_df.groupby(['card_id'])['purchase_date'].apply(lambda x : x.max()) 
    aggr_df['purchase_date_min'] = trans_df.groupby(['card_id'])['purchase_date'].apply(lambda x : x.min())
    aggr_df['transactions_count'] = trans_df.groupby(['card_id'])['purchase_date'].apply(lambda x : x.count())
    aggr_df['transactions_count'] = trans_df.groupby(['card_id'])['purchase_date'].apply(lambda x : x.count())
    aggr_df = pd.merge(aggr_df,createObservationDate(trans_df,trans_type),on="card_id",how="left")
    print("Shape of dataframe : ",aggr_df.shape)
    
    print("Creating features in 2nd round...")
    aggregate_dict = {
        'category_1' : ['mean'],
        'installments' : ['sum','mean','max'],
        'card_id' : ['count','size'],
        'new_purchase_amount' : [('sum','sum'),
                                ('perc_75',lambda x : np.percentile(x,q=75)),
                                ('perc_25',lambda x : np.percentile(x,q=25)),
                                ('mean','mean'),
                                ('head_sum',lambda x : x.head().sum()),
                                ('head_max',lambda x : x.head().max()),
                                ('tail_sum',lambda x : x.tail().sum()),
                                ('tail_max',lambda x : x.tail().max()),
                                ('gmean',lambda x : gmean),
                                ('hmean',lambda x : hmean)]
    }

    temp_df = trans_df.groupby(['card_id']).agg(aggregate_dict)
    temp_df.columns = ['_'.join(col).strip() for col in temp_df.columns.values]
    temp_df.reset_index(inplace=True)
    aggr_df = pd.merge(aggr_df,temp_df,on='card_id',how='left')
    print("Shape of dataframe : ",aggr_df.shape)
    
    print("Creating features in 3rd round...")
    cat_cols = ['category_1','category_2','category_3']
    for col in cat_cols:
        temp_df = eda_fe.generateAggrColumns(trans_df,col,'new_purchase_amount',['mean','sum'],col,True)
        aggr_df = pd.merge(aggr_df,temp_df,how='left',on='card_id')
    print("Shape of dataframe : ",aggr_df.shape)
    
    print("Creating features in 4th round...")
    temp_df = eda_fe.createNumericalAggr(trans_df,'card_id','new_purchase_amount')
    aggr_df = pd.merge(aggr_df,temp_df,how='left',on='card_id')
    aggr_df.columns = [trans_type + '_' + col for col in aggr_df.columns]
    print("Shape of dataframe : ",aggr_df.shape)
    
    return aggr_df


In [22]:
del(sorted_df)
gc.collect()

10565

In [23]:
hist_train_preprocessed_obj = pickle.load(open("D:\Elo Merchant Category Recommendation\Train_Test_generated_features\hist_train_preprocessed_obj.pkl","rb"))
hist_train_preprocessed_obj[0].shape

(18030009, 30)

In [12]:
def labelEncodedFeatures(temp_df):
    for col in temp_df.columns:
        print("Column considered : ",col)
        label_encoder = LabelEncoder()
        temp_df[col] = label_encoder.fit_transform(temp_df[col].values)
    return temp_df

In [53]:
hist_aggr_feat = create4thIterationFeatures("hist",hist_train_obj)

Creating features in 1st round...
Shape of dataframe :  (201917, 7)
Creating features in 2nd round...
Shape of dataframe :  (201917, 23)
Creating features in 3rd round...


  0%|                                                                                            | 0/2 [00:00<?, ?it/s]

Category of Column :  0


 50%|██████████████████████████████████████████                                          | 1/2 [00:17<00:17, 17.20s/it]

Category of Column :  1


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:19<00:00,  9.63s/it]

***********Merge_df**********
(201917, 5)



  0%|                                                                                            | 0/5 [00:00<?, ?it/s]

Category of Column :  3.0


 20%|████████████████▊                                                                   | 1/5 [00:02<00:09,  2.27s/it]

Category of Column :  1.0


 40%|█████████████████████████████████▌                                                  | 2/5 [00:11<00:13,  4.48s/it]

***********Merge_df**********
(195097, 5)
Category of Column :  5.0


 60%|██████████████████████████████████████████████████▍                                 | 3/5 [00:14<00:07,  3.88s/it]

***********Merge_df**********
(199139, 7)
Category of Column :  2.0


 80%|███████████████████████████████████████████████████████████████████▏                | 4/5 [00:15<00:03,  3.00s/it]

***********Merge_df**********
(200390, 9)
Category of Column :  4.0


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:17<00:00,  3.46s/it]

***********Merge_df**********
(201917, 11)



  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

Category of Column :  1


 33%|████████████████████████████                                                        | 1/3 [00:06<00:13,  6.73s/it]

Category of Column :  2


 67%|████████████████████████████████████████████████████████                            | 2/3 [00:08<00:05,  5.22s/it]

***********Merge_df**********
(150897, 5)
Category of Column :  0


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:16<00:00,  5.62s/it]

***********Merge_df**********
(201917, 7)





Shape of dataframe :  (201917, 43)
Creating features in 4th round...
Shape of dataframe :  (201917, 48)


In [30]:
## Do it for new transaction features
## add these features
## feed it into the system
## See if there is any improvement

In [54]:
pickle.dump(hist_aggr_feat,open("4th_Iteration_features/hist_aggr_feat.pkl","wb"))

In [55]:
new_train_preprocessed_obj = pickle.load(open("Train_Test_generated_features/new_train_preprocessed_obj.pkl","rb"))
new_train_obj = new_train_preprocessed_obj[0]

In [62]:
new_train_obj.category_3.fillna('A',inplace=True)

In [63]:
label_encoder = LabelEncoder()
new_train_obj['category_3'] = label_encoder.fit_transform(new_train_obj['category_3'].values)

In [64]:
new_aggr_feat = create4thIterationFeatures("new",new_train_obj)

Creating features in 1st round...
Shape of dataframe :  (179986, 7)
Creating features in 2nd round...


  x2 = take(ap, indices_above, axis=axis) * weights_above
 50%|██████████████████████████████████████████                                          | 1/2 [00:00<00:00,  6.45it/s]

Shape of dataframe :  (179986, 23)
Creating features in 3rd round...
Category of Column :  1
Category of Column :  0


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.10it/s]

***********Merge_df**********
(179986, 5)



  0%|                                                                                            | 0/6 [00:00<?, ?it/s]

Category of Column :  nan
Category of Column :  1.0


 33%|████████████████████████████                                                        | 2/6 [00:01<00:02,  1.96it/s]

***********Merge_df**********
(120181, 5)
Category of Column :  3.0


 50%|██████████████████████████████████████████                                          | 3/6 [00:01<00:01,  1.98it/s]

***********Merge_df**********
(142708, 7)
Category of Column :  2.0


 67%|████████████████████████████████████████████████████████                            | 4/6 [00:01<00:00,  2.28it/s]

***********Merge_df**********
(148262, 9)
Category of Column :  4.0


 83%|██████████████████████████████████████████████████████████████████████              | 5/6 [00:02<00:00,  2.33it/s]

***********Merge_df**********
(157889, 11)
Category of Column :  5.0


100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:02<00:00,  2.22it/s]

***********Merge_df**********
(174354, 13)



  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

Category of Column :  1


 33%|████████████████████████████                                                        | 1/3 [00:00<00:01,  1.49it/s]

Category of Column :  0


 67%|████████████████████████████████████████████████████████                            | 2/3 [00:01<00:00,  1.28it/s]

***********Merge_df**********
(174460, 5)
Category of Column :  2


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:02<00:00,  1.34it/s]

***********Merge_df**********
(179986, 7)





Shape of dataframe :  (179986, 45)
Creating features in 4th round...
Shape of dataframe :  (179986, 50)


In [65]:
pickle.dump(new_aggr_feat,open("4th_Iteration_features/new_aggr_feat.pkl","wb"))

In [7]:
hist_aggr_feat = pickle.load(open("4th_Iteration_features/hist_aggr_feat.pkl","rb"))
new_aggr_feat = pickle.load(open("4th_Iteration_features/new_aggr_feat.pkl","rb")) 

In [8]:
hist_aggr_feat.columns

Index(['hist_card_id', 'hist_purchase_date_max', 'hist_purchase_date_min',
       'hist_transactions_count', 'hist_new_month_lag',
       'hist_new_purchase_date', 'hist_observation_date',
       'hist_category_1_mean', 'hist_installments_sum',
       'hist_installments_mean', 'hist_installments_max', 'hist_card_id_count',
       'hist_card_id_size', 'hist_new_purchase_amount_sum_x',
       'hist_new_purchase_amount_perc_75', 'hist_new_purchase_amount_perc_25',
       'hist_new_purchase_amount_mean', 'hist_new_purchase_amount_head_sum',
       'hist_new_purchase_amount_head_max',
       'hist_new_purchase_amount_tail_sum',
       'hist_new_purchase_amount_tail_max', 'hist_new_purchase_amount_gmean',
       'hist_new_purchase_amount_hmean', 'hist_category_1_0_mean',
       'hist_category_1_0_sum', 'hist_category_1_1_mean',
       'hist_category_1_1_sum', 'hist_category_2_3.0_mean',
       'hist_category_2_3.0_sum', 'hist_category_2_1.0_mean',
       'hist_category_2_1.0_sum', 'hist_cate

In [9]:
new_aggr_feat.columns

Index(['new_card_id', 'new_purchase_date_max', 'new_purchase_date_min',
       'new_transactions_count', 'new_new_month_lag', 'new_new_purchase_date',
       'new_observation_date', 'new_category_1_mean', 'new_installments_sum',
       'new_installments_mean', 'new_installments_max', 'new_card_id_count',
       'new_card_id_size', 'new_new_purchase_amount_sum_x',
       'new_new_purchase_amount_perc_75', 'new_new_purchase_amount_perc_25',
       'new_new_purchase_amount_mean', 'new_new_purchase_amount_head_sum',
       'new_new_purchase_amount_head_max', 'new_new_purchase_amount_tail_sum',
       'new_new_purchase_amount_tail_max', 'new_new_purchase_amount_gmean',
       'new_new_purchase_amount_hmean', 'new_category_1_1_mean',
       'new_category_1_1_sum', 'new_category_1_0_mean', 'new_category_1_0_sum',
       'new_category_2_nan_mean', 'new_category_2_nan_sum',
       'new_category_2_1.0_mean', 'new_category_2_1.0_sum',
       'new_category_2_3.0_mean', 'new_category_2_3.0_sum',
  

In [10]:
hist_datetime_feat = hist_aggr_feat.select_dtypes('datetime')
new_datetime_feat = new_aggr_feat.select_dtypes('datetime')

In [13]:
hist_aggr_feat = hist_aggr_feat.drop(hist_datetime_feat,axis=1)
new_aggr_feat = new_aggr_feat.drop(new_datetime_feat,axis=1)

In [37]:
drop_cols = hist_aggr_feat.select_dtypes('O').columns

In [38]:
hist_features = hist_aggr_feat.drop(['hist_card_id'] + list(drop_cols),axis=1)

In [16]:
print("Number of hist trans features : ",hist_aggr_feat.shape[1])
print("Number of new trans features : ",new_aggr_feat.shape[1]) 

Number of hist trans features :  44
Number of new trans features :  46


In [53]:
## Some feature interaction between historic and new transactions
#del(total_aggr_df)
rat_prod_aggr_df = pd.DataFrame({'card_id' : hist_aggr_feat['hist_card_id']})

for col in hist_features:
    print("Creating feature interaction for : ",col)
    col_name = list(col.split('_'))
    col_name[0] = 'new'
    new_col_name = '_'.join(col_name)
    #col_name[0] = 'total'
    #total_col_name = '_'.join(col_name)
    col_name[0] = 'ratio'
    ratio_col_name = '_'.join(col_name)
    col_name[0] = 'prod'
    prod_col_name = '_'.join(col_name)
    
    #total_aggr_df[total_col_name] = hist_aggr_feat[col] + new_aggr_feat[new_col_name]
    rat_prod_aggr_df[ratio_col_name] = new_aggr_feat[new_col_name]/hist_aggr_feat[col]
    rat_prod_aggr_df[prod_col_name] = new_aggr_feat[new_col_name]*hist_aggr_feat[col]

Creating feature interaction for :  hist_transactions_count
Creating feature interaction for :  hist_new_month_lag
Creating feature interaction for :  hist_category_1_mean
Creating feature interaction for :  hist_installments_sum
Creating feature interaction for :  hist_installments_mean
Creating feature interaction for :  hist_installments_max
Creating feature interaction for :  hist_card_id_count
Creating feature interaction for :  hist_card_id_size
Creating feature interaction for :  hist_new_purchase_amount_sum_x
Creating feature interaction for :  hist_new_purchase_amount_perc_75
Creating feature interaction for :  hist_new_purchase_amount_perc_25
Creating feature interaction for :  hist_new_purchase_amount_mean
Creating feature interaction for :  hist_new_purchase_amount_head_sum
Creating feature interaction for :  hist_new_purchase_amount_head_max
Creating feature interaction for :  hist_new_purchase_amount_tail_sum
Creating feature interaction for :  hist_new_purchase_amount_ta

In [50]:
total_aggr_df.head()

Unnamed: 0,card_id,ratio_transactions_count,prod_transactions_count,ratio_new_month_lag,prod_new_month_lag,ratio_category_1_mean,prod_category_1_mean,ratio_installments_sum,prod_installments_sum,ratio_installments_mean,...,ratio_new_purchase_amount_min,prod_new_purchase_amount_min,ratio_new_purchase_amount_max,prod_new_purchase_amount_max,ratio_new_purchase_amount_median,prod_new_purchase_amount_median,ratio_new_purchase_amount_sum_y,prod_new_purchase_amount_sum_y,ratio_new_purchase_amount_std,prod_new_purchase_amount_std
0,C_ID_5037ff576e,,,inf,0.0,inf,0.0,0.290323,4464.0,1.557185,...,1.392228,572.59314,inf,inf,1.33455,7032.155762,inf,inf,,
1,C_ID_0e171c1b48,,,inf,0.0,,0.0,3.0,27.0,418.5,...,189.75,189.75,1.003333,90300.0,19.629999,3067.1875,0.066808,3604892.5,1.785139,3468.481445
2,C_ID_fc8e41b9cf,,,inf,0.0,0.0,0.0,0.0,0.0,0.0,...,1.9,190.0,inf,inf,2.382911,14871.75,inf,inf,,
3,C_ID_b271e7ab60,,,inf,0.0,,0.0,0.0,0.0,0.0,...,3.77907,69.875,inf,inf,1.817271,5475.277344,inf,inf,,
4,C_ID_4bed29d75c,,,inf,0.0,0.0,0.0,0.0,0.0,0.0,...,3.25,3.25,0.378598,164917.5,4.364549,975.487488,0.388529,5015884.0,0.78751,7987.800293


In [47]:
pickle.dump(total_aggr_df,open("4th_Iteration_features/total_feat.pkl","wb"))

In [48]:
total_aggr_df.shape

(201917, 42)

In [54]:
pickle.dump(rat_prod_aggr_df,open("4th_Iteration_features/rat_prod_aggr_feat.pkl","wb"))

## Working on creating remaining features

In [3]:
temp_merch_df = hist_train_preprocessed_obj[1]
temp_merch_df.head()

Unnamed: 0,card_id,merchant_id,merchant_group_id,merchant_category_id,subsector_id,numerical_1,numerical_2,category_1,most_recent_sales_range,most_recent_purchases_range,...,avg_sales_lag6,avg_purchases_lag6,active_months_lag6,avg_sales_lag12,avg_purchases_lag12,active_months_lag12,category_4,city_id,state_id,category_2
0,C_ID_5037ff576e,M_ID_b61c7d1be0,91632.0,278.0,37.0,-0.057465,-0.057465,N,D,D,...,0.88,0.934748,6.0,0.89,0.91968,12.0,N,-1.0,11.0,3.0
1,C_ID_5037ff576e,M_ID_fe69229f24,56893.0,307.0,19.0,0.011932,0.011932,N,B,C,...,0.81,0.951783,6.0,0.8,0.947394,12.0,N,-1.0,15.0,1.0
2,C_ID_5037ff576e,M_ID_efc106141c,21026.0,705.0,33.0,-0.057465,-0.057465,N,D,D,...,1.06,0.998241,6.0,1.1,1.021108,12.0,N,-1.0,15.0,1.0
3,C_ID_5037ff576e,M_ID_708022307c,16621.0,307.0,19.0,0.120972,-0.057465,N,A,B,...,0.84,1.029186,6.0,0.8,1.014457,12.0,N,-1.0,16.0,1.0
4,C_ID_5037ff576e,M_ID_393b4b8cec,35.0,705.0,33.0,3.689453,3.630859,N,B,B,...,1.01,1.048611,6.0,1.02,1.026715,12.0,N,-1.0,17.0,3.0


In [4]:
temp_merch_df.shape

(18786148, 23)

In [6]:
del(hist_train_preprocessed_obj)
gc.collect() 

15279

In [8]:
new_train_preprocessed_obj = pickle.load(open("Train_Test_generated_features/new_train_preprocessed_obj.pkl","rb"))
new_temp_merch_df = new_train_preprocessed_obj[1]
new_temp_merch_df.shape

(1223315, 23)

In [9]:
del(new_train_preprocessed_obj)
gc.collect()

20481

In [20]:
temp_merch_df['category_4'].fillna('Y',inplace=True)

In [None]:
label_encoded_df = labelEncodedFeatures(temp_merch_df[['category_4']])
temp_merch_df['category_4'] = label_encoded_df['category_4']

In [26]:
def createMerchRelatedFeat(temp_merch_df,suffix):
    merch_aggr_df = pd.DataFrame({'card_id' : temp_merch_df.card_id.unique()})
    merch_aggr_df['merchant_id_nunique'] = temp_merch_df.groupby(['card_id'])['merchant_id'].apply(lambda x : x.nunique())
    merch_aggr_df['merchant_category_id_nunique'] = temp_merch_df.groupby(['card_id'])['merchant_category_id'].apply(lambda x : x.nunique())
    merch_aggr_df['category_4_mean'] = temp_merch_df.groupby(['card_id'])['category_4'].apply(lambda x : x.mean())
    merch_aggr_df['numerical_1_mean'] = temp_merch_df.groupby(['card_id'])['numerical_1'].apply(lambda x : x.mean())
    merch_aggr_df['numerical_2_mean'] = temp_merch_df.groupby(['card_id'])['numerical_2'].apply(lambda x : x.mean())
    merch_aggr_df.columns = [suffix + "_" + col for col in merch_aggr_df.columns]
    return merch_aggr_df

In [24]:
new_temp_merch_df['category_4'].fillna('Y',inplace=True)
label_encoded_df = labelEncodedFeatures(new_temp_merch_df[['category_4']])
new_temp_merch_df['category_4'] = label_encoded_df['category_4'] 

Column considered :  category_4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [27]:
print("Creating features for hist_temp_merch_df...")
hist_merch_feat = createMerchRelatedFeat(temp_merch_df,'hist')
print("Creating features for new_temp_merch_df...") 
new_merch_feat = createMerchRelatedFeat(new_temp_merch_df,'new')

Creating features for hist_temp_merch_df...
Creating features for new_temp_merch_df...


In [28]:
hist_merch_feat.head()

Unnamed: 0,hist_card_id,hist_merchant_id_nunique,hist_merchant_category_id_nunique,hist_category_4_mean,hist_numerical_1_mean,hist_numerical_2_mean
0,C_ID_5037ff576e,,,,,
1,C_ID_0e171c1b48,,,,,
2,C_ID_fc8e41b9cf,,,,,
3,C_ID_b271e7ab60,,,,,
4,C_ID_4bed29d75c,,,,,


In [29]:
new_merch_feat.head()

Unnamed: 0,new_card_id,new_merchant_id_nunique,new_merchant_category_id_nunique,new_category_4_mean,new_numerical_1_mean,new_numerical_2_mean
0,C_ID_ef55cf8d4b,,,,,
1,C_ID_446027a629,,,,,
2,C_ID_fb7560dfea,,,,,
3,C_ID_3631eda7b9,,,,,
4,C_ID_9571a629f5,,,,,
