In [10]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd
import numpy as np

from collections import Counter
import warnings
warnings.filterwarnings('ignore')

target_cols = ['ind_ahor_fin_ult1','ind_aval_fin_ult1','ind_cco_fin_ult1','ind_cder_fin_ult1','ind_cno_fin_ult1',
               'ind_ctju_fin_ult1','ind_ctma_fin_ult1','ind_ctop_fin_ult1','ind_ctpp_fin_ult1','ind_deco_fin_ult1',
               'ind_deme_fin_ult1','ind_dela_fin_ult1','ind_ecue_fin_ult1','ind_fond_fin_ult1','ind_hip_fin_ult1',
               'ind_plan_fin_ult1','ind_pres_fin_ult1','ind_reca_fin_ult1','ind_tjcr_fin_ult1','ind_valo_fin_ult1',
               'ind_viv_fin_ult1','ind_nomina_ult1','ind_nom_pens_ult1','ind_recibo_ult1']

### Feature Engineering

In [11]:
# read data in
train = pd.read_csv("filtered_train_w_lag.csv")
test  = pd.read_csv("test_ver2.csv")
print(train.info(memory_usage=True))
print(test.info(memory_usage=True))



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3993880 entries, 0 to 3993879
Data columns (total 48 columns):
fecha_dato               object
ncodpers                 int64
ind_empleado             object
pais_residencia          object
sexo                     object
age                      object
fecha_alta               object
ind_nuevo                float64
antiguedad               object
indrel                   float64
ult_fec_cli_1t           object
indrel_1mes              object
tiprel_1mes              object
indresi                  object
indext                   object
conyuemp                 object
canal_entrada            object
indfall                  object
tipodom                  float64
cod_prov                 float64
nomprov                  object
ind_actividad_cliente    float64
renta                    float64
segmento                 object
ind_ahor_fin_ult1        int64
ind_aval_fin_ult1        int64
ind_cco_fin_ult1         int64
ind_cder_fin_ult1    

In [12]:
# it seems that some columns have mixed dtype. We need to unify everything
print(Counter(train.fecha_dato))
print(Counter(test.fecha_dato))

train_rows = train.shape[0]
print(train.shape)
test_rows = test.shape[0]
print(test.shape)

trainX = train.iloc[:,:24]
trainY = train.iloc[:,24:]
print(trainX.shape)

Counter({'2016-05-28': 931453, '2015-11-28': 906109, '2015-10-28': 892251, '2015-06-28': 632110, '2015-05-28': 631957})
Counter({'2016-06-28': 929615})
(3993880, 48)
(929615, 24)
(3993880, 24)


In [13]:
# combine train and test dataset together for preprocessing and cleaning
combined = pd.concat([trainX, test], axis=0)
print("The shape is ", combined.shape)

# the dtype of some columns are not right, correct them, however, you can't convert nan to int in pandas, so use float
combined.age = pd.to_numeric(combined.age, errors='coerce')
combined.renta = pd.to_numeric(combined.renta, errors='coerce')
combined.antiguedad = pd.to_numeric(combined.antiguedad, errors='coerce')
combined.indrel_1mes = pd.to_numeric(combined.indrel_1mes, errors='coerce')

# the 'conyuemp' column is empty at 100000, so remove it
#combined.drop('conyuemp', 1, inplace=True)

# tipodom doesn't seem to be useful, so I am going to drop it
#combined.drop('tipodom', 1, inplace=True)

# As 99% of 'ult_fec_cli_1t' information missing, so I am going to drop it
#combined.drop('ult_fec_cli_1t', 1, inplace=True)

# as we have both cod_prov and nomprov, they represent the same thing. so drop nomprov
combined.drop('nomprov', 1, inplace=True)

# separate numeric from categorical data
combined_num = combined.select_dtypes(exclude=['object'])
combined_cat = combined.select_dtypes(include=['object'])

del combined

The shape is  (4923495, 24)


### Handle Missing Data

In [14]:
### For AGE ###
# for missing age value
combined_num.age.fillna(39, inplace=True)

# replace all the age < 18 to ave of 18-30 and > 90 with ave of 30-90
combined_num.loc[combined_num.age < 18, 'age'] = 23
combined_num.loc[combined_num.age > 90, 'age'] = 50
combined_num.age = combined_num.age.astype(int)

#### for ind_nuevo (New customer)
combined_num['ind_nuevo'].fillna(1.0, inplace=True)

#### for antiguedad (antiquity or Seniority), use the 25 percentile, which is 23
combined_num['antiguedad'].fillna(23, inplace=True)

#### for 'indrel' column, which indicates
# 1 (First/Primary), 99 (Primary customer during the month but not at the end of the month)
# use the most common one which is 1
combined_num['indrel'].fillna(1.0, inplace=True)

#### for 'indrel_1mes' column
# As suggested by @StephenSmith
map_dict = { 1.0  : "1",
            "1.0" : "1",
            "1"   : "1",
            "3.0" : "3",
            "P"   : "5",
            3.0   : "3",
            2.0   : "2",
            "3"   : "3",
            "2.0" : "2",
            4.0   : "4",
            "4"   : "4",
            "2"   : "2"}

combined_num.indrel_1mes.fillna("P",inplace=True)
combined_num.indrel_1mes = combined_num.indrel_1mes.apply(lambda x: map_dict.get(x,x))
combined_num.indrel_1mes = combined_num.indrel_1mes.astype("category")
print(Counter(combined_num.indrel_1mes))

#### for tipodom: (Addres type. 1, primary address), 
# since this one doesn't provide any useful information, so will drop it eventually
combined_num['tipodom'].fillna(1.0, inplace=True)

#### for cod_prov: provincial code for the address, use '0' for the unknown code
combined_num['cod_prov'].fillna(0, inplace=True)

#### ind_actividad_cliente  (Ind_activity customer), use '2.0 for the unknown value
combined_num['ind_actividad_cliente'].fillna(2.0, inplace=True)

#### renta (rental), assign them region by region
rental_dict = {0.0: 190864.28134387353, 1.0: 116454.21824999999, 2.0: 83059.639880275121, 3.0: 87347.808423990966, 
               4.0: 85374.23584159567, 5.0: 76816.518459868807, 6.0: 72176.18582748514, 7.0: 171995.86848114064, 
               8.0: 164672.39357774809, 9.0: 97878.796620592184, 10.0: 75372.170251322314, 11.0: 98648.098741735899, 
               12.0: 79174.239231086744, 13.0: 69896.499221334176, 14.0: 85622.935385531295, 15.0: 112788.86299594035, 
               16.0: 69949.635891681086, 17.0: 144340.73909940803, 18.0: 96553.136598167126, 19.0: 95550.728265982965, 
               20.0: 139632.71175824184, 21.0: 76687.208433018008, 22.0: 89229.944358405774, 23.0: 77132.018859586417, 
               24.0: 93387.127306748385, 25.0: 81230.193312456133, 26.0: 99642.268904048149, 27.0: 76675.52248979923, 
               28.0: 178865.27951727214, 29.0: 121200.83724370257, 30.0: 79075.0041727278, 31.0: 105811.42276119406, 
               32.0: 83299.617043690712, 33.0: 101398.81625954412, 34.0: 92783.771758008908, 35.0: 100323.19958599037, 
               36.0: 113524.32363071061, 37.0: 105792.44947409695, 38.0: 102717.47302208183, 39.0: 121197.46123931887, 
               40.0: 98489.164981292226, 41.0: 117422.48873751337, 42.0: 88050.127969305482, 43.0: 104578.57903186574, 
               44.0: 87686.596356093884, 45.0: 80594.888512939171, 46.0: 89768.963794916985, 47.0: 101652.21498126048, 
               48.0: 110186.07268421052, 49.0: 83348.77014432376, 50.0: 110563.52870700191, 51.0: 199147.44231460703, 
               52.0: 149861.65298934001}

for pcode in rental_dict.keys():
    # fetch rows that are within the pcode and with 'renta' value is np.nan
    idx = combined_num.loc[combined_num.cod_prov.isin([pcode]) & combined_num.renta.isnull(), 'renta'].index.tolist()
    if idx:
        #print(idx)
        combined_num.ix[idx, 'renta'] = rental_dict[pcode]

### For categorical columns ###

### For fecha_alta (joined date)
# Some entries don't have the date they joined the company. Just give them something in the middle of the pack
combined_cat.loc[combined_cat.fecha_alta.isnull(),"fecha_alta"] = '2011-08-31'

#### For conyuemp, assigned an unknown value
combined_cat['conyuemp'].fillna('U', inplace=True)

#### For ind_empleado (employed or employment), I will assign the most common one 'N'
print(Counter(combined_cat.ind_empleado))
combined_cat.loc[combined_cat.ind_empleado.isnull(), 'ind_empleado'] = 'N'

#### For pais_residencia (Country of residency), use the most common one: 'ES'
combined_cat.loc[combined_cat.pais_residencia.isnull(), 'pais_residencia'] = 'ES'

#### For sexo, use unknown category
combined_cat.loc[combined_cat.sexo.isnull(), 'sexo'] = 'U'

#### for ult_fec_cli_1t
combined_cat.loc[combined_cat.ult_fec_cli_1t.isnull(), 'ult_fec_cli_1t'] = '2011-01-11'

#### For indfall, use the most common one
combined_cat.loc[combined_cat.indfall.isnull(), 'indfall'] = 'N'

#### For tiprel_1mes, use an unknown category
combined_cat.loc[combined_cat.tiprel_1mes.isnull(), 'tiprel_1mes'] = 'U'

#### For indresi, use the most common one
combined_cat.loc[combined_cat.indresi.isnull(), 'indresi'] = 'S'

#### For indext, use the most common one
combined_cat.loc[combined_cat.indext.isnull(), 'indext'] = 'N'

#### For canal_entrada (input channel), use an unknown one
combined_cat.loc[combined_cat.canal_entrada.isnull(), 'canal_entrada'] = 'AAA'
print(Counter(combined_cat.canal_entrada))

#### For segmento (segment), use the unknown one
combined_cat.loc[combined_cat.segmento.isnull(), 'segmento'] = '04 - unknown'

### For trainY
#### For ind_nomina_ult1 and ind_nom_pens_ult1, they only in training dataset. 
# Here I will assign them with the most common value, which is '0'
trainY.loc[trainY.ind_nomina_ult1.isnull(), 'ind_nomina_ult1'] = 0
trainY.loc[trainY.ind_nom_pens_ult1.isnull(), 'ind_nom_pens_ult1'] = 0

Counter({'1': 4871391, '5': 50353, '3': 1278, '2': 383, '4': 90})
Counter({'N': 4914996, nan: 5458, 'B': 1262, 'F': 893, 'A': 880, 'S': 6})
Counter({'KHE': 1433618, 'KAT': 1156496, 'KFC': 1098407, 'KHQ': 255253, 'KFA': 145262, 'KHK': 90674, 'KHM': 90368, 'AAA': 62617, 'KHN': 55324, 'KHD': 41370, 'KAS': 30582, 'RED': 28942, 'KAG': 26290, 'KAY': 23912, 'KAA': 23623, 'KAB': 22105, 'KAE': 17993, 'KCC': 17454, 'KHL': 16975, 'KBZ': 16466, 'KFD': 15788, 'KAI': 13349, 'KEY': 12458, 'KAW': 12146, 'KAR': 11575, 'KAZ': 11417, 'KAF': 10806, '007': 10463, '013': 9603, 'KCI': 9433, 'KAH': 8807, 'KAJ': 8642, 'KCH': 8571, 'KHF': 7410, 'KAQ': 6399, 'KHC': 5846, 'KAP': 5317, 'KHO': 4356, 'KAM': 4007, 'KAD': 3782, 'KFP': 3366, 'KGX': 3362, 'KEJ': 3288, 'KGV': 3172, 'KDR': 2858, 'KFT': 2855, 'KAC': 2727, 'KAL': 2691, 'KBO': 2623, 'KBH': 2553, 'KFG': 2412, 'KFS': 2388, 'KAO': 2372, 'KFJ': 2355, 'KES': 2088, 'KEW': 2014, 'KFF': 1967, 'KCG': 1913, 'KCB': 1837, 'KFU': 1750, 'KEN': 1742, 'KFN': 1607, 'KCL': 14

In [15]:
print("For categorical datan\n", combined_cat.isnull().any())
print("\nFor numeric data:\n", combined_num.isnull().any())
print("\nFor Y training data\n", trainY.isnull().any())
#print("\nshape of the combined data\n", combined.shape)

For categorical datan
 fecha_dato         False
ind_empleado       False
pais_residencia    False
sexo               False
fecha_alta         False
ult_fec_cli_1t     False
tiprel_1mes        False
indresi            False
indext             False
conyuemp           False
canal_entrada      False
indfall            False
segmento           False
dtype: bool

For numeric data:
 ncodpers                 False
age                      False
ind_nuevo                False
antiguedad               False
indrel                   False
indrel_1mes              False
tipodom                  False
cod_prov                 False
ind_actividad_cliente    False
renta                    False
dtype: bool

For Y training data
 ind_ahor_fin_ult1    False
ind_aval_fin_ult1    False
ind_cco_fin_ult1     False
ind_cder_fin_ult1    False
ind_cno_fin_ult1     False
ind_ctju_fin_ult1    False
ind_ctma_fin_ult1    False
ind_ctop_fin_ult1    False
ind_ctpp_fin_ult1    False
ind_deco_fin_ult1    False
ind_de

### Now convert categorical data to numeric

In [16]:
for cat in combined_cat.columns:
    combined_cat[cat], unique = pd.factorize(combined_cat[cat])
    if cat != 'fecha_alta':
        print("for ", cat, "\n", Counter(combined_cat[cat]))

for  fecha_dato 
 Counter({4: 931453, 5: 929615, 3: 906109, 2: 892251, 1: 632110, 0: 631957})
for  ind_empleado 
 Counter({0: 4920454, 2: 1262, 3: 893, 1: 880, 4: 6})
for  pais_residencia 
 Counter({0: 4900199, 8: 1824, 11: 1705, 7: 1639, 9: 1633, 14: 1298, 19: 1246, 25: 1041, 40: 1036, 10: 910, 17: 830, 13: 819, 18: 767, 1: 711, 12: 539, 59: 536, 20: 505, 27: 505, 2: 350, 26: 317, 3: 270, 41: 270, 64: 267, 29: 212, 34: 211, 30: 198, 72: 180, 42: 174, 4: 169, 46: 168, 6: 158, 15: 150, 68: 150, 52: 146, 5: 144, 28: 140, 35: 122, 23: 100, 36: 86, 66: 85, 75: 80, 39: 78, 65: 74, 24: 66, 21: 52, 45: 48, 43: 46, 57: 44, 73: 42, 76: 42, 62: 40, 78: 40, 50: 36, 51: 36, 63: 36, 80: 36, 37: 34, 69: 34, 86: 34, 79: 31, 58: 30, 70: 30, 81: 30, 54: 28, 56: 28, 47: 24, 61: 24, 71: 24, 77: 24, 93: 24, 22: 22, 67: 22, 16: 19, 38: 18, 44: 18, 48: 18, 55: 18, 94: 18, 99: 18, 101: 18, 49: 16, 104: 16, 106: 16, 31: 12, 32: 12, 53: 12, 60: 12, 89: 12, 90: 12, 92: 12, 100: 12, 103: 12, 105: 8, 33: 6, 74: 6

### join them together and then split them up according to training and testing data

In [23]:
joined = pd.concat([combined_num, combined_cat], axis=1)
print("joined shape is: ", joined.shape, " and the type is ", type(joined))
trainX = joined.iloc[:train_rows,:]
testX  = joined.iloc[train_rows:,:]
testX.to_csv("testX.csv", index=False)

print("Training X shape: ", trainX.shape)
print("Testing X shape: ", testX.shape)
print("Training Y shape: ", trainY.shape)

# now I need to combine trainX and trainY together
combined = pd.concat([trainX, trainY], axis=1)
print(combined.shape)

joined shape is:  (4923495, 23)  and the type is  <class 'pandas.core.frame.DataFrame'>
Training X shape:  (3993880, 23)
Testing X shape:  (929615, 23)
Training Y shape:  (3993880, 24)
(3993880, 47)


#### Here I am going to find out those people who have added/removed their services
- Counter({'2016-05-28': 931453, '2015-11-28': 906109, '2015-10-28': 892251, '2015-06-28': 632110, '2015-05-28': 631957})
- Counter({4: 931453, 3: 906109, 2: 892251, 1: 632110, 0: 631957})
  - 0 ==> 2015-05-28;  1 ==> 2015-06-28;  2 ==> 2015-10-28; 3 ==> 2015-11-28; 4 ==> 2016-05-28;  5 ==> 2016-06-28
- Counter({'2016-06-28': 929615})

In [38]:
# this function is used to calculate the union and the differences between two dataframes
def get_union_difference(df1, df2, df2_copy):   # df2 is date behind df1
    # get customer ids from each dataframe
    df1_customers = set(df1['ncodpers'])
    df2_customers = set(df2['ncodpers'])
    print("entering the function call with customer ids count is ", len(df2_customers))
    
    # get the commone unions and then extract them out from each dataframe
    common_ids = df1_customers & df2_customers
    df1 = df1[df1.ncodpers.isin(common_ids)]
    df2 = df2[df2.ncodpers.isin(common_ids)]
    print("common shape is ", df2.shape)
    
    # now find the difference and extract the unique ones out
    df2_unique_ids = df2_customers - df1_customers
    df2_copy = df2_copy[df2_copy.ncodpers.isin(df2_unique_ids)]
    print("unique shape is ", df2_copy.shape)
    
    return df1, df2, df2_copy

In [39]:
# for 2015
train2015_05_28 = combined[combined.fecha_dato.isin([0])]
train2015_05_28_lag = train2015_05_28.copy()
print(train2015_05_28.shape)

train2015_06_28 = combined[combined.fecha_dato.isin([1])]
train2015_06_28_unique = train2015_06_28.copy()
train2015_06_28_lag = train2015_06_28.copy()
print(train2015_06_28.shape)

train2015_10_28 = combined[combined.fecha_dato.isin([2])]
train2015_10_28_lag = train2015_10_28.copy()
print(train2015_10_28.shape)

train2015_11_28 = combined[combined.fecha_dato.isin([3])]
train2015_11_28_lag = train2015_11_28.copy()
print(train2015_11_28.shape)

train2016_05_28 = combined[combined.fecha_dato.isin([4])]
train2016_05_28.to_csv("train2016_05_28.csv", index=False)
print(train2016_05_28.shape)

# get union and differences between two dataframes
train2015_05_28, train2015_06_28, train2015_06_28_unique = get_union_difference(train2015_05_28, train2015_06_28, train2015_06_28_unique)
train2015_05_28.to_csv("train2015_05_28.csv", index=False)
train2015_06_28.to_csv("train2015_06_28.csv", index=False)
train2015_06_28_unique.to_csv("train2015_06_28_unique.csv", index=False)

(631957, 47)
(632110, 47)
(892251, 47)
(906109, 47)
(931453, 47)
entering the function call with customer ids count is  632110
common shape is  (628603, 47)
unique shape is  (3507, 47)


### For lagging analysis
- use 2015-06-28 against 2015-11-28 (5 month-lags)

In [40]:
# calculate the dataframe used for lagging time series

# for 2015-06-28 to 2015-11-28
train2015_11_28_lag_unique = train2015_11_28_lag.copy()
train2015_06_28_lag, train2015_11_28_lag, train2015_11_28_lag_unique = get_union_difference(
    train2015_06_28_lag, train2015_11_28_lag, train2015_11_28_lag_unique)
train2015_06_28_lag.to_csv("train2015_06_28.csv", index=False)
train2015_11_28_lag.to_csv("train2015_11_28.csv", index=False)
train2015_11_28_lag_unique.to_csv("train2015_11_28_lag_unique.csv", index=False)

# for 2015=05-28 to 2015-10-28
train2015_10_28_lag_unique = train2015_10_28_lag.copy()
train2015_05_28_lag, train2015_10_28_lag, train2015_10_28_lag_unique = get_union_difference(
    train2015_05_28_lag, train2015_10_28_lag, train2015_10_28_lag_unique)
train2015_05_28_lag.to_csv("train2015_05_28_lag.csv", index=False)
train2015_10_28_lag.to_csv("train2015_10_28_lag.csv", index=False)
train2015_10_28_lag_unique.to_csv("train2015_10_28_lag_unique.csv", index=False)

entering the function call with customer ids count is  906109
common shape is  (626956, 47)
unique shape is  (279153, 47)
entering the function call with customer ids count is  892251
common shape is  (626036, 47)
unique shape is  (266215, 47)


#### Need to find out whose services are changed in details

In [41]:
def customer_w_service_changed(df1, df2, num):
    one = df1.sort_values(by='ncodpers').reset_index(drop=True).set_index('ncodpers')[target_cols]
    two = df2.sort_values(by='ncodpers').reset_index(drop=True).set_index('ncodpers')[target_cols]
    print("The final shape is ", one.shape)
    
    differences = two - one
    differences[differences < 0] = 0   # reset to 0, so we only consider those with added services

    differences["num_new_product"] = differences.sum(axis=1)
    differences = differences.loc[differences.num_new_product > 0]
    print(differences.shape)
    
    accounts_with_new_products = differences.index   # because the index was reset to ncodpers
    print("Accounts have been changed: ", len(accounts_with_new_products))

    np.save("changed_ids"+num, accounts_with_new_products)
    return accounts_with_new_products

In [42]:
changed_june = customer_w_service_changed(train2015_05_28, train2015_06_28, '0506')
changed_oct  = customer_w_service_changed(train2015_05_28_lag, train2015_10_28_lag, '0510')
changed_nov  = customer_w_service_changed(train2015_06_28_lag, train2015_11_28_lag, '0611')

The final shape is  (628603, 24)
(33318, 25)
Accounts have been changed:  33318
The final shape is  (626036, 24)
(53655, 25)
Accounts have been changed:  53655
The final shape is  (626956, 24)
(45006, 25)
Accounts have been changed:  45006


In [17]:
# for changed ones:
changed_ids = []
with open("changed_ids_all.txt", 'r') as f:
    for i in f:
        changed_ids.append(int(i.strip()))
print("changed id length is: ", len(changed_ids), " with ", changed_ids[2:8])

train2015_05_28_changed = train2015_05_28[train2015_05_28.ncodpers.isin(changed_ids)]
train2015_06_28_changed = train2015_06_28[train2015_06_28.ncodpers.isin(changed_ids)]
print(train2015_05_28_changed.shape)
print(train2015_06_28_changed.shape)

# save them
train2015_05_28_changed.to_csv("train2015_05_28_changed.csv", index=False)
train2015_06_28_changed.to_csv("train2015_06_28_changed.csv", index=False)

changed id length is:  51489  with  [1048830, 1048863, 1048902, 1048926, 1048957, 1049016]
(51489, 47)
(51489, 47)


In [None]:
# need to find out those customers who changed services betweeen 2015-05-28 to 2015-06-28
# to use this approach, I need to make sure their index are the same
from pandas.util.testing import assert_frame_equal

changed_status_55 = []
changed_status_56 = []

same55 = []

train_common_customers = list(train_common_customers)

for id in range(len(train_common_customers)):
    customer = train_common_customers[id]
    #print(customer)
    
    all_2015_05 = may_data[may_data.index == customer]
    all_2015_06 = june_data[june_data.index == customer]
    #print(all_2015_05.shape[0])

    # for services changed
    for idx1, row1 in all_2015_05.iterrows():
        for idx2, row2 in all_2015_06.iterrows():
            tmp = [max(abs(int(x1) - int(x2)),0) for (x1, x2) in zip(row1, row2)]
            if sum(tmp) == 0:
                row1 = row1.tolist()
                row1.append(idx1)
                same55.append(row1)
                
#    try:
#        assert_frame_equal(all_2015_05, all_2015_06)
#    except:
        #changed_status_55 = pd.concat([changed_status_55, all_2015_05])
        #changed_status_56 = pd.concat([changed_status_56, all_2015_06])
#        changed_status_55.append(customer)
#        changed_status_56.append(customer)
        
#print("2015-05 total remaining: ", len(changed_status_55))
#print("2015-06 total remaining: ", len(changed_status_56))

print("2015-05 same list: ", len(same55))
np.save("same55", same55)

In [44]:
#np.save("changed_status_55", changed_status_55)
#np.save("changed_status_56", changed_status_56)
print(type(same55))
#print(same55[3:10])
# [15906, 15925, 15927, 15964, 15982, 16026, 16043, 16056] 1048581.0, 1048582.0, 1048583.0, 1048585.0, 1048584.0

all_2015_05 = may_data[may_data.index == 1048582.0]
all_2015_06 = june_data[june_data.index == 1048582.0]
print(all_2015_05.shape[0])

# for services changed
for idx1, row1 in all_2015_05.iterrows():
    #print("id 1 is: ", idx1, "\n", row1)
    for idx2, row2 in all_2015_06.iterrows():
        #print("id2 is: ", idx2, "\n", row2)
        #tmp = [max(abs(int(x1) - int(x2)),0) for (x1, x2) in zip(row1, row2)]
        [print(int(x1), " and ", int(x2)) for (x1, x2) in zip(row1, row2)]
        if sum(tmp) == 0:
            row1 = row1.tolist()
            row1.append(idx1)
            same55.append(row1)
            #print(row1)
            #abc = row1.pop(-1)
            #print(abc, " and ", row1)

<class 'list'>
1
0  and  0
0  and  0
1  and  1
0  and  0
0  and  0
0  and  0
0  and  0
0  and  0
0  and  0
0  and  0
0  and  0
0  and  0
0  and  0
0  and  0
0  and  0
0  and  0
0  and  0
0  and  0
0  and  0
0  and  0
0  and  0
0  and  0
0  and  0
0  and  0
same
[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1048582]
1048582  and  [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [142]:
#print(train2015_05_28.iloc[:,24:].head(2))
#print(train2015_05_28.head())
print(train2015_05_28.loc[train2015_05_28.ncodpers == 1048576].iloc[:,24:])
print(train2015_05_28.loc[train2015_05_28.ncodpers == 1061283].shape)
#print(train2015_05_28.head().iloc[:,24:])

       ind_aval_fin_ult1  ind_cco_fin_ult1  ind_cder_fin_ult1  \
34226                  0                 1                  0   

       ind_cno_fin_ult1  ind_ctju_fin_ult1  ind_ctma_fin_ult1  \
34226                 0                  0                  0   

       ind_ctop_fin_ult1  ind_ctpp_fin_ult1  ind_deco_fin_ult1  \
34226                  0                  0                  0   

       ind_deme_fin_ult1       ...         ind_hip_fin_ult1  \
34226                  0       ...                        0   

       ind_plan_fin_ult1  ind_pres_fin_ult1  ind_reca_fin_ult1  \
34226                  0                  0                  0   

       ind_tjcr_fin_ult1  ind_valo_fin_ult1  ind_viv_fin_ult1  \
34226                  0                  0                 0   

       ind_nomina_ult1  ind_nom_pens_ult1  ind_recibo_ult1  
34226              0.0                0.0                0  

[1 rows x 23 columns]
(1, 47)
