In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df=pd.read_csv('TransactionRecord-v-return added.csv')

**Filtering Out Missing Data**

In [3]:
df=df.dropna()

**Checking Duplicates**

In [4]:
sum(df.duplicated())#no duplicates found

0

**Rename the columns(for ease of manipulation)**

In [5]:
df=df.rename(columns={'trd_date':'Date','fund_id':'FID1',
                  'unit_count':'Units','gender':'Gender',
                  'TrailerRate':'TRate','FundRiskScore':'FRSC','FundName':'FName'})

**Drop Unnecessary Columns**

In [6]:
del df['FName']

In [7]:
del df['intermediaryid']

In [8]:
df.columns

Index(['Unnamed: 0', 'Date', 'FID1', 'investor_id', 'Units', 'Gender',
       'income', 'education', 'riskscore', 'subscribed', 'opened', 'SCRate',
       'TRate', 'FRSC', 'Price', 'return-N6M', 'return-HYG', 'return-IVV'],
      dtype='object')

**Create a separate table, create dummy variables for Fund Type**

In [9]:
dummies_F1D1=pd.get_dummies(df['FID1'],prefix='FID1')

In [10]:
df=df.join(dummies_F1D1)

Due to the impending "group-by" operation, rename the FID1_xxx columns as "xxx_txn_cnt".

In [11]:
df=df.rename(columns={'FID1_HYG':'HYG_txn_cnt', 'FID1_IVV':'IVV_txn_cnt', 'FID1_N6M':'N6M_txn_cnt'})

In [12]:
df.columns

Index(['Unnamed: 0', 'Date', 'FID1', 'investor_id', 'Units', 'Gender',
       'income', 'education', 'riskscore', 'subscribed', 'opened', 'SCRate',
       'TRate', 'FRSC', 'Price', 'return-N6M', 'return-HYG', 'return-IVV',
       'HYG_txn_cnt', 'IVV_txn_cnt', 'N6M_txn_cnt'],
      dtype='object')

**Create additional field for transaction value, drop entries with 0 transaction value, convert date field to date type.**

In [13]:
df['Txn_Val']=df['Units']*df['Price']

In [14]:
df = df[df['Txn_Val'] != 0]

In [15]:
df['Date_type']=pd.to_datetime(df.Date)

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6099 entries, 0 to 6996
Data columns (total 23 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Unnamed: 0   6099 non-null   int64         
 1   Date         6099 non-null   object        
 2   FID1         6099 non-null   object        
 3   investor_id  6099 non-null   int64         
 4   Units        6099 non-null   float64       
 5   Gender       6099 non-null   object        
 6   income       6099 non-null   float64       
 7   education    6099 non-null   object        
 8   riskscore    6099 non-null   int64         
 9   subscribed   6099 non-null   bool          
 10  opened       6099 non-null   int64         
 11  SCRate       6099 non-null   float64       
 12  TRate        6099 non-null   float64       
 13  FRSC         6099 non-null   object        
 14  Price        6099 non-null   float64       
 15  return-N6M   6099 non-null   float64       
 16  return

In [17]:
df.head()

Unnamed: 0.1,Unnamed: 0,Date,FID1,investor_id,Units,Gender,income,education,riskscore,subscribed,opened,SCRate,TRate,FRSC,Price,return-N6M,return-HYG,return-IVV,HYG_txn_cnt,IVV_txn_cnt,N6M_txn_cnt,Txn_Val,Date_type
0,1,2019/8/25,HYG,39685,9.254125,M,32455.83539,H.Sch/Dip.,5,False,0,0.03,0.0,M,86.82,0.0,0.0,0.0,1,0,0,803.443139,2019-08-25
1,2,2019/8/25,IVV,39685,11.123784,M,32455.83539,H.Sch/Dip.,5,False,0,0.05,0.0,H,288.91,0.0,0.0,0.0,0,1,0,3213.772554,2019-08-25
2,3,2019/8/25,N6M,49107,151.569607,M,44717.69046,H.Sch/Dip.,4,True,0,0.03,0.0,L,11.14,0.0,0.0,0.0,0,0,1,1688.485426,2019-08-25
3,4,2019/8/25,HYG,49107,77.792464,M,44717.69046,H.Sch/Dip.,4,True,0,0.03,0.0,M,86.82,0.0,0.0,0.0,1,0,0,6753.941707,2019-08-25
4,5,2019/8/25,IVV,49107,29.221651,M,44717.69046,H.Sch/Dip.,4,True,0,0.05,0.0,H,288.91,0.0,0.0,0.0,0,1,0,8442.427136,2019-08-25


**Pick a date range and take a slice of the data**

[Note:] Since we are dealing with change in AUM, any transactions related to 2019/8/25 should be excluded from grouping.

In [145]:
# Define the data range used to slide the data for analysis/ model-fitting/ forecasting
min_date = '2020-03-12'
max_date = '2020-03-20'

In [146]:
df1 = df[(df['Date_type'] >= min_date) & (df['Date_type'] <= max_date)]

**Developing the response variables**

[Note] Different response variables (AUM per investor, AUM per transaction per investor, transaction frequency per investor, etc) will require a different grouping approach.

[Note 2] The codes for each response variable can be run separately. 

[Note 3] Refer to the business question that we are trying to answer via visualization.  Currently, these data cleaning script are using the full data set. That is good for business question #2 when we are trying to say, within the date range chosen by the user, how much of the variance in the response variable can be explained by the different Xs.  However, for the forecast question (biz question #4 in the slides), if we want to test on unseen data, then then "training set" (which will get further split between train vs validation during cross-validation) may need to cut of at an earlier date.  Would be useful, somewhere at the start of this script, to put in two input parameters for us to set the date range we want to run this data cleaning script for.

**Response variable 1) Change in AUM per investor**

**Apply "Groupby" based on investor_id**

In [147]:
keys=['investor_id']


In [148]:
grouped=df1.groupby(keys)
grouped.sum().head()

Unnamed: 0_level_0,Unnamed: 0,Units,income,riskscore,subscribed,opened,SCRate,TRate,Price,return-N6M,return-HYG,return-IVV,HYG_txn_cnt,IVV_txn_cnt,N6M_txn_cnt,Txn_Val
investor_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
10252,1968,17.085624,25580.11645,4,0.0,0,0.03,0.0,72.989998,0.0,-0.044133,-0.05568,1,0,0,1247.07966
11042,8481,0.0,118059.40506,10,0.0,0,0.1,0.0,471.279999,0.009925,-0.077391,-0.160773,0,2,0,11.843152
11236,2813,30.493289,124125.4999,5,0.0,0,0.05,0.0,240.119995,0.002804,-0.054966,-0.115743,0,1,0,7322.048361
11431,2471,1.753056,20472.16172,5,1.0,1,0.05,0.0,248.960007,-0.014493,-0.039955,-0.096236,0,1,0,436.440784
11770,2240,-5.090563,47994.40091,5,0.0,0,0.05,0.0,271.549988,-0.016544,0.031439,0.090737,0,1,0,-1382.342405


In [149]:
# Column index 15 has the transaction value
df_aggTxnval=(grouped.sum()).iloc[:,[15]]

In [150]:
df_aggTxnval.describe()

Unnamed: 0,Txn_Val
count,177.0
mean,2413.377596
std,11349.213259
min,-85910.63189
25%,-413.120306
50%,366.542783
75%,3077.197669
max,48779.401681


In [151]:
# Append 6 fields related to investor charateristics - gender, income, education, riskscore,
# subscribed, opened - to the response variable data frame.  Check that the row count remain unchanged after joining, or remove duplication to keep row count constant.
df_indv_feature=(grouped.max())
df_indv_feature.head()

Unnamed: 0_level_0,Unnamed: 0,Date,FID1,Units,Gender,income,education,riskscore,subscribed,opened,SCRate,TRate,FRSC,Price,return-N6M,return-HYG,return-IVV,HYG_txn_cnt,IVV_txn_cnt,N6M_txn_cnt,Txn_Val,Date_type
investor_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
10252,1968,2020/3/18,HYG,17.085624,F,25580.11645,Sec. or below,4,False,0,0.03,0.0,M,72.989998,0.0,-0.044133,-0.05568,1,0,0,1247.07966,2020-03-18
11042,4241,2020/3/20,IVV,1.321782,F,59029.70253,Deg or abv,5,False,0,0.05,0.0,H,240.119995,0.007121,-0.022425,-0.04503,0,1,0,317.3862,2020-03-20
11236,2813,2020/3/16,IVV,30.493289,M,124125.4999,H.Sch/Dip.,5,False,0,0.05,0.0,H,240.119995,0.002804,-0.054966,-0.115743,0,1,0,7322.048361,2020-03-16
11431,2471,2020/3/12,IVV,1.753056,F,20472.16172,Sec. or below,5,True,1,0.05,0.0,H,248.960007,-0.014493,-0.039955,-0.096236,0,1,0,436.440784,2020-03-12
11770,2240,2020/3/13,IVV,-5.090563,M,47994.40091,Deg or abv,5,False,0,0.05,0.0,H,271.549988,-0.016544,0.031439,0.090737,0,1,0,-1382.342405,2020-03-13


In [152]:
df_indv_feature.iloc[:,[4,5,6,7,8,9]]

Unnamed: 0_level_0,Gender,income,education,riskscore,subscribed,opened
investor_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10252,F,25580.11645,Sec. or below,4,False,0
11042,F,59029.70253,Deg or abv,5,False,0
11236,M,124125.49990,H.Sch/Dip.,5,False,0
11431,F,20472.16172,Sec. or below,5,True,1
11770,M,47994.40091,Deg or abv,5,False,0
...,...,...,...,...,...,...
97452,M,70722.80585,Deg or abv,4,True,0
97481,M,230419.45410,Deg or abv,3,False,0
98243,M,132918.89590,Sec. or below,3,False,0
98809,M,55422.95073,Sec. or below,1,True,1


In [153]:
df_iid=pd.merge(df_aggTxnval, df_indv_feature.iloc[:,[4, 5,6,7,8,9]], on=['investor_id'])

# Check row count remains unchanged
df_iid.shape

(177, 7)

In [154]:
df_iid.columns

Index(['Txn_Val', 'Gender', 'income', 'education', 'riskscore', 'subscribed',
       'opened'],
      dtype='object')

In [155]:
# Append 2 fields - SCRate, TRate - that describes the average commission and trailer 
# fee rate exprienced by each investor for their respective transactions.
# Also append 3 fields about the average equity, bonds, and high-yields return on the days
# each investor made their transactions.

df_intm_feature=(grouped.mean())
df_intm_feature.head()

Unnamed: 0_level_0,Unnamed: 0,Units,income,riskscore,subscribed,opened,SCRate,TRate,Price,return-N6M,return-HYG,return-IVV,HYG_txn_cnt,IVV_txn_cnt,N6M_txn_cnt,Txn_Val
investor_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
10252,1968.0,17.085624,25580.11645,4.0,False,0.0,0.03,0.0,72.989998,0.0,-0.044133,-0.05568,1.0,0.0,0.0,1247.07966
11042,4240.5,0.0,59029.70253,5.0,False,0.0,0.05,0.0,235.64,0.004962,-0.038695,-0.080386,0.0,1.0,0.0,5.921576
11236,2813.0,30.493289,124125.4999,5.0,False,0.0,0.05,0.0,240.119995,0.002804,-0.054966,-0.115743,0.0,1.0,0.0,7322.048361
11431,2471.0,1.753056,20472.16172,5.0,True,1.0,0.05,0.0,248.960007,-0.014493,-0.039955,-0.096236,0.0,1.0,0.0,436.440784
11770,2240.0,-5.090563,47994.40091,5.0,False,0.0,0.05,0.0,271.549988,-0.016544,0.031439,0.090737,0.0,1.0,0.0,-1382.342405


In [156]:
df_intm_feature.iloc[:,[6,7,9,10,11]]

Unnamed: 0_level_0,SCRate,TRate,return-N6M,return-HYG,return-IVV
investor_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10252,0.03,0.0,0.000000,-0.044133,-0.055680
11042,0.05,0.0,0.004962,-0.038695,-0.080386
11236,0.05,0.0,0.002804,-0.054966,-0.115743
11431,0.05,0.0,-0.014493,-0.039955,-0.096236
11770,0.05,0.0,-0.016544,0.031439,0.090737
...,...,...,...,...,...
97452,0.03,0.0,0.000000,-0.044133,-0.055680
97481,0.03,0.0,0.007121,-0.022425,-0.045030
98243,0.03,0.0,-0.016544,0.031439,0.090737
98809,0.03,0.0,-0.030159,0.004485,0.047213


In [157]:
df_sn1=pd.merge(df_iid, df_intm_feature.iloc[:,[6,7,9,10,11]], on=['investor_id'])

# Check row count remains unchanged
df_sn1.shape

(177, 12)

In [158]:
df_sn1.columns

Index(['Txn_Val', 'Gender', 'income', 'education', 'riskscore', 'subscribed',
       'opened', 'SCRate', 'TRate', 'return-N6M', 'return-HYG', 'return-IVV'],
      dtype='object')

In [159]:
df_sn1=df_sn1.join(pd.get_dummies(df_sn1['Gender'],prefix='gen'))

In [160]:
del df_sn1['Gender']

In [161]:
df_sn1=df_sn1.join(pd.get_dummies(df_sn1['education'],prefix='ed'))

In [162]:
del df_sn1['education']

In [163]:
df_sn1['subscribed']=df_sn1['subscribed'].apply(lambda x:1 if x else 0)

In [164]:
df_sn1.columns

Index(['Txn_Val', 'income', 'riskscore', 'subscribed', 'opened', 'SCRate',
       'TRate', 'return-N6M', 'return-HYG', 'return-IVV', 'gen_F', 'gen_M',
       'ed_Deg or abv', 'ed_H.Sch/Dip.', 'ed_Sec. or below'],
      dtype='object')

In [165]:
df_sn1

Unnamed: 0_level_0,Txn_Val,income,riskscore,subscribed,opened,SCRate,TRate,return-N6M,return-HYG,return-IVV,gen_F,gen_M,ed_Deg or abv,ed_H.Sch/Dip.,ed_Sec. or below
investor_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
10252,1.247080e+03,25580.11645,4,0,0,0.03,0.0,0.000000,-0.044133,-0.055680,1,0,0,0,1
11042,1.184315e+01,59029.70253,5,0,0,0.05,0.0,0.004962,-0.038695,-0.080386,1,0,1,0,0
11236,7.322048e+03,124125.49990,5,0,0,0.05,0.0,0.002804,-0.054966,-0.115743,0,1,0,1,0
11431,4.364408e+02,20472.16172,5,1,1,0.05,0.0,-0.014493,-0.039955,-0.096236,1,0,0,0,1
11770,-1.382342e+03,47994.40091,5,0,0,0.05,0.0,-0.016544,0.031439,0.090737,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97452,1.209855e-09,70722.80585,4,1,0,0.03,0.0,0.000000,-0.044133,-0.055680,0,1,1,0,0
97481,1.325631e+04,230419.45410,3,0,0,0.03,0.0,0.007121,-0.022425,-0.045030,0,1,1,0,0
98243,1.151204e+03,132918.89590,3,0,0,0.03,0.0,-0.016544,0.031439,0.090737,0,1,0,0,1
98809,6.177904e+03,55422.95073,1,1,1,0.03,0.0,-0.030159,0.004485,0.047213,0,1,0,0,1


**Export csv file for regression analysis**

Naming convention to the file which goes like this:

***"Txn_Record_[Response_variable_name]_[Date_range]_[Fund_Range].csv"***

Response_variable_name:  In this case, we can use 'deltaAUM'.

Date_range: Concatenate the start and end date of the slice of data.

Fund_range (optional):  At the moment, transaction for all funds are used.  In case we want to analyse a specific fund, filter for the relevant fund's transaction record during the earlier data preparation step and state that in the file name.

In [166]:
df_sn1 = df_sn1.reset_index(drop=True)

In [167]:
df_sn1

Unnamed: 0,Txn_Val,income,riskscore,subscribed,opened,SCRate,TRate,return-N6M,return-HYG,return-IVV,gen_F,gen_M,ed_Deg or abv,ed_H.Sch/Dip.,ed_Sec. or below
0,1.247080e+03,25580.11645,4,0,0,0.03,0.0,0.000000,-0.044133,-0.055680,1,0,0,0,1
1,1.184315e+01,59029.70253,5,0,0,0.05,0.0,0.004962,-0.038695,-0.080386,1,0,1,0,0
2,7.322048e+03,124125.49990,5,0,0,0.05,0.0,0.002804,-0.054966,-0.115743,0,1,0,1,0
3,4.364408e+02,20472.16172,5,1,1,0.05,0.0,-0.014493,-0.039955,-0.096236,1,0,0,0,1
4,-1.382342e+03,47994.40091,5,0,0,0.05,0.0,-0.016544,0.031439,0.090737,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172,1.209855e-09,70722.80585,4,1,0,0.03,0.0,0.000000,-0.044133,-0.055680,0,1,1,0,0
173,1.325631e+04,230419.45410,3,0,0,0.03,0.0,0.007121,-0.022425,-0.045030,0,1,1,0,0
174,1.151204e+03,132918.89590,3,0,0,0.03,0.0,-0.016544,0.031439,0.090737,0,1,0,0,1
175,6.177904e+03,55422.95073,1,1,1,0.03,0.0,-0.030159,0.004485,0.047213,0,1,0,0,1


In [168]:
df_sn1=df_sn1.rename(columns={'Txn_Val':'TotalTransactions'})

In [169]:
df_sn1.to_csv(f'Txn_Record_deltaAUM_{min_date}_{max_date}.csv')

In [142]:
# As a by-product, one may focus on investors who have net redemption over the investigation period
# This help to analyse AUM attrition.
df_sn1a = df_sn1[df_sn1['TotalTransactions']<0]
df_sn1a['TotalTransactions'] = -df_sn1a['TotalTransactions']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [143]:
df_sn1a.shape

(195, 15)

In [144]:
df_sn1a.to_csv(f'Txn_Record_AUMattrn_{min_date}_{max_date}.csv')



**Response variable 2) Change in AUM per transaction per investor**

**Apply "Groupby" based on investor_id, calculate average transaction size**

In [59]:
keys=['investor_id']
grouped=df1.groupby(keys)
# Column index 15 has the transaction value in .sum(). In .count, it is index 20
df_avgTxnval=(grouped.sum()).iloc[:,[15]] / (grouped.count()).iloc[:,[20]]
df_avgTxnval.head()

Unnamed: 0_level_0,Txn_Val
investor_id,Unnamed: 1_level_1
10051,4694.78508
10122,1145.401103
10252,2064.84665
10410,8810.585278
10487,475.393381


In [60]:
# Append other fields as in the process for response variable 1
df_indv_feature=(grouped.max())
df_indv_feature.iloc[:,[4,5,6,7,8,9]]
df_iid=pd.merge(df_avgTxnval, df_indv_feature.iloc[:,[4,5,6,7,8,9]], on=['investor_id'])
df_intm_feature=(grouped.mean())
df_intm_feature.iloc[:,[6,7,9,10,11]]
df_sn2=pd.merge(df_iid, df_intm_feature.iloc[:,[6,7,9,10,11]], on=['investor_id'])
df_sn2=df_sn2.join(pd.get_dummies(df_sn2['Gender'],prefix='gen'))
del df_sn2['Gender']
df_sn2=df_sn2.join(pd.get_dummies(df_sn2['education'],prefix='ed'))
del df_sn2['education']
df_sn2['subscribed']=df_sn2['subscribed'].apply(lambda x:1 if x else 0)

In [61]:
df_sn2.shape

(821, 15)

In [62]:
df_sn2.columns

Index(['Txn_Val', 'income', 'riskscore', 'subscribed', 'opened', 'SCRate',
       'TRate', 'return-N6M', 'return-HYG', 'return-IVV', 'gen_F', 'gen_M',
       'ed_Deg or abv', 'ed_H.Sch/Dip.', 'ed_Sec. or below'],
      dtype='object')

In [63]:
df_sn2.head()

Unnamed: 0_level_0,Txn_Val,income,riskscore,subscribed,opened,SCRate,TRate,return-N6M,return-HYG,return-IVV,gen_F,gen_M,ed_Deg or abv,ed_H.Sch/Dip.,ed_Sec. or below
investor_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
10051,4694.78508,324465.7233,3,0,0,0.03,0.0,-0.002657,0.001643,0.019738,0,1,1,0,0
10122,1145.401103,40206.16338,4,0,0,0.03,0.0,0.0,0.001382,0.008922,0,1,1,0,0
10252,2064.84665,25580.11645,4,0,0,0.03,0.0,0.0,-0.012972,-0.014285,1,0,0,0,1
10410,8810.585278,110118.9425,5,0,0,0.05,0.0,-0.000908,0.002416,-0.001045,0,1,1,0,0
10487,475.393381,152825.5386,5,0,0,0.042,0.0,-0.00126,-0.003018,-0.003583,0,1,1,0,0


In [64]:
# Export to csv
df_sn2 = df_sn2.reset_index(drop=True)
df_sn2 = df_sn2.rename(columns={'Txn_Val':'TotalTransactions'})
df_sn2.to_csv(f'Txn_Record_avgTxnSize_{min_date}_{max_date}.csv')

**Response variable 3) Transaction frequency per investor**

**Apply "Groupby" based on investor_id, count trades**

In [78]:
keys=['investor_id']
grouped=df1.groupby(keys)
# In .count, it is transaction value is in index 20
df_Txncnt = (grouped.count()).iloc[:,[20]]
df_Txncnt.head()

Unnamed: 0_level_0,Txn_Val
investor_id,Unnamed: 1_level_1
10076,2
10410,4
10487,4
11042,1
11236,5


In [79]:
# Append other fields as in the process for response variable 1
df_indv_feature=(grouped.max())
df_indv_feature.iloc[:,[4,5,6,7,8,9]]
df_iid=pd.merge(df_Txncnt, df_indv_feature.iloc[:,[4,5,6,7,8,9]], on=['investor_id'])
df_intm_feature=(grouped.mean())
df_intm_feature.iloc[:,[6,7,9,10,11]]
df_sn3=pd.merge(df_iid, df_intm_feature.iloc[:,[6,7,9,10,11]], on=['investor_id'])
df_sn3=df_sn3.join(pd.get_dummies(df_sn3['Gender'],prefix='gen'))
del df_sn3['Gender']
df_sn3=df_sn3.join(pd.get_dummies(df_sn3['education'],prefix='ed'))
del df_sn3['education']
df_sn3['subscribed']=df_sn3['subscribed'].apply(lambda x:1 if x else 0)

In [80]:
df_sn3.shape

(717, 15)

In [81]:
df_sn3.columns

Index(['Txn_Val', 'income', 'riskscore', 'subscribed', 'opened', 'SCRate',
       'TRate', 'return-N6M', 'return-HYG', 'return-IVV', 'gen_F', 'gen_M',
       'ed_Deg or abv', 'ed_H.Sch/Dip.', 'ed_Sec. or below'],
      dtype='object')

In [82]:
df_sn3.head()

Unnamed: 0_level_0,Txn_Val,income,riskscore,subscribed,opened,SCRate,TRate,return-N6M,return-HYG,return-IVV,gen_F,gen_M,ed_Deg or abv,ed_H.Sch/Dip.,ed_Sec. or below
investor_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
10076,2,87671.4757,4,1,1,0.04,0.0,0.006434,-0.003367,-0.006097,1,0,1,0,0
10410,4,110118.9425,5,0,0,0.04,0.0,0.000464,0.03455,0.009948,0,1,1,0,0
10487,4,152825.5386,5,0,0,0.045,0.0,0.0,-0.010846,-0.005419,0,1,1,0,0
11042,1,59029.70253,5,0,0,0.05,0.0,-0.007583,-0.003423,0.011963,1,0,1,0,0
11236,5,124125.4999,5,0,0,0.042,0.0,-0.00114,0.005093,0.007077,0,1,0,1,0


In [83]:
# Export to csv
df_sn3 = df_sn3.reset_index(drop=True)
df_sn3 = df_sn3.rename(columns={'Txn_Val':'TotalTransactions'})
df_sn3.to_csv(f'Txn_Record_TxnFreq_{min_date}_{max_date}.csv')