# PHASE 1: PROFILE + GEN Q'S

In [1]:
import utils.csv_utils as csv_utils 
import utils.dir_utils as dir_utils
import utils.dict_utils as dict_utils 
import utils.ptr_utils as ptr_utils
import utils.constants as constants 
import helpers.official as official
import helpers.search as search
import helpers.congress as congress
import pandas as pd 

In [2]:
_, input_df = dir_utils.get_data(combined=True)
_, house_input_df = dir_utils.get_data(house=True)
_, senate_input_df = dir_utils.get_data(senate=True)

num_of_transactions = input_df.shape[0]
num_of_house_transactions = house_input_df.shape[0]
num_of_senate_transactions = senate_input_df.shape[0]

sector_df = dir_utils.get_mapping(sector=True)
industry_df = dir_utils.get_mapping(industry=True)

# {canonical_name_input_based : link, ...}
input_all_officials_name = {}

# {link : canonical_name_input_based, ....}
input_all_officials_link = {}
input_house_officials_link = {}
input_senate_officials_link = {}

# (canonical_name_input_based, ...)
names = set()

for _,t in input_df.iterrows():        
    name = official.get_name(t)
        
    if name not in names:    
        link = search.get_wiki_link(name)
                
        if ptr_utils.isvalid(t[constants.REPRESENTATIVE]) and link not in input_house_officials_link:
            input_house_officials_link =  dict_utils.increment_dictionary(input_house_officials_link, link, name, not_math=True)
        if ptr_utils.isvalid(t[constants.SENATOR]) and link not in input_senate_officials_link:
            input_senate_officials_link =  dict_utils.increment_dictionary(input_senate_officials_link, link, name, not_math=True)
        
        input_all_officials_link =  dict_utils.increment_dictionary(input_all_officials_link, link, name, not_math=True)
        input_all_officials_name =  dict_utils.increment_dictionary(input_all_officials_name, name, link, not_math=True)

        names.add(name)

input_officials_in_house_and_senate = 0 
for link in input_all_officials_link:
    if link in input_house_officials_link and link in input_senate_officials_link:
        del input_house_officials_link[link]
        input_officials_in_house_and_senate += 1
print("Number of officials in House and Senate in total (from 112-117th congress): {}\n".format(input_officials_in_house_and_senate))


print("Number of transactions: {} \n".format(ptr_utils.commify_str(len(input_df.index))))

print("Number of transactions by House Representatives: {}, {}".format(ptr_utils.commify_str(num_of_house_transactions), ptr_utils.make_percent(num_of_house_transactions, len(input_df.index))))
print("Number of transactions by House Representatives controlled: {0:.2f} transactions per representative \n".format((num_of_house_transactions / len(input_house_officials_link))))

print("Number of transactions by Senators: {}, {}".format(ptr_utils.commify_str(num_of_senate_transactions), ptr_utils.make_percent(num_of_senate_transactions, len(input_df.index))))
print("Number of transactions by Senators controlled: {0:.2f} transactions per senator \n".format( (num_of_senate_transactions /  len(input_senate_officials_link))))

# {link : (canonical_name_input_based, official_object), ... }
input_house_officials_objects = {}
for link, person in input_house_officials_link.items(): 
    off = search.wiki_search(person)        
    input_house_officials_objects[link] = (person, off)
        
# {link : (canonical_name_input_based, official_object), ... }
input_senate_officials_objects = {}
for link, person in input_senate_officials_link.items():
    off = search.wiki_search(person)        
    input_senate_officials_objects[link] = (person, off)

# {link : (canonical_name_input_based, official_object) ... }
input_officials_objects = {**input_house_officials_objects, **input_senate_officials_objects}

# {link : canonical_name_wiki_based, ... }
all_officials = congress.get_all_officials()
house_officials = congress.get_house_officials()
senate_officials = congress.get_senate_officials()

# {link : gender, ...}
all_officials_gender = {}
input_house_officials_gender = {}
input_senate_officials_gender = {}

all_officials_party_and_gender = {}
house_officials_party_and_gender = {}
senate_officials_party_and_gender = {}

for link, name in all_officials.items():
    gender = official.get_gender(name, link)
    
    x = search.congress_gov_get(name, party_only=True)
    grouped = x + ", " + gender
        
    all_officials_party_and_gender = dict_utils.increment_dictionary(all_officials_party_and_gender, grouped)

    if link in senate_officials:
        senate_officials_party_and_gender = dict_utils.increment_dictionary(senate_officials_party_and_gender, grouped)
    if link in house_officials:
        house_officials_party_and_gender = dict_utils.increment_dictionary(house_officials_party_and_gender, grouped)
  
    if link in input_house_officials_objects:
       input_house_officials_gender[link] = gender
    if link in input_senate_officials_objects:
        input_senate_officials_gender[link] =  gender
    all_officials_gender[link] = gender

# {'California' :  #_of_representatives_from_112_to_117, ...}
all_officials_state_count = congress.get_officials_state(everyone=list(all_officials.values()))
house_officials_state_count = congress.get_officials_state(house=list(house_officials.values()))
senate_officials_state_count = congress.get_officials_state(everyone=list(senate_officials.values()))

congress_objects = []
house_officials_party = {}
senate_officials_party = {}
for i in range(112, 118):
    c = search.get_congress(i)
    congress_objects.append(c)
    house_officials_party.update(c.get_house_party())
    senate_officials_party.update(c.get_senate_party())
all_officials_party = {**house_officials_party, **senate_officials_party}
    
# {link : canonical_name_wiki_based, ... }
all_officials_not_in_input = dict(all_officials)
house_officials_not_in_input = dict(house_officials)
senate_officials_not_in_input = dict(senate_officials)

all_officials_in_house_and_senate = 0 
for link in all_officials_not_in_input:
    if link in house_officials_not_in_input and link in senate_officials_not_in_input:
        del house_officials_not_in_input[link]
        all_officials_in_house_and_senate += 1
print("Number of officials in House and Senate in total (from 112-117th congress): {}\n".format(all_officials_in_house_and_senate))


for link_input in input_all_officials_link.keys():
    del all_officials_not_in_input[link_input]
    
    if link_input in house_officials_not_in_input:
        del house_officials_not_in_input[link_input]
    else:
        del senate_officials_not_in_input[link_input]
    

print("Number of officials in input: {}".format(len(input_all_officials_link)))
print("Number of officials in input controlled: {}\n".format(ptr_utils.make_percent(len(input_all_officials_link), len(all_officials))))

print("Number of representatives in input: {}, {}".format(len(input_house_officials_link), ptr_utils.make_percent(len(input_house_officials_link), len(input_all_officials_link))))
print("Number of representatives in input controlled: {} \n".format(ptr_utils.make_percent(len(input_house_officials_link), len(house_officials))))

print("Number of senators in input: {}, {}".format(len(input_senate_officials_link), ptr_utils.make_percent(len(input_senate_officials_link), len(input_all_officials_link))))
print("Number of senators in input controlled: {} \n".format(ptr_utils.make_percent(len(input_senate_officials_link), len(senate_officials))))

print("Number of officials in total (from 112-117th congress): {}".format(ptr_utils.commify_str(len(all_officials))))
print("Number of representatives in total (from 112-117th congress): {}".format(ptr_utils.commify_str(len(house_officials))))
print("Number of senators in total (from 112-117th congress): {}\n".format(ptr_utils.commify_str(len(senate_officials))))

print("Number of officials from 112-117th congress who DID NOT engage in the market: {}, {}\n".format(len(all_officials_not_in_input), ptr_utils.make_percent(len(all_officials_not_in_input), len(all_officials) ) ))
print("Number of representatives from 112-117th congress who DID NOT engage in the market: {}".format(len(house_officials_not_in_input)))
print("Number of senators from 112-117th congress who DID NOT engage in the market: {}\n".format(len(senate_officials_not_in_input)))

print("Number of officials from 112-117th congress who DID engage in the market: {}, {} \n".format(len(input_all_officials_link), ptr_utils.make_percent(len(input_all_officials_link), len(all_officials) ) ))

def t_to_obj(t):
    name = official.get_name(t)
    link = input_all_officials_name[name]
    _, obj = input_officials_objects[link]
    return obj

Number of officials in House and Senate in total (from 112-117th congress): 0

Number of transactions: 22,195 

Number of transactions by House Representatives: 13,154, 59.27%
Number of transactions by House Representatives controlled: 83.78 transactions per representative 

Number of transactions by Senators: 9,041, 40.73%
Number of transactions by Senators controlled: 143.51 transactions per senator 

Number of officials in House and Senate in total (from 112-117th congress): 27

Number of officials in input: 220
Number of officials in input controlled: 22.70%

Number of representatives in input: 157, 71.36%
Number of representatives in input controlled: 18.87% 

Number of senators in input: 63, 28.64%
Number of senators in input controlled: 38.41% 

Number of officials in total (from 112-117th congress): 969
Number of representatives in total (from 112-117th congress): 832
Number of senators in total (from 112-117th congress): 164

Number of officials from 112-117th congress who DID

## Profile 

### Party and Gender

In [None]:
def profile_party_and_gender(group, normalized=None):
    # {party_and_gender : 45_ppl}
    d = {}
        
    # {link : (canonical_name_input_based, official_object), ... 
    for link, (_,obj) in group.items():
        x = obj.party
        gender = all_officials_gender[link]
        grouped = x + ", " + gender
        d = dict_utils.increment_dictionary(d, grouped)

    # Normalize
    if normalized: 
        d = dict_utils.normalize(d, normalized, percent=True)

    return d

d1 = profile_party_and_gender(input_house_officials_objects, house_officials_party_and_gender)
d2 = profile_party_and_gender(input_senate_officials_objects, senate_officials_party_and_gender)
d3 = profile_party_and_gender(input_officials_objects, all_officials_party_and_gender)

dir = dir_utils.makesubdir(constants.path_csv, "profile/partyandgender")
csv_utils.make_csv_multiple_dicts(dir, "profile_party_and_gender_normalized", (d1,d2,d3), [constants.PARTY + ", " + constants.GENDER, constants.HOUSE, constants.SENATE, constants.INPUT])


d1 = profile_party_and_gender(input_house_officials_objects)
d2 = profile_party_and_gender(input_senate_officials_objects)
d3 = profile_party_and_gender(input_officials_objects)

dir = dir_utils.makesubdir(constants.path_csv, "profile/partyandgender")
csv_utils.make_csv_multiple_dicts(dir, "profile_party_and_gender", (d1,d2,d3), [constants.PARTY + ", " + constants.GENDER, constants.HOUSE, constants.SENATE, constants.INPUT])

In [4]:
def profile_active_party_and_gender(group, normalized=None):
    # {party_and_gender : 45_transactions}
    d_number = {}
    # {party_and_gender : [gmean, gmean, ...]}
    d_size = {}
    
    for _,t in group.iterrows():
        if ptr_utils.isvalid(t[constants.AMOUNT]):
            name = official.get_name(t)
            link = input_all_officials_name[name]

            gender = all_officials_gender[link]
            obj = t_to_obj(t)
            x = obj.party
            grouped = x + ", " + gender

            
            d_number =  dict_utils.increment_dictionary(d_number, grouped)
            
            mean =  ptr_utils.get_gmean(t[constants.AMOUNT])             
            d_size = dict_utils.increment_list_in_dictionary(d_size, grouped, mean)

    d_size = dict_utils.flatten_gmean(d_size)

    # Normalize
    if normalized: 
        d_number = dict_utils.normalize(d_number, normalized, percent=True)
        d_number = dict_utils.sort_dictionary_by_tuple(d_number)
        
        d_size = dict_utils.normalize(d_size, normalized)
        d_size = dict_utils.sort_dictionary_by_tuple(d_size)

    return d_number, d_size
    
d1,d4 = profile_active_party_and_gender(house_input_df, house_officials_party_and_gender)
d2,d5 = profile_active_party_and_gender(senate_input_df, senate_officials_party_and_gender)
d3,d6 = profile_active_party_and_gender(input_df, all_officials_party_and_gender)

dir = dir_utils.makesubdir(constants.path_csv, "profile/partyandgender")
csv_utils.make_csv_multiple_dicts(dir, "profile_active_party_and_gender_number_normalized", (d1,d2,d3), [constants.PARTY + ", " + constants.GENDER, constants.HOUSE, constants.SENATE, constants.INPUT])
csv_utils.make_csv_multiple_dicts(dir, "profile_active_party_and_gender_size_normalized", (d4,d5,d6), [constants.PARTY + ", " + constants.GENDER, constants.HOUSE, constants.SENATE, constants.INPUT])


d1,d4 = profile_active_party_and_gender(house_input_df)
d2,d5 = profile_active_party_and_gender(senate_input_df)
d3,d6 = profile_active_party_and_gender(input_df)
csv_utils.make_csv_multiple_dicts(dir, "profile_active_party_and_gender_number", (d1,d2,d3), [constants.PARTY + ", " + constants.GENDER, constants.HOUSE, constants.SENATE, constants.INPUT])
csv_utils.make_csv_multiple_dicts(dir, "profile_active_party_and_gender_size", (d4,d5,d6), [constants.PARTY + ", " + constants.GENDER, constants.HOUSE, constants.SENATE, constants.INPUT])

### Age (Lowest, Highest, Average)

In [5]:
def profile_age(group):
    # {age, age, ...}
    l = []
    
    # group = {link : (canonical_name_input_based, official_object) ... }
    for _, (_, off_obj) in group.items(): 
        l.append(off_obj.get_age())
    
    l.sort()
    
    d = {}
    d["0. Youngest"] = l[0]
    d["1. Average"] = round((sum(l) / len(l)))
    d["2. Oldest"] = l[len(l)-1]
    
    return d 
    
d1 = profile_age(input_house_officials_objects)
d2 = profile_age(input_senate_officials_objects)
d3 = profile_age(input_officials_objects)

dir = dir_utils.makesubdir(constants.path_csv, "profile/age")
csv_utils.make_csv_multiple_dicts(dir, "profile_age", (d1,d2,d3), ["Age", constants.HOUSE, constants.SENATE, constants.INPUT])

def profile_age_2(group):
    # {age : #_of_people, ...}
    d = dict(constants.age_groups)
    
    for (_, off_obj) in group.values(): 
        age = off_obj.get_age()
        d = dict_utils.increment_dictionary(d, ptr_utils.which_age_group(age)) 
    
    return d 
    
d1 = profile_age_2(input_house_officials_objects)
d2 = profile_age_2(input_senate_officials_objects)
d3 = profile_age_2(input_officials_objects)

csv_utils.make_csv_multiple_dicts(dir, "profile_age_2", (d1,d2,d3), ["Age", constants.HOUSE, constants.SENATE, constants.INPUT])

#### Age (Which age is most active?) Active = No. of Trades &  Size of Transactions. 

In [1]:
def profile_active_age(group, normalized=None):
    # {age : 45_transactions}
    d_number = dict(constants.age_groups)
    
    # {age : [gmean, gmean, ...]}
    d_size = dict(constants.age_groups)
    for k,_ in d_size.items():
        d_size[k] = []
    
    for _,t in group.iterrows():
        if ptr_utils.isvalid(t[constants.AMOUNT]):
            obj = t_to_obj(t)
            age = obj.get_age()
            
            age_group = ptr_utils.which_age_group(age)
            d_number =  dict_utils.increment_dictionary(d_number, age_group)
            
            mean =  ptr_utils.get_gmean(t[constants.AMOUNT])             
            d_size = dict_utils.increment_list_in_dictionary(d_size, age_group, mean)

    d_size = dict_utils.flatten_gmean(d_size)

    # Normalize
    if normalized: 
        d_number = dict_utils.normalize(d_number, normalized, percent=True)
        d_number = dict_utils.sort_dictionary_by_tuple(d_number)
        
        d_size = dict_utils.normalize(d_size, normalized, percent=True)
        d_size = dict_utils.sort_dictionary_by_tuple(d_size)

    return d_number, d_size
    
d1,d4 = profile_active_age(house_input_df, profile_age_2(input_house_officials_objects))
d2,d5 = profile_active_age(senate_input_df, profile_age_2(input_senate_officials_objects))
d3,d6 = profile_active_age(input_df, profile_age_2(input_officials_objects))

dir = dir_utils.makesubdir(constants.path_csv, "profile/age")
csv_utils.make_csv_multiple_dicts(dir, "profile_age_active_number_normalized", (d1,d2,d3), [constants.AGE, constants.HOUSE, constants.SENATE, constants.INPUT])
csv_utils.make_csv_multiple_dicts(dir, "profile_age_active_size_normalized", (d4,d5,d6), [constants.AGE, constants.HOUSE, constants.SENATE, constants.INPUT])

    
d1,d4 = profile_active_age(house_input_df)
d2,d5 = profile_active_age(senate_input_df)
d3,d6 = profile_active_age(input_df)

csv_utils.make_csv_multiple_dicts(dir, "profile_age_active_number", (d1,d2,d3), [constants.AGE, constants.HOUSE, constants.SENATE, constants.INPUT])
csv_utils.make_csv_multiple_dicts(dir, "profile_age_active_size", (d4,d5,d6), [constants.AGE, constants.HOUSE, constants.SENATE, constants.INPUT])

NameError: name 'house_input_df' is not defined

### Oldest and Most Recent Dates (transaction and disclosure)

In [7]:
def profile_dates(group, type):
    lowest_tdate = lowest_ddate = highest_tdate = highest_ddate = None
    lowest_tdate_obj = lowest_ddate_obj = highest_tdate_obj = highest_ddate_obj = None
    
    for _,t in group.iterrows():
        curr = t[constants.TDATE]            
        if not lowest_tdate or curr < lowest_tdate:
            lowest_tdate = curr 
            lowest_tdate_obj = t
        if not highest_tdate or curr > highest_tdate:
            highest_tdate = curr  
            highest_tdate_obj = t

        curr = t[constants.DDATE]
        if not lowest_ddate or curr < lowest_ddate:
            lowest_ddate = curr 
            lowest_ddate_obj = t
        if not highest_ddate or curr > highest_ddate:
            highest_ddate = curr 
            highest_ddate_obj = t

    print("Oldest transaction_date for {}: {} \n {}".format(type, lowest_tdate, lowest_tdate_obj[constants.PTR_LINK]))
    print("Most recent transaction_date for {}: {} \n {} ".format(type, highest_tdate, highest_tdate_obj[constants.PTR_LINK]))
    
    print("Oldest disclosure_date for {}: {} \n {}".format(type, lowest_ddate, lowest_ddate_obj[constants.PTR_LINK]))
    print("Most recent disclosure_date for {}: {} \n {}\n".format(type, highest_ddate, highest_ddate_obj[constants.PTR_LINK]))
    

profile_dates(house_input_df, constants.HOUSE)
profile_dates(senate_input_df, constants.SENATE)
profile_dates(input_df, constants.INPUT)

Oldest transaction_date for house: 2018/09/08 
 https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2021/8218371.pdf
Most recent transaction_date for house: 2021/12/31 
 https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2022/20020182.pdf 
Oldest disclosure_date for house: 2020/01/02 
 https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2020/20013832.pdf
Most recent disclosure_date for house: 2022/02/11 
 https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/2022/20020423.pdf

Oldest transaction_date for senate: 2012/06/14 
 https://efdsearch.senate.gov/search/view/ptr/86e969b3-64e7-4a51-84d7-da82847b501e/
Most recent transaction_date for senate: 2021/12/31 
 https://efdsearch.senate.gov/search/view/ptr/41868f55-ad42-4855-9aca-1764a05fb956/ 
Oldest disclosure_date for senate: 2012/07/25 
 https://efdsearch.senate.gov/search/view/paper/CDFDAF62-18EA-4298-B0C5-62085A6EC3CD/
Most recent disclosure_date for senate: 2022/01/21 
 https://efdsearch.senate.gov/search/view/pt

### Gender

In [8]:
def profile_gender(group, normalized=None):
    # d_prime = {'Female' : set(Officials), 'Male' : set(Officials), ...}
    d_prime = {}

    for link, name in group.items(): 
        gender = all_officials_gender[link]
        d_prime = dict_utils.increment_set_in_dictionary(d_prime, gender, name)

    # d = {'Female' : #_of_officials, 'Male' : #_of_officials, ...}
    d = dict_utils.flatten_len(d_prime, inner_set=True)
    
    if normalized:
        d = dict_utils.normalize(d, normalized, percent=True)
        d = dict_utils.sort_dictionary_by_tuple(d)
                 
    return d

d1 = profile_gender(input_house_officials_link, profile_gender(house_officials))
d2 = profile_gender(input_senate_officials_link, profile_gender(senate_officials))
d3 = profile_gender(input_all_officials_link, profile_gender(all_officials))

dir = dir_utils.makesubdir(constants.path_csv, "profile/gender")
csv_utils.make_csv_multiple_dicts(dir, "profile_gender_normalized", (d1,d2,d3), [constants.GENDER, constants.HOUSE, constants.SENATE, constants.INPUT])


d1 = profile_gender(input_house_officials_link)
d2 = profile_gender(input_senate_officials_link)
d3 = profile_gender(input_all_officials_link)

csv_utils.make_csv_multiple_dicts(dir, "profile_gender", (d1,d2,d3), [constants.GENDER, constants.HOUSE, constants.SENATE, constants.INPUT])

#### Gender (Which gender is the most active?) Active = No. of Trades &  Size of Transactions. 

In [10]:
def profile_active_gender(group, normalized=None):
    # {'gender' : 5_trades, ...}
    d_number = {}
    
    # {gender : [gmean of amount, gmean of amount....] }
    d_size = {}

    for _,t in group.iterrows():
        if ptr_utils.isvalid(t[constants.AMOUNT]):
            name = official.get_name(t)
            link = input_all_officials_name[name]
            g = all_officials_gender[link]
            
            d_number =  dict_utils.increment_dictionary(d_number, g)
            
            mean =  ptr_utils.get_gmean(t[constants.AMOUNT])             
            d_size = dict_utils.increment_list_in_dictionary(d_size, g, mean)

    d_size = dict_utils.flatten_gmean(d_size)
        
    # Normalize
    if normalized:
        d_number = dict_utils.normalize(d_number, normalized, percent=True)
        d_number = dict_utils.sort_dictionary_by_tuple(d_number)
        
        d_size = dict_utils.normalize(d_size, normalized, percent=True)
        d_size = dict_utils.sort_dictionary_by_tuple(d_size)

    return d_number, d_size


d1,d4 = profile_active_gender(house_input_df, profile_gender(house_officials))
d2,d5 = profile_active_gender(senate_input_df, profile_gender(senate_officials))
d3,d6 = profile_active_gender(input_df, profile_gender(all_officials))

dir = dir_utils.makesubdir(constants.path_csv, "profile/gender")
csv_utils.make_csv_multiple_dicts(dir, "profile_active_gender_number_normalized", (d1,d2,d3), [constants.GENDER, constants.HOUSE, constants.SENATE, constants.INPUT])
csv_utils.make_csv_multiple_dicts(dir, "profile_active_gender_size_normalized", (d4,d5,d6), [constants.GENDER, constants.HOUSE, constants.SENATE, constants.INPUT])


d1,d4 = profile_active_gender(house_input_df)
d2,d5 = profile_active_gender(senate_input_df)
d3,d6 = profile_active_gender(input_df)

csv_utils.make_csv_multiple_dicts(dir, "profile_active_gender_number", (d1,d2,d3), [constants.GENDER, constants.HOUSE, constants.SENATE, constants.INPUT])
csv_utils.make_csv_multiple_dicts(dir, "profile_active_gender_size", (d4,d5,d6), [constants.GENDER, constants.HOUSE, constants.SENATE, constants.INPUT])

### Party

In [11]:
def profile_party(group, normalized=None):
    # d_prime = {'Republican' : set(Officials), 'Democrat' : set(Officials), ...}
    d_prime = {}
    
    for (_, off_obj) in group.values(): 
        d_prime = dict_utils.increment_set_in_dictionary(d_prime, off_obj.party, off_obj.name)
        
    # d = {'Republican' : #_of_officials, 'Democrat' : #_of_officials, ...}
    d = dict_utils.flatten_len(d_prime, inner_set=True)
    
    if normalized: 
        d = dict_utils.normalize(d, normalized, percent=True)
        d = dict_utils.sort_dictionary_by_tuple(d)

    return d

# {link : (canonical_name_input_based, official_object), ... }
d1 = profile_party(input_house_officials_objects, house_officials_party)
d2 = profile_party(input_senate_officials_objects, senate_officials_party)
# {link : (canonical_name_input_based, official_object), ... }
d3 = profile_party(input_officials_objects, all_officials_party)

dir = dir_utils.makesubdir(constants.path_csv, "profile/party")
csv_utils.make_csv_multiple_dicts(dir, "profile_party_normalized", (d1,d2,d3), [constants.PARTY, constants.HOUSE, constants.SENATE, constants.INPUT])

# {link : (canonical_name_input_based, official_object), ... }
d1 = profile_party(input_house_officials_objects)
d2 = profile_party(input_senate_officials_objects)
# {link : (canonical_name_input_based, official_object), ... }
d3 = profile_party(input_officials_objects)

csv_utils.make_csv_multiple_dicts(dir, "profile_party", (d1,d2,d3), [constants.PARTY, constants.HOUSE, constants.SENATE, constants.INPUT])

#### Party (Which party is the most active?) Active = No. of Trades &  Size of Transactions. 

In [12]:
def profile_active_party(group, normalized=None):
    # {'party' : 5_trades, ...}
    d_number = {}
    
    # {party : [gmean of amount, gmean of amount....] }
    d_size = {}

    for _,t in group.iterrows():
        if ptr_utils.isvalid(t[constants.AMOUNT]):
            obj = t_to_obj(t)
            party = obj.party        
            
            d_number =  dict_utils.increment_dictionary(d_number, party)
            
            mean =  ptr_utils.get_gmean(t[constants.AMOUNT])             
            d_size = dict_utils.increment_list_in_dictionary(d_size, party, mean)

    d_size = dict_utils.flatten_gmean(d_size)
        
    # Normalize
    if normalized:
        d_number = dict_utils.normalize(d_number, normalized, percent=True)
        d_number = dict_utils.sort_dictionary_by_tuple(d_number)
        
        d_size = dict_utils.normalize(d_size, normalized, percent=True)
        d_size = dict_utils.sort_dictionary_by_tuple(d_size)
    
    return d_number, d_size

d1,d4 = profile_active_party(house_input_df, house_officials_party)
d2,d5 = profile_active_party(senate_input_df, senate_officials_party)
d3,d6 = profile_active_party(input_df, all_officials_party)

dir = dir_utils.makesubdir(constants.path_csv, "profile/party")
csv_utils.make_csv_multiple_dicts(dir, "profile_active_party_number_normalized", (d1,d2,d3), [constants.PARTY, constants.HOUSE, constants.SENATE, constants.INPUT])
csv_utils.make_csv_multiple_dicts(dir, "profile_active_party_size_normalized", (d4,d5,d6), [constants.PARTY, constants.HOUSE, constants.SENATE, constants.INPUT])


d1,d4 = profile_active_party(house_input_df)
d2,d5 = profile_active_party(senate_input_df)
d3,d6 = profile_active_party(input_df)

csv_utils.make_csv_multiple_dicts(dir, "profile_active_party_number", (d1,d2,d3), [constants.PARTY, constants.HOUSE, constants.SENATE, constants.INPUT])
csv_utils.make_csv_multiple_dicts(dir, "profile_active_party_size", (d4,d5,d6), [constants.PARTY, constants.HOUSE, constants.SENATE, constants.INPUT])

### State

In [13]:
def profile_state(group, normalized=None):
    # d_prime = {'Maryland' : set(Officials), 'California' : set(Officials), ...}
    d_prime = {}
    
    # input_officials_objects = {link : (canonical_name_input_based, official_object) ... }
    for _, off_obj in group.values(): 
        d_prime = dict_utils.increment_set_in_dictionary(d_prime, off_obj.state, off_obj.name)

    # d = {'Maryland' : #_of_officials, 'California' : #_of_officials, ...}
    d = dict_utils.flatten_len(d_prime, inner_set=True)
    
    if normalized:
        d = dict_utils.normalize(d, normalized, percent=True)
        d = dict_utils.sort_dictionary_by_tuple(d)
            
    return d 

# {link : (canonical_name_input_based, official_object), ... }
d1 = profile_state(input_house_officials_objects, house_officials_state_count)
d2 = profile_state(input_senate_officials_objects, senate_officials_state_count)

# {link : (canonical_name_input_based, official_object) ... }
d3 = profile_state(input_officials_objects, all_officials_state_count)

dir = dir_utils.makesubdir(constants.path_csv, "profile/state")
csv_utils.make_csv_multiple_dicts(dir, "profile_state_normalized", (d1,d2,d3), [constants.STATE, constants.HOUSE, constants.SENATE, constants.INPUT])

# {link : (canonical_name_input_based, official_object), ... }
d1 = profile_state(input_house_officials_objects)
d2 = profile_state(input_senate_officials_objects)

# {link : (canonical_name_input_based, official_object) ... }
d3 = profile_state(input_officials_objects)

csv_utils.make_csv_multiple_dicts(dir, "profile_state", (d1,d2,d3), [constants.STATE, constants.HOUSE, constants.SENATE, constants.INPUT])

#### State (Which state is the most active?) Active = No. of Trades &  Size of Transactions. 

In [14]:
def profile_active_state(group, normalized=None):
    # {'state' : 5_trades, ...}
    d_number = {}
    
    # {state : [gmean of amount, gmean of amount....] }
    d_size = {}

    for _,t in group.iterrows():
        if ptr_utils.isvalid(t[constants.AMOUNT]):
            obj =  t_to_obj(t)
            
            d_number =  dict_utils.increment_dictionary(d_number, obj.state)
            
            mean =  ptr_utils.get_gmean(t[constants.AMOUNT])             
            d_size = dict_utils.increment_list_in_dictionary(d_size, obj.state, mean)

    d_size = dict_utils.flatten_gmean(d_size)
    
    if normalized:
        d_number = dict_utils.normalize(d_number, normalized, percent=True)
        d_number = dict_utils.sort_dictionary_by_tuple(d_number)
        
        d_size = dict_utils.normalize(d_size, normalized, percent=True)
        d_size = dict_utils.sort_dictionary_by_tuple(d_size)

    return d_number, d_size

d1,d4 = profile_active_state(house_input_df, house_officials_state_count)
d2,d5 = profile_active_state(senate_input_df, senate_officials_state_count)
d3,d6 = profile_active_state(input_df, all_officials_state_count)

dir = dir_utils.makesubdir(constants.path_csv, "profile/state")
csv_utils.make_csv_multiple_dicts(dir, "profile_active_state_number_normalized", (d1,d2,d3), [constants.STATE, constants.HOUSE, constants.SENATE, constants.INPUT])
csv_utils.make_csv_multiple_dicts(dir, "profile_active_state_size_normalized", (d4,d5,d6), [constants.STATE, constants.HOUSE, constants.SENATE, constants.INPUT])


d1,d4 = profile_active_state(house_input_df)
d2,d5 = profile_active_state(senate_input_df)
d3,d6 = profile_active_state(input_df)

csv_utils.make_csv_multiple_dicts(dir, "profile_active_state_number", (d1,d2,d3), [constants.STATE, constants.HOUSE, constants.SENATE, constants.INPUT])
csv_utils.make_csv_multiple_dicts(dir, "profile_active_state_size", (d4,d5,d6), [constants.STATE, constants.HOUSE, constants.SENATE, constants.INPUT])

### Seniority (Lowest, Highest, Average)

In [15]:
def profile_seniority(group):
    # d = {x_years_in_congress, y_years_in_congres, ...}
    l = []
    
    # input_officials_objects = {link : (canonical_name_input_based, official_object) ... }
    for _, (_, off_obj) in group.items(): 
        l.append(off_obj.get_seniority())
    
    l.sort()
    
    d = {}
    d["0. Lowest"] = l[0]
    d["1. Average"] = round((sum(l) / len(l)))
    d["2. Highest"] = l[len(l)-1]

    return d 

d1 = profile_seniority(input_house_officials_objects)
d2 = profile_seniority(input_senate_officials_objects)
d3 = profile_seniority(input_officials_objects)

dir = dir_utils.makesubdir(constants.path_csv, "profile/seniority")
csv_utils.make_csv_multiple_dicts(dir, "profile_seniority", (d1,d2,d3), [constants.SENIORITY, constants.HOUSE, constants.SENATE, constants.INPUT])

def profile_seniority_2(group):
    # d = {x_years_in_congress : #_of_people, }
    d = {}
    
    # input_officials_objects = {link : (canonical_name_input_based, official_object) ... }
    for (_, off_obj) in group.values(): 
        d =  dict_utils.increment_dictionary(d, off_obj.get_seniority())
    
    return d 

d1 = profile_seniority_2(input_house_officials_objects)
d2 = profile_seniority_2(input_senate_officials_objects)
d3 = profile_seniority_2(input_officials_objects)

csv_utils.make_csv_multiple_dicts(dir, "profile_seniority_2", (d1,d2,d3), [constants.SENIORITY, constants.HOUSE, constants.SENATE, constants.INPUT])


#### Seniority (Which seniority is most active?) Active = No. of Trades & Size of Transactions. 

In [16]:
def profile_active_seniority(group, normalized=None):
    # {'seniority' : 5_peeps_with_it, ...}
    d_number = {}
    
    # {'seniority' :  [gmean of amount, gmean of amount....] }
    d_size = {}

    for _,t in group.iterrows():
        if ptr_utils.isvalid(t[constants.AMOUNT]):
            obj = t_to_obj(t)
            seniority = obj.get_seniority()
            
            d_number =  dict_utils.increment_dictionary(d_number, seniority)
            
            mean =  ptr_utils.get_gmean(t[constants.AMOUNT])             
            d_size = dict_utils.increment_list_in_dictionary(d_size, seniority, mean)
    
    d_size = dict_utils.flatten_gmean(d_size)

    # Normalize
    if normalized: 
        d_number = dict_utils.normalize(d_number, normalized, percent=True)
        d_size = dict_utils.normalize(d_size, normalized, percent=True)

    return d_number, d_size
    
d1,d4 = profile_active_seniority(house_input_df, profile_seniority_2(input_house_officials_objects))
d2,d5 = profile_active_seniority(senate_input_df, profile_seniority_2(input_senate_officials_objects))
d3,d6 = profile_active_seniority(input_df, profile_seniority_2(input_officials_objects))

dir = dir_utils.makesubdir(constants.path_csv, "profile/seniority")
csv_utils.make_csv_multiple_dicts(dir, "profile_active_seniority_number_normalized", (d1,d2,d3), [constants.AGE, constants.HOUSE, constants.SENATE, constants.INPUT])
csv_utils.make_csv_multiple_dicts(dir, "profile_active_seniority_size_normalized", (d4,d5,d6), [constants.AGE, constants.HOUSE, constants.SENATE, constants.INPUT])


d1,d4 = profile_active_seniority(house_input_df)
d2,d5 = profile_active_seniority(senate_input_df)
d3,d6 = profile_active_seniority(input_df)

csv_utils.make_csv_multiple_dicts(dir, "profile_active_seniority_number", (d1,d2,d3), [constants.AGE, constants.HOUSE, constants.SENATE, constants.INPUT])
csv_utils.make_csv_multiple_dicts(dir, "profile_active_seniority_size", (d4,d5,d6), [constants.AGE, constants.HOUSE, constants.SENATE, constants.INPUT])

### Get_Congress (Lowest, Highest)

### Number of Degrees (Lowest, Highest, Average)

In [17]:
def profile_congress(group):
    lowest = highest = None 
    
    # input_officials_objects = {link : (canonical_name_input_based, official_object) ... }
    for (_, off_obj) in group.values():
        res = off_obj.get_congress()
        
        if not lowest or res[0] < lowest:
            lowest = res[0]
    
        if not highest or res[len(res) - 1] > highest:
            highest = res[len(res) - 1]
                    
    d = {}
    d["Lowest Congress"] = lowest
    d["Highest Congress"] = highest

    return d 
                        
d1 = profile_congress(input_house_officials_objects)
d2 = profile_congress(input_senate_officials_objects)
d3 = profile_congress(input_officials_objects)

dir = dir_utils.makesubdir(constants.path_csv, "profile/congress")
csv_utils.make_csv_multiple_dicts(dir, "profile_congress", (d1,d2,d3), ["", constants.HOUSE, constants.SENATE, constants.INPUT])

In [18]:
def profile_degrees(group):    
    # d = {x_degrees, y_degrees, ...}
    l = []
    
    # input_officials_objects = {link : (canonical_name_input_based, official_object) ... }
    for (_, off_obj) in group.values(): 
        l.append(off_obj.get_num_of_degrees())
    
    l.sort()
    
    d = {}
    d["0. Lowest"] = l[0]
    d["1. Average"] = round((sum(l) / len(l)))
    d["2. Highest"] = l[len(l)-1]

    return d 
    
d1 = profile_degrees(input_house_officials_objects)
d2 = profile_degrees(input_senate_officials_objects)
d3 = profile_degrees(input_officials_objects)       

dir = dir_utils.makesubdir(constants.path_csv, "profile/degrees")
csv_utils.make_csv_multiple_dicts(dir, "profile_degrees", (d1,d2,d3), ["No. of Degrees", constants.HOUSE, constants.SENATE, constants.INPUT]) 

### JD

In [19]:
def profile_JD(group):
    # d = {x_degrees, y_degrees, ...}
    yes = total = 0 
    
    # input_officials_objects = {link : (canonical_name_input_based, official_object) ... }
    for (_, off_obj) in group.values(): 
        if off_obj.has_JD():
            yes += 1 
        total += 1 
        
    d = {}
    
    d["(Raw, Percent)"] = (yes, ptr_utils.make_percent(yes, total))
    
    return d 
    
d1 = profile_JD(input_house_officials_objects)
d2 = profile_JD(input_senate_officials_objects)
d3 = profile_JD(input_officials_objects)        

dir = dir_utils.makesubdir(constants.path_csv, "profile/education")
csv_utils.make_csv_multiple_dicts(dir, "profile_JDs", (d1,d2,d3), ["", constants.HOUSE, constants.SENATE, constants.INPUT]) 

## Transaction Date (transaction_date) 

#### Frequency of Differences between Transaction and Disclosure Date

In [20]:
def frequency_of_differences(group):
    d = {}
    # match = {}
    total = num = 0 

    for _,t in group.iterrows():
        # Negative, X days BEFORE
        # Positive, Y dayas AFTER
        diff = ptr_utils.difference_between_dates(t)      
        total += 1 
        num += diff   
        
        # match = {5 days: {'Tom' : 1313, 'X': 3 , ...}. ..}
        # match =  dict_utils.increment_dictionary_in_dictionary(match, diff, official.get_name(t))
            
        d =  dict_utils.increment_dictionary(d, int(diff))
    
    d["Average"] = ptr_utils.make_percent(num, total)
    
    return d 
    # return dict_utils.sort_dictionary_by_keys(d)

d1 = frequency_of_differences(house_input_df)
d2 = frequency_of_differences(senate_input_df)
d3 = frequency_of_differences(input_df)

dir = dir_utils.makesubdir(constants.path_csv, constants.TDATE)
csv_utils.make_csv_multiple_dicts(dir, "frequency_of_differences", (d1,d2,d3), ["Difference", constants.HOUSE, constants.SENATE, constants.INPUT])

### The most popular transaction_date for each sector.

In [21]:
def transaction_date_wrt_sector(group, diff):
    # d_prime = {'sector' : {'date' : #_of_transactions, ....} , 'sector2' : .... }
    d_prime = {}
    
    for _,t in group.iterrows():
        if ptr_utils.isvalid(t[constants.TICKER]):
            sector = dir_utils.search_mapping(sector_df, t[constants.TICKER], sector=True)            
            d_prime =  dict_utils.increment_dictionary_in_dictionary(d_prime, sector, (t[constants.TDATE]))
       
    # d = {'ticker' : {'best_date' : #_of_transactions}, .... }
    d = dict_utils.flatten_best(d_prime)

    d = dict_utils.sort_dictionary_by_inner_values(d, reverse=True)

    filename = "transaction_date_wrt_sector"  
    if diff:
        filename += "_" + diff 
    key_header = constants.SECTOR
    value_header = constants.TDATE
    value_header2 = constants.NUMT

    dir = dir_utils.makesubdir(constants.path_csv, constants.TDATE)
    wd = csv_utils.make_csv(dir, filename, d, [key_header, value_header, value_header2])
    df = pd.read_csv(wd)
    print(df.head(5))
    return d 

_ = transaction_date_wrt_sector(house_input_df, constants.HOUSE)
_ = transaction_date_wrt_sector(senate_input_df, constants.SENATE)
_ = transaction_date_wrt_sector(input_df, constants.INPUT)

               sector transaction_date  number_of_transactions
0  Financial Services       2020/03/18                      63
1   Consumer Cyclical       2020/03/18                      45
2         Industrials       2020/03/18                      37
3          Technology       2019/06/24                      37
4          Healthcare       2019/06/24                      32
               sector transaction_date  number_of_transactions
0  Financial Services       2020/04/14                      30
1                Fund       2020/04/02                      27
2   Consumer Cyclical       2020/04/14                      23
3          Technology       2020/04/14                      20
4         Industrials       2020/04/07                      20
               sector transaction_date  number_of_transactions
0  Financial Services       2020/03/18                      65
1   Consumer Cyclical       2020/03/18                      48
2         Industrials       2020/03/18                 

### The most popular transaction_date for each sector controlling for each official. 

In [22]:
def transaction_date_wrt_sector_controlled(group, diff):    
    # d_prime = { ('ticker' : {'date' : set(people_who_traded_on_that_day) , ....} , ....}
    d_prime = {}
    
    for _,t in group.iterrows():
        if ptr_utils.isvalid(t[constants.TICKER]):
            d_prime = dict_utils.increment_set_in_inner_dictionary(d_prime, t[constants.TICKER], t[constants.TDATE], official.get_name(t))
       
    
    d_prime = dict_utils.flatten_len(d_prime)

    # d = {'ticker' : {'best_date' : #_of_transactions}, .... }
    d = dict_utils.flatten_best(d_prime)

    d = dict_utils.sort_dictionary_by_inner_values(d, reverse=True)

    
    filename = "transaction_date_wrt_sector_controlled"
    if diff:
        filename += "_" + diff 
     
    key_header = constants.SECTOR
    value_header = constants.TDATE
    value_header2 = constants.NUMT

    dir = dir_utils.makesubdir(constants.path_csv, constants.TDATE)
    wd = csv_utils.make_csv(dir, filename, d, [key_header, value_header, value_header2])
    df = pd.read_csv(wd)
    print(df.head(5))
    return d 

_ = transaction_date_wrt_sector_controlled(house_input_df, constants.HOUSE)
_ = transaction_date_wrt_sector_controlled(senate_input_df, constants.SENATE)
_ = transaction_date_wrt_sector_controlled(input_df, constants.INPUT)


  sector transaction_date  number_of_transactions
0   MSFT       2021/04/30                       3
1   TSLA       2021/03/22                       3
2   AAPL       2020/03/23                       3
3    UNH       2020/03/23                       3
4    DIS       2020/03/18                       3
  sector transaction_date  number_of_transactions
0   AAPL       2020/08/27                       3
1   DWDP       2017/09/01                       3
2   AMZN       2020/06/26                       2
3    XOM       2020/04/14                       2
4    CVS       2020/04/14                       2
  sector transaction_date  number_of_transactions
0   AAPL       2021/10/29                       3
1   TSLA       2021/03/22                       3
2   GOOG       2020/04/17                       3
3    RTX       2020/04/07                       3
4   SBUX       2020/03/27                       3


### The most popular transaction_date for each industry.

In [23]:
def transaction_date_wrt_industry(group, diff):    
    # d_prime = {'industry' : {'date' : #_of_transactions, ....} , 'industry2' : .... }
    d_prime = {}
    
    for _,t in group.iterrows():
        if ptr_utils.isvalid(t[constants.TICKER]):
            industry = dir_utils.search_mapping(industry_df, t[constants.TICKER], industry=True)

            d_prime =  dict_utils.increment_dictionary_in_dictionary(d_prime, industry, (t[constants.TDATE]))
       
    # d = {'ticker' : {'best_date' : #_of_transactions}, .... }
    d = dict_utils.flatten_best(d_prime)

    d = dict_utils.sort_dictionary_by_inner_values(d, reverse=True)

    filename = "transaction_date_wrt_industry"
    if diff:
        filename += "_" + diff  
    key_header = constants.INDUSTRY
    value_header = constants.TDATE
    value_header2 = constants.NUMT

    dir = dir_utils.makesubdir(constants.path_csv, constants.TDATE)
    wd = csv_utils.make_csv(dir, filename, d, [key_header, value_header, value_header2])
    df = pd.read_csv(wd)
    print(df.head(5))
    return d 

_ = transaction_date_wrt_industry(house_input_df, constants.HOUSE)
_ = transaction_date_wrt_industry(senate_input_df, constants.SENATE)
_ = transaction_date_wrt_industry(input_df, constants.INPUT)


                industry transaction_date  number_of_transactions
0                   Fund       2020/10/09                      28
1         Semiconductors       2020/07/13                      24
2  Oil & Gas - Midstream       2020/06/10                      22
3                  Banks       2020/03/18                      19
4   Application Software       2020/02/13                      18
                  industry transaction_date  number_of_transactions
0                     Fund       2020/04/02                      27
1    Oil & Gas - Midstream       2020/04/15                      16
2            Entertainment       2020/04/14                      14
3       Drug Manufacturers       2020/04/14                      11
4  Consumer Packaged Goods       2020/04/07                      10
                industry transaction_date  number_of_transactions
0                   Fund       2020/04/02                      28
1         Semiconductors       2020/07/13                      2

### The most popular transaction_date for each industry controlling for official. 

In [24]:
def transaction_date_wrt_industry_controlled(group, diff):
    
    # d_prime = { ('ticker' : {'date' : set(people_who_traded_on_that_day) , ....} , ....}
    d_prime = {}
    
    for _,t in group.iterrows():
        if ptr_utils.isvalid(t[constants.TICKER]):
            industry = dir_utils.search_mapping(industry_df, t[constants.TICKER], industry=True)
            
            d_prime = dict_utils.increment_set_in_inner_dictionary(d_prime, industry, t[constants.TDATE],  official.get_name(t))

    d_prime = dict_utils.flatten_len(d_prime)
    
    # d = {'ticker' : {'best_date' : #_of_transactions}, .... }
    d = dict_utils.flatten_best(d_prime)

    d = dict_utils.sort_dictionary_by_inner_values(d, reverse=True)
    
    filename = "transaction_date_wrt_industry_controlled"
    if diff:
        filename += "_" + diff  
    key_header = constants.INDUSTRY
    value_header = constants.TDATE
    value_header2 = constants.NUMT

    dir = dir_utils.makesubdir(constants.path_csv, constants.TDATE)
    wd = csv_utils.make_csv(dir, filename, d, [key_header, value_header, value_header2])
    df = pd.read_csv(wd)
    print(df.head(5))
    return d 

_ = transaction_date_wrt_industry_controlled(house_input_df, constants.HOUSE)
_ = transaction_date_wrt_industry_controlled(senate_input_df, constants.SENATE)
_ = transaction_date_wrt_industry_controlled(input_df, constants.INPUT)


               industry transaction_date  number_of_transactions
0  Application Software       2020/03/26                       6
1   Brokers & Exchanges       2021/03/01                       4
2   Aerospace & Defense       2020/04/03                       4
3                  Fund       2020/03/10                       4
4        Semiconductors       2021/11/18                       3
            industry transaction_date  number_of_transactions
0  Computer Hardware       2020/08/27                       3
1  Health Care Plans       2020/04/14                       3
2   Asset Management       2020/04/14                       3
3    Credit Services       2020/04/14                       3
4      Entertainment       2020/04/06                       3
               industry transaction_date  number_of_transactions
0  Application Software       2020/03/26                       7
1         Entertainment       2020/04/06                       5
2   Aerospace & Defense       2020/04/03   

### The most popular transaction_date for each ticker. 

In [25]:
def transaction_date_wrt_ticker(group, diff):    
    # d_prime = {'ticker' : {'date' : #_of_transactions, ....} , 'ticker2' : .... }
    d_prime = {}
    
    for _,t in group.iterrows():
        if ptr_utils.isvalid(t[constants.TICKER]):
            d_prime =  dict_utils.increment_dictionary_in_dictionary(d_prime, t[constants.TICKER], (t[constants.TDATE]))
       
    # d = {'ticker' : {'best_date' : #_of_transactions}, .... }
    d = dict_utils.flatten_best(d_prime)

    d = dict_utils.sort_dictionary_by_inner_values(d, reverse=True)

    filename = "transaction_date_wrt_ticker"
    if diff:
        filename += "_" + diff
    key_header = constants.TICKER
    value_header = constants.TDATE
    value_header2 = constants.NUMT

    dir = dir_utils.makesubdir(constants.path_csv, constants.TDATE)
    wd = csv_utils.make_csv(dir, filename, d, [key_header, value_header, value_header2])
    df = pd.read_csv(wd)
    print(df.head(5))
    return d 

_ = transaction_date_wrt_ticker(house_input_df, constants.HOUSE)
_ = transaction_date_wrt_ticker(senate_input_df, constants.SENATE)
_ = transaction_date_wrt_ticker(input_df, constants.INPUT)


  ticker transaction_date  number_of_transactions
0    RUN       2020/07/13                      24
1   MSFT       2020/02/13                      18
2     SO       2020/03/16                      16
3    AMN       2020/09/02                       9
4   CRWD       2021/10/01                       8
  ticker transaction_date  number_of_transactions
0   ECOM       2021/02/10                       8
1      X       2021/05/06                       6
2    OXY       2021/02/16                       6
3    CLF       2021/07/21                       5
4     AA       2021/01/06                       5
  ticker transaction_date  number_of_transactions
0    RUN       2020/07/13                      24
1   MSFT       2020/02/13                      18
2     SO       2020/03/16                      16
3    AMN       2020/09/02                       9
4   CRWD       2021/10/01                       8


### The most popular transaction_date for each ticker controlling for official.

In [26]:
def transaction_date_wrt_ticker_controlled(group, diff):    
    # d_prime = { ('ticker' : {'date' : set(people_who_traded_on_that_day) , ....} , ....}
    d_prime = {}
    
    for _,t in group.iterrows():    
        if ptr_utils.isvalid(t[constants.TICKER]):    
            name = official.get_name(t)
            d_prime = dict_utils.increment_set_in_inner_dictionary(d_prime, t[constants.TICKER], t[constants.TDATE], name)
       
    d_prime = dict_utils.flatten_len_inner_set(d_prime)

    # d = {'ticker' : {'best_date' : #_of_transactions}, .... }
    d = dict_utils.flatten_best(d_prime)
    d = dict_utils.sort_dictionary_by_inner_values(d, reverse=True)

    
    filename = "transaction_date_wrt_ticker_controlled"
    if diff:
        filename += "_" + diff  
        
    key_header = constants.TICKER
    value_header = constants.TDATE
    value_header2 = constants.NUMT

    dir = dir_utils.makesubdir(constants.path_csv, constants.TDATE)
    wd = csv_utils.make_csv(dir, filename, d, [key_header, value_header, value_header2])
    return d 

_ = transaction_date_wrt_ticker_controlled(house_input_df, constants.HOUSE)
_ = transaction_date_wrt_ticker_controlled(senate_input_df, constants.SENATE)
_ = transaction_date_wrt_ticker_controlled(input_df, constants.INPUT)

### The most popular transaction_date for type.

In [27]:
def transaction_date_wrt_type(group, diff):        
    # d_prime = {'type' : {'date' : #_of_transactions, ....} , 'type2' : .... }
    d_prime = {}
    
    for _,t in group.iterrows():
        if ptr_utils.isvalid(t[constants.TICKER]):
            d_prime =  dict_utils.increment_dictionary_in_dictionary(d_prime, ptr_utils.format_type(t[constants.TYPE]), (t[constants.TDATE]))
       
    # d = {'type' : {'best_date' : #_of_transactions}, .... }
    d = dict_utils.flatten_best(d_prime)

    d = dict_utils.sort_dictionary_by_inner_values(d, reverse=True)

    filename = "transaction_date_wrt_type"
    if diff:
        filename += "_" + diff  
    key_header = constants.TYPE
    value_header = constants.TDATE
    value_header2 = constants.NUMT

    dir = dir_utils.makesubdir(constants.path_csv, constants.TDATE)
    wd = csv_utils.make_csv(dir, filename, d, [key_header, value_header, value_header2])
    df = pd.read_csv(wd)
    print(df)
    return d 

_ = transaction_date_wrt_type(house_input_df, constants.HOUSE)
_ = transaction_date_wrt_type(senate_input_df, constants.SENATE)
_ = transaction_date_wrt_type(input_df, constants.INPUT)

             type transaction_date  number_of_transactions
0        Purchase       2020/03/18                     204
1     Sale (Full)       2019/06/24                     204
2  Sale (Partial)       2020/11/13                      60
3        Exchange       2020/02/24                      18
             type transaction_date  number_of_transactions
0     Sale (Full)       2020/04/14                     116
1        Purchase       2017/03/16                      78
2  Sale (Partial)       2020/04/14                      26
3        Exchange       2017/09/01                       5
             type transaction_date  number_of_transactions
0        Purchase       2020/03/18                     212
1     Sale (Full)       2019/06/24                     204
2  Sale (Partial)       2020/11/13                      60
3        Exchange       2020/02/24                      18


### The most popular transaction_date for type controlling for official. 

In [28]:
def transaction_date_wrt_type_controlled(group, diff):    
    # d_prime = { ('type' : {'date' : set(people_who_traded_on_that_day) , ....} , ....}
    d_prime = {}
    
    for _,t in group.iterrows():
        if ptr_utils.isvalid(t[constants.TICKER]):
            d_prime = dict_utils.increment_set_in_inner_dictionary(d_prime, t[constants.TYPE], (t[constants.TDATE]), official.get_name(t))
       

    d_prime = dict_utils.flatten_len(d_prime)
            
    # d = {'type' : {'best_date' : #_of_transactions}, .... }
    d = dict_utils.flatten_best(d_prime)

    d = dict_utils.sort_dictionary_by_inner_values(d, reverse=True)
    
    
    filename = "transaction_date_wrt_type_controlled"
    if diff:
        filename += "_" + diff  
    key_header = constants.TYPE
    value_header = constants.TDATE
    value_header2 = constants.NUMT

    dir = dir_utils.makesubdir(constants.path_csv, constants.TDATE)
    wd = csv_utils.make_csv(dir, filename, d, [key_header, value_header, value_header2])
    df = pd.read_csv(wd)
    print(df.head(5))
    return d 

_ = transaction_date_wrt_type_controlled(house_input_df, constants.HOUSE)
_ = transaction_date_wrt_type_controlled(senate_input_df, constants.SENATE)
_ = transaction_date_wrt_type_controlled(input_df, constants.INPUT)


             type transaction_date  number_of_transactions
0        Purchase       2020/11/13                      12
1     Sale (Full)       2020/03/23                       9
2  Sale (Partial)       2020/03/18                       6
3        Exchange       2020/04/03                       5
             type transaction_date  number_of_transactions
0     Sale (Full)       2020/07/07                       4
1        Purchase       2019/02/27                       4
2  Sale (Partial)       2020/03/17                       3
3        Exchange       2017/09/01                       3
             type transaction_date  number_of_transactions
0        Purchase       2021/01/15                      12
1     Sale (Full)       2020/03/23                       9
2  Sale (Partial)       2020/03/18                       7
3        Exchange       2020/04/03                       6


### The most popular transaction_date for amount.

In [29]:
def transaction_date_wrt_amount(group, diff):
    # d_prime = {'amount' : {'date' : #_of_transactions, ....} , 'amount1' : .... }
    d_prime = {}
    
    for _,t in group.iterrows():
        if ptr_utils.isvalid(t[constants.TICKER]):
            d_prime =  dict_utils.increment_dictionary_in_dictionary(d_prime, t[constants.AMOUNT], (t[constants.TDATE]))
       
    # d = {'type' : {'best_date' : #_of_transactions}, .... }
    d = dict_utils.flatten_best(d_prime)

    d = dict_utils.add_sort_key_for_amount(d)
    d = dict_utils.sort_dictionary_by_sort_key(d)

    filename = "transaction_date_wrt_amount"
    if diff:
        filename += "_" + diff  
    key_header = constants.TYPE

    dir = dir_utils.makesubdir(constants.path_csv, constants.TDATE)
    wd = csv_utils.make_csv_breakdown(dir, filename, d, key_header)
    df = pd.read_csv(wd)
    print(df.head(5))
    return d 

_ = transaction_date_wrt_amount(house_input_df, constants.HOUSE)
_ = transaction_date_wrt_amount(senate_input_df, constants.SENATE)
_ = transaction_date_wrt_amount(input_df, constants.INPUT)


                type  2020/03/18  2020/07/13  2020/11/17  2020/11/19  \
0           $1,001 -           0          12           0           0   
1   $1,001 - $15,000         257           0           0           0   
2   $1,000 - $15,000           0           0           0           0   
3  $15,001 - $50,000           0           0           0          68   
4  $15,000 - $50,000           0           0           0           0   

   2021/01/19  2021/02/12  2021/04/09  2021/04/27  2021/05/20  2021/08/16  \
0           0           0           0           0           0           0   
1           0           0           0           0           0           0   
2           0           0           0           0           4           0   
3           0           0           0           0           0           0   
4           0           0           0           0           3           0   

   2021/11/15  sort_key  
0           0      1001  
1           0     15000  
2           0     15000  


### The most popular transaction_date for amount controlling for official.

In [30]:
def transaction_date_wrt_amount_controlled(group, diff):
    # d_prime = { ('amount' : {'date' : set(people_who_traded_on_that_day) , ....} , ....}
    d_prime = {}
    
    for _,t in group.iterrows():
       d_prime = dict_utils.increment_set_in_inner_dictionary(d_prime, t[constants.AMOUNT], (t[constants.TDATE]), official.get_name(t))
       

    d_prime = dict_utils.flatten_len(d_prime)
        
    # d = {'amount' : {'best_date' : #_of_transactions}, .... }
    d = dict_utils.flatten_best(d_prime)

    d = dict_utils.add_sort_key_for_amount(d)
    d = dict_utils.sort_dictionary_by_sort_key(d)
    
    
    filename = "transaction_date_wrt_amount_controlled"
    if diff:
        filename += "_" + diff  
    key_header = constants.AMOUNT

    dir = dir_utils.makesubdir(constants.path_csv, constants.TDATE)
    wd = csv_utils.make_csv_breakdown(dir, filename, d, key_header)
    df = pd.read_csv(wd)
    print(df.head(5))
    return d 

_ = transaction_date_wrt_amount_controlled(house_input_df, constants.HOUSE)
_ = transaction_date_wrt_amount_controlled(senate_input_df, constants.SENATE)
_ = transaction_date_wrt_amount_controlled(input_df, constants.INPUT)

              amount  2020/01/09  2020/03/13  2020/03/18  2020/08/07  \
0           $1,001 -           0           0           0           0   
1   $1,001 - $15,000           0           0          13           0   
2   $1,000 - $15,000           0           0           0           0   
3  $15,001 - $50,000           0           0           0           0   
4  $15,000 - $50,000           0           0           0           0   

   2020/09/02  2020/11/17  2021/01/04  2021/01/15  2021/01/22  2021/02/16  \
0           0           0           0           0           0           0   
1           0           0           0           0           0           0   
2           0           0           0           0           0           0   
3           0           0           0           0           0           8   
4           0           0           0           0           0           0   

   2021/02/22  2021/05/20  sort_key  
0           2           0      1001  
1           0           0   

### The most popular transaction_date for each official.

In [31]:
def transaction_date_wrt_official(group, diff):
    # d_prime = {'person1' : {'date' : #_of_transactions, ....} , 'person2' : .... }
    d_prime = {}
    
    for _,t in group.iterrows():
        if ptr_utils.isvalid(t[constants.TICKER]):
            d_prime =  dict_utils.increment_dictionary_in_dictionary(d_prime, official.get_name(t), (t[constants.TDATE]))
       
    # d = {'person' : {'best_date' : #_of_transactions}, .... }
    d = dict_utils.flatten_best(d_prime)


    d = dict_utils.sort_dictionary_by_inner_values(d, reverse=True)

    filename = "transaction_date_wrt_official"
    if diff:
        filename += "_" + diff 

    dir = dir_utils.makesubdir(constants.path_csv, constants.TDATE)
    wd = csv_utils.make_csv(dir, filename, d, [constants.OFFICIAL, constants.TDATE, constants.NUMT])
    df = pd.read_csv(wd)
    print(df.head(5))
    return d 

transaction_date_wrt_official_res = transaction_date_wrt_official(house_input_df, constants.HOUSE)
transaction_date_wrt_official_res = transaction_date_wrt_official(senate_input_df, constants.SENATE)
transaction_date_wrt_official_res = transaction_date_wrt_official(input_df, constants.INPUT)

            official transaction_date  number_of_transactions
0     Shalala, Donna       2019/06/24                     204
1  Cisneros, Gilbert       2020/03/18                     160
2      Meijer, Peter       2021/02/16                     150
3     Phillips, Dean       2020/04/02                     114
4    Sherrill, Mikie       2020/02/20                     113
               official transaction_date  number_of_transactions
0       Loeffler, Kelly       2020/04/07                     111
1  Perdue Jr., David A.       2020/04/14                     110
2     Tillis, Thomas R.       2015/02/13                      93
3         Murray, Patty       2017/06/15                      83
4     Collins, Susan M.       2014/05/07                      64
            official transaction_date  number_of_transactions
0     Shalala, Donna       2019/06/24                     204
1  Cisneros, Gilbert       2020/03/18                     160
2      Meijer, Peter       2021/02/16               

In [32]:
def num_of_trans_per_date(group):
    d={}

    for _,t in group.iterrows():
        d =  dict_utils.increment_dictionary(d, (t[constants.TDATE]))

    return d 

d1 = num_of_trans_per_date(house_input_df)
d2 = num_of_trans_per_date(senate_input_df)
d3 = num_of_trans_per_date(input_df)

dir = dir_utils.makesubdir(constants.path_csv, constants.TDATE)
csv_utils.make_csv_multiple_dicts(dir, "num_of_trans_per_date", (d1,d2,d3), [constants.TDATE, constants.HOUSE, constants.SENATE, constants.INPUT])


### Frequency of Transactions per Date Controlled
_Number of transactions per date controlled by official. E.g. if Ted Baker made 40 transactions on 1/1/02 and Sam Wall made 2 transactions on 1/1/02, we conclude that there were two transactions on 1/1/02._


In [33]:
def num_of_trans_per_date_controlled(group):    
    d = {}

    for _,t in group.iterrows():
        d =  dict_utils.increment_dictionary_in_dictionary(d, (t[constants.TDATE]), official.get_name(t))

    return dict_utils.flatten_len(d, inner_set=True)
    
d1 = num_of_trans_per_date_controlled(house_input_df)
d2 = num_of_trans_per_date_controlled(senate_input_df)
d3 = num_of_trans_per_date_controlled(input_df)

dir = dir_utils.makesubdir(constants.path_csv, constants.TDATE)
csv_utils.make_csv_multiple_dicts(dir, "num_of_trans_per_date_controlled", (d1,d2,d3), [constants.TDATE, constants.HOUSE, constants.SENATE, constants.INPUT])


### Tax

#### Number of Transactions Within 2 Weeks Prior to Quarterly Tax Date 

In [34]:
def num_of_trans_within_tax_date(group):        
        total = within = 0 
        d = {}

        for _,t in group.iterrows():  
                total += 1 
                if ptr_utils.within_tax_date(t[constants.TDATE]):
                        within += 1 

        d["(No. of transactions within 2 weeks of tax deadline, %)"] = (within,  ptr_utils.make_percent(within, total))

        return d 

d1 = num_of_trans_within_tax_date(house_input_df)
d2 = num_of_trans_within_tax_date(senate_input_df)
d3 = num_of_trans_within_tax_date(input_df)

dir = dir_utils.makesubdir(constants.path_csv, constants.TDATE)
csv_utils.make_csv_multiple_dicts(dir, "num_of_trans_within_tax_date", (d1,d2,d3), ["", constants.HOUSE, constants.SENATE, constants.INPUT])

#### Number of Transactions Within 2 Weeks Prior to Quarterly Tax Date Semi-Controlled 

_Given dict='09/03/2021': {'Thomas H Tuberville': 1, 'Cynthia M Lummis': 1, 'A. Mitchell Mcconnell, Jr.': 1}...I only incremement the number of within (tax date) once per date per official. So, if an official does 100 transactions on a date within two weeks of a quarterly deadline, then I only count it as one transaction._

_A Note: total === number of transactions per person per date (so not really all transactions) because someone could have potentially made 60 transactions on one date which we don't include in neither total or within, if applicable._

In [35]:
def num_of_trans_within_tax_date_controlled(group):
        total = within = 0 
        people = set()
        d = {}

        for _, t in group.iterrows():
                name = official.get_name(t)
                if ptr_utils.within_tax_date(t[constants.TDATE]) and name not in people:
                        people.add(name)
                        within += 1 
                total += 1         
                
        d["(No. of transactions within 2 weeks of tax deadline, %)"] = (within,  ptr_utils.make_percent(within, total))

        return d

d1 = num_of_trans_within_tax_date_controlled(house_input_df)
d2 = num_of_trans_within_tax_date_controlled(senate_input_df)
d3 = num_of_trans_within_tax_date_controlled(input_df)

dir = dir_utils.makesubdir(constants.path_csv, constants.TDATE)
csv_utils.make_csv_multiple_dicts(dir, "num_of_trans_within_tax_date_controlled", (d1,d2,d3), ["", constants.HOUSE, constants.SENATE, constants.INPUT])

In [36]:
# def people_and_within_tax_date(people):        
#         # todo get number of senators. 
#         # todo is the monetary value of that equal!!!! 
#         d = {}
#         for i in people:
#                 d[i] = ""
                
#         d = dict_utils.sort_dictionary_by_keys(d)
        
#         dir = dir_utils.makesubdir(constants.path_csv, "transaction_date/tax")
#         wd = csv_utils.make_csv(dir, "people_and_within_tax_date_list", d, ["Officials"])
#         df = pd.read_csv(wd)
#         print("People who posted transactions within two weeks of quarterly tax deadline:\n {}\n".format(df.head(5)))

#         print("Number of people who posted transactions within two weeks of quarterly tax deadline: {}\n".format(len(people)))
        
#         party = {}
#         for p in people:
#                 link = search.get_wiki_link(p)
#                 _, obj = input_officials_objects[link]
#                 party =  dict_utils.increment_dictionary(party, obj.party)
                
#         party = dict_utils.sort_dictionary_by_values(party)
        
#         wd = csv_utils.make_csv(dir, "people_and_within_tax_date_list_w_aff", party, ["party", "number_of_filing_within_tax_date"])
#         df = pd.read_csv(wd)
#         print("Party breakdown of people who posted transactions within two weeks of quarterly tax deadline:\n {}\n".format(df.head(5)))

# people_and_within_tax_date(num_of_trans_within_tax_date_controlled_res_house)
# people_and_within_tax_date(num_of_trans_within_tax_date_controlled_res_senate)
# people_and_within_tax_date(num_of_trans_within_tax_date_controlled_res_input)

In [37]:
# def people_and_within_tax_date_how_often(people):

#         d = {}
#         d_controlled_by_dates = {}
        
#         for _,t in input_df.iterrows():
#                 if official.get_canonical_name(t[title]) in people and ptr_utils.within_tax_date(t[constants.TDATE]):
#                         d =  dict_utils.increment_dictionary(d, t[title])
#                         d_controlled_by_dates =  dict_utils.increment_dictionary_in_dictionary(d_controlled_by_dates, t[constants.TDATE], t[title])

#         d_controlled_by_dates_res  = {}
#         for date in d_controlled_by_dates:
#                 for person in d_controlled_by_dates[date]:
#                         d_controlled_by_dates_res =  dict_utils.increment_dictionary(d_controlled_by_dates_res, person)

#         d = dict_utils.sort_dictionary_by_values(d)
#         d_controlled_by_dates_res = dict_utils.sort_dictionary_by_values(d_controlled_by_dates_res)

#         dir = dir_utils.makesubdir(constants.path_csv, "transaction_date/tax")
#         wd = csv_utils.make_csv(dir, "people_and_within_tax_date_how_often", d, [title, "number_of_filing_within_tax_date"])
#         df = pd.read_csv(wd)
#         print("People who posted transactions within two weeks of quarterly tax deadline and the number of transactions posted:\n {}\n".format(df.head(5)))

#         wd = csv_utils.make_csv(dir, "people_and_within_tax_date_how_often_date_controlled", d_controlled_by_dates_res, [title, "number_of_filing_within_tax_date_date_controlled"])
#         df = pd.read_csv(wd)
#         print("People who posted transactions within two weeks of quarterly tax deadline and the number of transactions posted controlled by date:\n {}\n".format(df.head(5)))

          
# people_and_within_tax_date_how_often(num_of_trans_within_tax_date_controlled_res)



## Owner (owner) 

### Frequency Count of Owner

In [38]:
def freq_count_of_owner(group):
    # d = {'Joint' : 5}
    d = {}

    for _,t in group.iterrows():
        if constants.OWNER in t and ptr_utils.isvalid(t[constants.OWNER]) :
            d =  dict_utils.increment_dictionary(d, t[constants.OWNER].capitalize())
        
    return dict_utils.sort_dictionary_by_values(d)
    
    
# {link : (canonical_name_input_based, official_object), ... }
d1 = freq_count_of_owner(house_input_df)
d2 = freq_count_of_owner(senate_input_df)

# {link : (canonical_name_input_based, official_object) ... }
d3 = freq_count_of_owner(input_df)

dir = dir_utils.makesubdir(constants.path_csv, constants.OWNER)
csv_utils.make_csv_multiple_dicts(dir, "freq_count_of_owner", (d1,d2,d3), ["Owner", constants.HOUSE, constants.SENATE, constants.INPUT]) 

In [39]:
def freq_count_by_spouse(group):
    # d = {'x_spouse' : 5}
    d = {}

    for _,t in group.iterrows():
        if constants.OWNER in t and ptr_utils.isvalid(t[constants.OWNER]) and t[constants.OWNER].capitalize() == 'Spouse':
            d =  dict_utils.increment_dictionary(d, official.get_name(t))
        
    return d
    
    
# {link : (canonical_name_input_based, official_object), ... }
d1 = freq_count_by_spouse(house_input_df)
d2 = freq_count_by_spouse(senate_input_df)

# {link : (canonical_name_input_based, official_object) ... }
d3 = freq_count_by_spouse(input_df)

dir = dir_utils.makesubdir(constants.path_csv, constants.OWNER)
csv_utils.make_csv_multiple_dicts(dir, "freq_count_by_spouse", (d1,d2,d3), ["Owner", constants.HOUSE, constants.SENATE, constants.INPUT]) 

## Ticker (ticker) 

### Number of Tickers

In [40]:
def num_of_tickers(group):
    # d = {'ticker' : #_of_times }
    d = {}
    
    for _,t in group.iterrows():
        if ptr_utils.isvalid(t[constants.TICKER]):
            d =  dict_utils.increment_dictionary(d, t[constants.TICKER])
       

    return dict_utils.sort_dictionary_by_values(d, reverse=True)

d1 = num_of_tickers(house_input_df)
d2 = num_of_tickers(senate_input_df)
d3 = num_of_tickers(input_df)
    
dir = dir_utils.makesubdir(constants.path_csv, constants.TICKER)
csv_utils.make_csv_multiple_dicts(dir, "num_of_tickers", (d1,d2,d3), [constants.TICKER, constants.HOUSE, constants.SENATE, constants.INPUT])

### Frequency of Ticker per Year

In [42]:
def frequency_of_ticker_breakdown_year(group, diff):
    d = {}

    # {"ticker" : {"year" : number, "year" : number, ...}}
    for _,t in group.iterrows():
        if ptr_utils.isvalid(t[constants.TICKER]):
            d = dict_utils.increment_dictionary_in_dictionary(d, t[constants.TICKER], ptr_utils.get_year(t[constants.TDATE]))
            d = dict_utils.increment_dictionary_in_dictionary(d, t[constants.TICKER], 9999)

    d = dict_utils.sort_dictionary_by_keys(d)
    filename = "frequency_of_ticker_breakdown_year"
    if diff:
        filename += "_" + diff
    key_header = constants.TICKER
    
    
    dir = dir_utils.makesubdir(constants.path_csv, constants.TICKER)
    wd = csv_utils.make_csv_breakdown(dir, filename, d, key_header)
    print(pd.read_csv(wd).head(2))

frequency_of_ticker_breakdown_year(house_input_df, constants.HOUSE)
frequency_of_ticker_breakdown_year(senate_input_df, constants.SENATE)
frequency_of_ticker_breakdown_year(input_df, constants.INPUT)

   ticker  2018  2019  2020  2021  9999
0  35G.SG     0     0     0     2     2
1     7XY     0     0     0     1     1
    ticker  2012  2013  2014  2015  2016  2017  2018  2019  2020  2021  9999
0  0QZI.IL     0     0     0     0     0     0     0     2     0     0     2
1  3V64.TI     0     0     0     0     0     0     0     2     0     0     2
    ticker  2012  2013  2014  2015  2016  2017  2018  2019  2020  2021  9999
0  0QZI.IL     0     0     0     0     0     0     0     2     0     0     2
1   35G.SG     0     0     0     0     0     0     0     0     0     2     2


### Frequency of Ticker per Date

In [43]:
def frequency_of_ticker_by_date(group, diff):
    # {ticker : {date : ___}}
    d = {}
    
    for _,t in group.iterrows():
        if ptr_utils.isvalid(t[constants.TICKER]):
            d = dict_utils.increment_dictionary_in_dictionary(d, t[constants.TICKER], t[constants.TDATE])

    
    d = dict_utils.flatten_best(d)
    

    filename = "frequency_of_ticker_by_date"
    if diff:
        filename += "_" + diff
    
    dir = dir_utils.makesubdir(constants.path_csv, constants.TICKER)
    wd = csv_utils.make_csv_breakdown(dir, filename, d, constants.TICKER)
    print(pd.read_csv(wd).head(2))

d1 = frequency_of_ticker_by_date(house_input_df, constants.HOUSE)
d2 = frequency_of_ticker_by_date(senate_input_df, constants.SENATE)
d3 = frequency_of_ticker_by_date(input_df, constants.INPUT)

  ticker  2018/12/27  2019/01/09  2019/01/10  2019/01/15  2019/01/23  \
0     BP           0           0           0           0           0   
1    XOM           0           0           0           0           0   

   2019/01/30  2019/02/01  2019/02/05  2019/02/06  ...  2021/12/15  \
0           0           0           0           0  ...           0   
1           0           0           0           0  ...           0   

   2021/12/16  2021/12/17  2021/12/20  2021/12/21  2021/12/22  2021/12/23  \
0           0           0           0           0           0           0   
1           0           0           0           0           0           0   

   2021/12/28  2021/12/30  2021/12/31  
0           0           0           0  
1           0           0           0  

[2 rows x 479 columns]
  ticker  2013/04/29  2013/05/01  2014/01/29  2014/02/06  2014/02/19  \
0    NVS           0           0           0           0           0   
1     VZ           0           0           0        

## Industry & Sector 

#### Number of Transactions per Industry
_Not controlled in any way._

In [44]:
def number_of_transactions_per_indusry(group):        
    d = {}
    
    for _,t in group.iterrows():
        if ptr_utils.isvalid(t[constants.TICKER]):
            industry = dir_utils.search_mapping(industry_df, t[constants.TICKER], industry=True)
            d = dict_utils.increment_dictionary(d, industry)

    return dict_utils.sort_dictionary_by_values(d)
    

d1 = number_of_transactions_per_indusry(house_input_df)
d2 = number_of_transactions_per_indusry(senate_input_df)
d3 = number_of_transactions_per_indusry(input_df)

dir = dir_utils.makesubdir(constants.path_csv, constants.INDUSTRY)
csv_utils.make_csv_multiple_dicts(dir, "number_of_transactions_per_indusry", (d1,d2,d3), [constants.INDUSTRY, constants.HOUSE, constants.SENATE, constants.INPUT])



def number_of_transactions_per_sector(group):        
    d = {}
    
    for _,t in group.iterrows():
        if ptr_utils.isvalid(t[constants.TICKER]):
            sector = dir_utils.search_mapping(sector_df, t[constants.TICKER], sector=True)
            d = dict_utils.increment_dictionary(d, sector)

    return dict_utils.sort_dictionary_by_values(d)
    

d1 = number_of_transactions_per_sector(house_input_df)
d2 = number_of_transactions_per_sector(senate_input_df)
d3 = number_of_transactions_per_sector(input_df)

dir = dir_utils.makesubdir(constants.path_csv, constants.SECTOR)
csv_utils.make_csv_multiple_dicts(dir, "number_of_transactions_per_sector", (d1,d2,d3), [constants.SECTOR, constants.HOUSE, constants.SENATE, constants.INPUT])

#### Industry Breakdown per Official

In [45]:
def frequency_of_industry_breakdown_official(group, diff):
    d = {}

    for _,t in group.iterrows():
        if ptr_utils.isvalid(t[constants.TICKER]):
            industry = dir_utils.search_mapping(industry_df, t[constants.TICKER], industry=True)
            d = dict_utils.increment_dictionary_in_dictionary(d, official.get_name(t), industry)

    d = dict_utils.sort_dictionary_by_keys(d)
    
    filename = "frequency_of_industry_breakdown_official"
    if diff:
        filename += "_" + diff
    dir = dir_utils.makesubdir(constants.path_csv, constants.INDUSTRY)

    wd = csv_utils.make_csv_breakdown(dir, filename, d, constants.OFFICIAL)
    print(pd.read_csv(wd).head(2))
    
    return d 

d1 = frequency_of_industry_breakdown_official(house_input_df, constants.HOUSE)
d2 = frequency_of_industry_breakdown_official(senate_input_df, constants.SENATE)
d3 = frequency_of_industry_breakdown_official(input_df, constants.INPUT)



def frequency_of_sector_breakdown_official(group, diff):
    d = {}

    for _,t in group.iterrows():
        if ptr_utils.isvalid(t[constants.TICKER]):
            sector = dir_utils.search_mapping(sector_df, t[constants.TICKER], sector=True)
            d = dict_utils.increment_dictionary_in_dictionary(d, official.get_name(t), sector)

    d = dict_utils.sort_dictionary_by_keys(d)
    
    filename = "frequency_of_sector_breakdown_official"
    if diff:
        filename += "_" + diff
    dir = dir_utils.makesubdir(constants.path_csv, constants.SECTOR)

    wd = csv_utils.make_csv_breakdown(dir, filename, d, constants.OFFICIAL)
    print(pd.read_csv(wd).head(2))
    
    return d 

d1 = frequency_of_sector_breakdown_official(house_input_df, constants.HOUSE)
d2 = frequency_of_sector_breakdown_official(senate_input_df, constants.SENATE)
d3 = frequency_of_sector_breakdown_official(input_df, constants.INPUT)

            official  Advertising & Marketing Services  Advertising Agencies  \
0  Allen, Richard W.                                 0                     0   
1      Amash, Justin                                 0                     0   

   Aerospace & Defense  Agricultural Inputs  Agriculture  Airlines  \
0                    1                    0            0         0   
1                    0                    0            0         0   

   Apparel Manufacturing  Apparel Retail  Application Software  ...  Trust  \
0                      0               0                     1  ...      0   
1                      0               0                     0  ...      0   

   Utilities - Independent Power Producers  Utilities - Regulated  \
0                                        0                      3   
1                                        0                      0   

   Utilities - Regulated Water  Utilities—Diversified  \
0                            0                  

#### Frequency of Industry per Year

In [46]:
def frequency_of_industry_breakdown(group, diff):
    d = {}

    for _,t in group.iterrows():     
        if ptr_utils.isvalid(t[constants.TICKER]):
            industry = dir_utils.search_mapping(industry_df, t[constants.TICKER], industry=True)  
            d = dict_utils.increment_dictionary_in_dictionary(d, industry, ptr_utils.get_year(t[constants.TDATE]))

    d = dict_utils.sort_dictionary_by_keys(d)
    
    filename = "frequency_of_industry_breakdown"
    if diff:
        filename += "_" + diff 
    key_header = "industry"
    
    
    dir = dir_utils.makesubdir(constants.path_csv, constants.INDUSTRY)
    wd = csv_utils.make_csv_breakdown(dir, filename, d, key_header)
    print(pd.read_csv(wd).head(2))
    return d 

d1 = frequency_of_industry_breakdown(house_input_df, constants.HOUSE)
d2 = frequency_of_industry_breakdown(senate_input_df, constants.SENATE)
d3 = frequency_of_industry_breakdown(input_df, constants.INPUT)


def frequency_of_sector_breakdown(group, diff):
    d = {}

    for _,t in group.iterrows():     
        if ptr_utils.isvalid(t[constants.TICKER]):
            sector = dir_utils.search_mapping(sector_df, t[constants.TICKER], sector=True)  
            d = dict_utils.increment_dictionary_in_dictionary(d, sector, ptr_utils.get_year(t[constants.TDATE]))

    d = dict_utils.sort_dictionary_by_keys(d)
    
    filename = "frequency_of_sector_breakdown"
    if diff:
        filename += "_" + diff 
    key_header = "sector"
    
    
    dir = dir_utils.makesubdir(constants.path_csv, constants.SECTOR)
    wd = csv_utils.make_csv_breakdown(dir, filename, d, key_header)
    print(pd.read_csv(wd).head(2))
    return d 

d1 = frequency_of_sector_breakdown(house_input_df, constants.HOUSE)
d2 = frequency_of_sector_breakdown(senate_input_df, constants.SENATE)
d3 = frequency_of_sector_breakdown(input_df, constants.INPUT)

                           industry  2018  2019  2020  2021
0  Advertising & Marketing Services     0     3     6     1
1              Advertising Agencies     0     5     5     0
                           industry  2012  2013  2014  2015  2016  2017  2018  \
0  Advertising & Marketing Services     0     0     0     1     0     0     0   
1               Aerospace & Defense     0     0    15    12     8    19    33   

   2019  2020  2021  
0     0     0     0  
1    37    28     4  
                           industry  2012  2013  2014  2015  2016  2017  2018  \
0  Advertising & Marketing Services     0     0     0     1     0     0     0   
1              Advertising Agencies     0     0     0     0     0     0     0   

   2019  2020  2021  
0     3     6     1  
1     5     5     0  
             sector  2018  2019  2020  2021
0  Banks - Regional     0     0     1     0
1   Basic Materials     0    52   177   143
                   sector  2012  2013  2014  2015  2016  2017  2018 

## Asset Description (asset_description) and Comment (comment) 

### Number of Options

In [47]:
def number_of_options(group):
    count = total = 0 
    d = {}
    # [this_person_placed_an_option, ...]
    people = set()

    for _,t in group.iterrows():
        if constants.ASSET_DESCRIPTION in t and ptr_utils.isvalid(t[constants.ASSET_DESCRIPTION]) and ("Option" in t[constants.ASSET_DESCRIPTION] or "option" in t[constants.ASSET_DESCRIPTION]): 
            count += 1 
            people.add(official.get_name(t))
        total += 1 

    d["(No. of Options, %)"] = (count, ptr_utils.make_percent(count, total))

    return d 
            
d1 = number_of_options(house_input_df)
d2 = number_of_options(senate_input_df)
d3 = number_of_options(input_df)

dir = dir_utils.makesubdir(constants.path_csv, constants.COMMENT)
csv_utils.make_csv_multiple_dicts(dir, "number_of_options", (d1,d2,d3), ["", constants.HOUSE, constants.SENATE, constants.INPUT])    

### Number of Scanned PDFs.

In [48]:
def number_of_scanned_pdfs(group):
    count = total = 0 
    d = {}

    for _,t in group.iterrows():
        if t[constants.ASSET_DESCRIPTION] == constants.DISCLOSED:
            count += 1 
        total += 1 
            
    d["(No. of Scanned PDFS, %)"] = (count, ptr_utils.make_percent(count, total))

    return d 
            
d1 = number_of_scanned_pdfs(house_input_df)
d2 = number_of_scanned_pdfs(senate_input_df)
d3 = number_of_scanned_pdfs(input_df)

dir = dir_utils.makesubdir(constants.path_csv, constants.COMMENT)
csv_utils.make_csv_multiple_dicts(dir, "number_of_scanned_pdfs", (d1,d2,d3), ["", constants.HOUSE, constants.SENATE, constants.INPUT])    

## Asset Type (asset_type)

### Frequency of Asset Type

In [49]:
def frequency_of_asset_type(group):
    d = {}

    for _,t in group.iterrows():
        if constants.ATYPE in t and ptr_utils.isvalid(t[constants.ATYPE]):
            d = dict_utils.increment_dictionary(d, t[constants.ATYPE])
      
    return dict_utils.sort_dictionary_by_values(d)

d1 = frequency_of_asset_type(house_input_df)
d2 = frequency_of_asset_type(senate_input_df)
d3 = frequency_of_asset_type(input_df)

dir = dir_utils.makesubdir(constants.path_csv, constants.ATYPE)
csv_utils.make_csv_multiple_dicts(dir, "frequency_of_asset_type", (d1,d2,d3), [constants.ATYPE, constants.HOUSE, constants.SENATE, constants.INPUT])

## Amount 

### Frequency of Amount by Person

In [50]:
def frequency_of_amount_by_persom(group, diff):
    d = {}

    for _,t in group.iterrows():
        d = dict_utils.increment_dictionary_in_dictionary(d, official.get_name(t), t[constants.AMOUNT])
    
    d = dict_utils.sort_dictionary_by_keys(d)
    
    filename = "frequency_of_amount_by_person"
    if diff:
        filename += "_" + diff 
        
    key_header = constants.AMOUNT
    
    dir = dir_utils.makesubdir(constants.path_csv, constants.AMOUNT)
    wd = csv_utils.make_csv_breakdown(dir, filename, d, key_header)
    print(pd.read_csv(wd).head(2))
    
    return d 
   
    
d1 = frequency_of_amount_by_persom(house_input_df, constants.HOUSE)
d2 = frequency_of_amount_by_persom(senate_input_df, constants.SENATE)
# d3 = frequency_of_amount_by_persom(input_df)

              amount  $1,000 - $15,000  $1,000,000 +  $1,000,001 - $5,000,000  \
0  Allen, Richard W.                 0             0                        0   
1      Amash, Justin                 0             0                        0   

   $1,001 -  $1,001 - $15,000  $100,001 - $250,000  $15,000 - $50,000  \
0         0                15                    0                  0   
1         0                 2                    0                  0   

   $15,001 - $50,000  $250,001 - $500,000  $5,000,001 - $25,000,000  \
0                 21                    0                         0   
1                  1                    0                         0   

   $50,000,000 +  $50,001 - $100,000  $500,001 - $1,000,000  
0              0                   2                      0  
1              0                   0                      0  
              amount  $1,000,001 - $5,000,000  $1,001 - $15,000  \
0   Alexander, Lamar                        1                 0   
1 

### Frequency of Transactions by Amount

In [51]:
def frequency_of_amount_total(group):
    d = {}        

    for _,t in group.iterrows():
        amount = t[constants.AMOUNT]
        if amount in constants.AMOUNT_CONSISTENCY:
            amount = constants.AMOUNT_CONSISTENCY[amount]
                    
        d = dict_utils.increment_dictionary(d, amount)

    return d
    
d1 = frequency_of_amount_total(house_input_df)
d2 = frequency_of_amount_total(senate_input_df)
d3 = frequency_of_amount_total(input_df)


dir = dir_utils.makesubdir(constants.path_csv, constants.AMOUNT)
csv_utils.make_csv_multiple_dicts(dir, "frequency_of_amount_total", (d1,d2,d3), [constants.AMOUNT, constants.HOUSE, constants.SENATE, constants.INPUT])

### Frequency of Amount by Gender 

In [52]:
def frequency_of_amount_by_gender(group, diff, normalized=None):
    d = {}
    
    for _,t in group.iterrows():
        name = official.get_name(t)
        link = input_all_officials_name[name]
        d = dict_utils.increment_dictionary_in_dictionary(d, t[constants.AMOUNT], all_officials_gender[link])

    d = dict_utils.add_sort_key_for_amount(d)
    d = dict_utils.sort_dictionary_by_sort_key(d)

        
    filename = "frequency_of_amount_by_gender"
    if diff:
        filename += "_" + diff
    
    dir = dir_utils.makesubdir(constants.path_csv, constants.AMOUNT)
    wd = csv_utils.make_csv_breakdown(dir, filename, d, constants.AMOUNT)
    print(pd.read_csv(wd).head(2))
    return d 
   
    
d1 = frequency_of_amount_by_gender(house_input_df, constants.HOUSE, len(input_house_officials_link))
d2 = frequency_of_amount_by_gender(senate_input_df, constants.SENATE, len(input_senate_officials_link))
d3 = frequency_of_amount_by_gender(input_df, constants.INPUT, len(input_all_officials_link))

             amount  female  male  sort_key
0          $1,001 -       7   235      1001
1  $1,001 - $15,000    2807  6479     15000
             amount  female  male  sort_key
0           Unknown      87   378        -1
1  $1,001 - $15,000     893  4942     15000
     amount  female  male  sort_key
0   Unknown      87   378        -1
1  $1,001 -       7   235      1001


### Frequency of Transactions by Political Affiliation and Amount

In [53]:
def frequency_of_amount_by_aff(group, diff):
    d = {}

    for _,t in group.iterrows():
        obj = t_to_obj(t)
        amount = t[constants.AMOUNT]
        if amount in constants.AMOUNT_CONSISTENCY:
            amount = constants.AMOUNT_CONSISTENCY[t[constants.AMOUNT]]
            
        d = dict_utils.increment_dictionary_in_dictionary(d, amount, obj.party)


    d = dict_utils.add_sort_key_for_amount(d)
    d = dict_utils.sort_dictionary_by_sort_key(d)
    
        
    filename = "frequency_of_amount_by_aff"
    if diff:
        filename += "_" + diff
    
    dir = dir_utils.makesubdir(constants.path_csv, constants.AMOUNT)
    wd = csv_utils.make_csv_breakdown(dir, filename, d, constants.AMOUNT)
    print(pd.read_csv(wd).head(2))
    
    return d 
   
    
d1 = frequency_of_amount_by_aff(house_input_df, constants.HOUSE)
d2 = frequency_of_amount_by_aff(senate_input_df, constants.SENATE)
d3 = frequency_of_amount_by_aff(input_df, constants.INPUT) 

             amount  Democratic  Libertarian  Republican  sort_key
0          $1,001 -         236            0           6      1001
1  $1,001 - $15,000        6833            2        2455     15000
             amount  Democratic  Independent  Republican  sort_key
0           Unknown         189            8         268        -1
1  $1,001 - $15,000        1993           51        3791     15000
     amount  Democratic  Independent  Libertarian  Republican  sort_key
0   Unknown         189            8            0         268        -1
1  $1,001 -         236            0            0           6      1001


### Average For Buys and Sells per Official 

In [54]:
def average_per_person(group, diff):
    d={}
    
    for _,t in group.iterrows():
        if ptr_utils.isvalid(t[constants.AMOUNT]) and (t[constants.TYPE] == constants.PURCHASE or t[constants.TYPE] == constants.SALE):
                        
            mean = ptr_utils.get_gmean(t[constants.AMOUNT])

            # if t[constants.TYPE] == constants.PURCHASE: 
            #     mean = -mean 

            d = dict_utils.increment_list_in_dictionary(d, official.get_name(t), mean)


    d = dict_utils.flatten_gmean(d)        
            
    filename = "average_per_person"
    if diff:
        filename += "_" + diff

    key_header = constants.OFFICIAL 
    value_header = "average_size_of_transactions"

    d = dict_utils.sort_dictionary_by_values(d)
    d = dict_utils.commify(d)

    dir = dir_utils.makesubdir(constants.path_csv, constants.AMOUNT)
    wd = csv_utils.make_csv(dir, filename, d, [key_header, value_header])
    df = pd.read_csv(wd)
    print(df.head(5))

    return d 

d1 = average_per_person(house_input_df, constants.HOUSE)
d2 = average_per_person(senate_input_df, constants.SENATE)
d3 = average_per_person(input_df, constants.INPUT) 

           official average_size_of_transactions
0  Matsui, Doris O.                    6,529,007
1     Pelosi, Nancy                      498,092
2  Peters, Scott H.                      475,089
3       Wagner, Ann                      179,838
4     Moulton, Seth                      169,410
             official average_size_of_transactions
0         Scott, Rick                      488,349
1     Warner, Mark R.                      232,220
2  Bennet, Michael F.                      139,067
3        Hoeven, John                       85,128
4   Crapo, Michael D.                       73,576
           official average_size_of_transactions
0  Matsui, Doris O.                   19,526,288
1     Pelosi, Nancy                      498,092
2       Scott, Rick                      488,349
3  Peters, Scott H.                      435,463
4   Warner, Mark R.                      232,220


### Average Activity

In [55]:
def average_activity(group, diff):
    d={}
    
    for _,t in group.iterrows():
        if ptr_utils.isvalid(t[constants.AMOUNT]):
                        
            mean = ptr_utils.get_gmean(t[constants.AMOUNT])

            d = dict_utils.increment_list_in_dictionary(d, official.get_name(t), mean)


    d = dict_utils.flatten_gmean(d)        
            
    filename = "average_activity"
    if diff:
        filename += "_" + diff

    key_header = constants.OFFICIAL 
    value_header = "average_size_of_transactions"

    d = dict_utils.sort_dictionary_by_values(d)
    d = dict_utils.commify(d)

    dir = dir_utils.makesubdir(constants.path_csv, constants.AMOUNT)
    wd = csv_utils.make_csv(dir, filename, d, [key_header, value_header])
    df = pd.read_csv(wd)
    print(df.head(5))

    return d 

d1 = average_activity(house_input_df, constants.HOUSE)
d2 = average_activity(senate_input_df, constants.SENATE)
d3 = average_activity(input_df, constants.INPUT) 

                official average_size_of_transactions
0          Raskin, Jamie                    2,750,540
1       Matsui, Doris O.                    1,431,627
2          Pelosi, Nancy                      558,256
3  Schneider, Bradley S.                      359,331
4       Peters, Scott H.                      342,258
             official average_size_of_transactions
0        Johnson, Ron                    1,526,383
1         Scott, Rick                      487,942
2  Marshall, Roger W.                      367,879
3  Bennet, Michael F.                      194,779
4     Warner, Mark R.                      134,027
           official average_size_of_transactions
0     Raskin, Jamie                   73,575,888
1  Matsui, Doris O.                    2,369,337
2      Johnson, Ron                    1,526,383
3     Pelosi, Nancy                      558,256
4       Scott, Rick                      487,942


## Types (type)

### Types of Actions Total

In [56]:
def frequency_of_act(group):
    d = {}

    for _,t in group.iterrows():
        if ptr_utils.isvalid(t[constants.TYPE]): 
            d = dict_utils.increment_dictionary(d, t[constants.TYPE])
    
    return dict_utils.sort_dictionary_by_values(d)
    
     
d1 = frequency_of_act(house_input_df)
d2 = frequency_of_act(senate_input_df)
d3 = frequency_of_act(input_df)

dir = dir_utils.makesubdir(constants.path_csv, constants.TYPE)
csv_utils.make_csv_multiple_dicts(dir, "frequency_of_act", (d1,d2,d3), [constants.TYPE, constants.HOUSE, constants.SENATE, constants.INPUT])

### Types of Transactions per Person

In [63]:
def types_of_transactions_per_person(group, diff, normalized=None):
    d={}
    
    for _,t in group.iterrows():
        d = dict_utils.increment_dictionary_in_dictionary(d, official.get_name(t), t[constants.TYPE])
        d = dict_utils.increment_dictionary_in_dictionary(d, official.get_name(t), constants.TOTAL)


    if normalized: 
        for k,v in d.items():
            newinner = {}
            for ik, iv in v.items():
                link = input_all_officials_name[k]
                _, obj = input_officials_objects[link]
                newinner[ik] = round(iv/obj.get_seniority(), 2)
                
            d[k] = newinner
    
    filename = "types_of_transactions_per_person"
    if diff:
        filename += "_" + diff
    if normalized:
        filename += "_normalized"

    d = dict_utils.sort_dictionary_by_keys(d)

    dir = dir_utils.makesubdir(constants.path_csv, constants.TYPE)
    wd = csv_utils.make_csv_breakdown(dir, filename, d, constants.OFFICIAL)
    df = pd.read_csv(wd)
    print(df.head(5))

    return d 

d1 = types_of_transactions_per_person(house_input_df, constants.HOUSE)
d2 = types_of_transactions_per_person(senate_input_df, constants.SENATE)
d3 = types_of_transactions_per_person(input_df, constants.INPUT)

d1 = types_of_transactions_per_person(house_input_df, constants.HOUSE, normalized=True)
d2 = types_of_transactions_per_person(senate_input_df, constants.SENATE, normalized=True)
d3 = types_of_transactions_per_person(input_df, constants.INPUT, normalized=True)

              official  Exchange  Purchase  Sale (Full)  Sale (Partial)  total
0    Allen, Richard W.         0        23           14               1     38
1        Amash, Justin         0         3            0               0      3
2  Arenholz, Ashley H.         0         4           10               0     14
3        Axne, Cynthia         5       113           43              13    174
4      Banks, James E.         0        28           16               8     52
              official  Exchange  Purchase  Sale (Full)  Sale (Partial)  total
0     Alexander, Lamar         0        36           21               2     59
1    Barrasso, John A.         0         0            1               0      1
2   Bennet, Michael F.         1         5            1               1      8
3  Blumenthal, Richard         0         0            0               0     50
4        Blunt, Roy D.         0         1            7               0     10
              official  Exchange  Purchase  Sale (Fu

## Frequency

### Number of Transactions per Year

In [60]:
def num_of_trans_per_year(group, normalized=None):
    d={}
    
    for _,t in group.iterrows():
        d =  dict_utils.increment_dictionary(d, ptr_utils.get_year(t[constants.TDATE]))

    if normalized:
        d2 = {}
        for k,v in d.items():
            d2[k] = v/normalized
            
        
    return dict_utils.sort_dictionary_by_values(d), dict_utils.sort_dictionary_by_values(d2)

d1, d4 = num_of_trans_per_year(house_input_df, len(input_house_officials_objects))
d2, d5 = num_of_trans_per_year(senate_input_df, len(input_senate_officials_objects))
d3, d6 = num_of_trans_per_year(input_df, len(input_officials_objects))

dir = dir_utils.makesubdir(constants.path_csv, constants.FREQ)
csv_utils.make_csv_multiple_dicts(dir, "num_of_trans_per_year", (d1,d2,d3), ["year", constants.HOUSE, constants.SENATE, constants.INPUT])
csv_utils.make_csv_multiple_dicts(dir, "num_of_trans_per_year_normalized", (d4,d5,d6), ["year", constants.HOUSE, constants.SENATE, constants.INPUT])

### Number of Transactions per Person 
_Not controlling for number of years in position or size of transaction._ For each official, we want their total number of transactions.  {'Sam': 5, 'Alex': 2424, ...}

In [61]:
def num_of_trans_per_person(group):
    d={}
    
    for _,t in group.iterrows():
        d = dict_utils.increment_dictionary(d, official.get_name(t))

    return dict_utils.sort_dictionary_by_values(d)
    
d1 = num_of_trans_per_person(house_input_df)
d2 = num_of_trans_per_person(senate_input_df)    
d3 = num_of_trans_per_person(input_df)

dir = dir_utils.makesubdir(constants.path_csv, constants.PROFILE)
csv_utils.make_csv_multiple_dicts(dir, "num_of_trans_per_person", (d1,d2,d3), [constants.OFFICIAL, constants.HOUSE, constants.SENATE, constants.INPUT])

### Number of Transactions per Person Controlled
_Divide number of transactions by number of years in official position.  Not controlling for size of transaction._

In [62]:
def num_of_trans_per_person_controlled(group):    
    d={}
    
    for _,t in group.iterrows():
        d = dict_utils.increment_dictionary(d, official.get_name(t))
        
    for k,v in d.items():
        obj = t_to_obj(t)
        d[k] = int(v/obj.get_seniority()) if int(v/obj.get_seniority()) != 0 else 1 

    return dict_utils.sort_dictionary_by_values(d)


d1 = num_of_trans_per_person_controlled(house_input_df)
d2 = num_of_trans_per_person_controlled(senate_input_df)
d3 = num_of_trans_per_person_controlled(input_df)

dir = dir_utils.makesubdir(constants.path_csv, constants.FREQ)

csv_utils.make_csv_multiple_dicts(dir, "num_of_trans_per_person_controlled", (d1, d2, d3), [constants.OFFICIAL, constants.HOUSE, constants.SENATE, constants.INPUT])