In [1]:
import utils.csv_utils as csv_utils 
import utils.dir_utils as dir_utils
import utils.dict_utils as dict_utils 
import utils.ptr_utils as ptr_utils
import utils.constants as constants 
import helpers.official as official
import helpers.search as search
import helpers.congress as congress
import pandas as pd 
from scipy.stats.mstats import gmean

In [2]:
# Load the combined, House-only and Senate-only transaction tables. get_data
# returns a pair; only the second element is used in this cell.
_, input_df = dir_utils.get_data(combined=True)
_, house_input_df = dir_utils.get_data(house=True)
_, senate_input_df = dir_utils.get_data(senate=True)

# Row counts of each table (one row per transaction).
num_of_transactions = input_df.shape[0]
num_of_house_transactions = house_input_df.shape[0]
num_of_senate_transactions = senate_input_df.shape[0]

# Ticker lookup tables, later passed to dir_utils.search_mapping.
sector_df = dir_utils.get_mapping(sector=True)
industry_df = dir_utils.get_mapping(industry=True)

# {canonical_name_input_based : link, ...}
input_all_officials_name = {}

# {link : canonical_name_input_based, ....}
input_all_officials_link = {}
input_house_officials_link = {}
input_senate_officials_link = {}

# (canonical_name_input_based, ...)
# Names already processed, so each official is looked up only once.
names = set()

for _,t in input_df.iterrows():        
    name = official.get_name(t)
        
    # Only the first transaction seen for a name triggers the wiki lookup and
    # the chamber classification below.
    if name not in names:    
        link = search.get_wiki_link(name)
        
        # NOTE(review): increment_dictionary(..., not_math=True) presumably stores
        # the value under the key instead of adding to it — confirm in dict_utils.
        if ptr_utils.isvalid(t[constants.REPRESENTATIVE]) and link not in input_house_officials_link:
            input_house_officials_link = dict_utils.increment_dictionary(input_house_officials_link, link, name, not_math=True)
        if ptr_utils.isvalid(t[constants.SENATOR]) and link not in input_senate_officials_link:
            input_senate_officials_link = dict_utils.increment_dictionary(input_senate_officials_link, link, name, not_math=True)
        
        # Maintain both directions of the name <-> link mapping.
        input_all_officials_link = dict_utils.increment_dictionary(input_all_officials_link, link, name, not_math=True)
        input_all_officials_name = dict_utils.increment_dictionary(input_all_officials_name, name, link, not_math=True)

        names.add(name)

# Summary of transaction volume, overall and per chamber.
print("Number of transactions: {} \n".format(ptr_utils.commify_str(len(input_df.index))))

# The "controlled" figures divide by the number of distinct officials of the
# chamber found in the input; they raise ZeroDivisionError if that number is 0.
print("Number of transactions by House Representatives: {}, {}".format(ptr_utils.commify_str(num_of_house_transactions), ptr_utils.make_percent(num_of_house_transactions, len(input_df.index))))
print("Number of transactions by House Representatives controlled: {0:.2f} transactions per representative \n".format((num_of_house_transactions / len(input_house_officials_link))))

print("Number of transactions by Senators: {}, {}".format(ptr_utils.commify_str(num_of_senate_transactions), ptr_utils.make_percent(num_of_senate_transactions, len(input_df.index))))
print("Number of transactions by Senators controlled: {0:.2f} transactions per senator \n".format( (num_of_senate_transactions /  len(input_senate_officials_link))))

# {link : (canonical_name_input_based, official_object), ... }
# One search.wiki_search call per House official found in the input.
input_house_officials_objects = {}
for link, person in input_house_officials_link.items(): 
    off = search.wiki_search(person)        
    input_house_officials_objects[link] = (person, off)
        
# {link : (canonical_name_input_based, official_object), ... }
# Same lookup for the Senate officials found in the input.
input_senate_officials_objects = {}
for link, person in input_senate_officials_link.items():
    off = search.wiki_search(person)        
    input_senate_officials_objects[link] = (person, off)

# {link : (canonical_name_input_based, official_object) ... }
# Union of both chambers; for a link present in both, the Senate entry wins
# because it is unpacked last.
input_officials_objects = {**input_house_officials_objects, **input_senate_officials_objects}

# {link : canonical_name_wiki_based, ... }
all_officials = congress.get_all_officials()
house_officials = congress.get_house_officials()
senate_officials = congress.get_senate_officials()

# {'California' :  #_of_representatives_from_112_to_117, ...}
# NOTE(review): the comment above describes per-state counts, but the call is
# get_officials_party — confirm this helper really returns state counts, since
# profile_state indexes the result by state.
all_officials_state_count = congress.get_officials_party(everyone=all_officials.values())

# One congress object for each congress number from 112 to 117 inclusive.
congress_objects = []
for i in range(112, 118):
    c = search.get_congress(i)
    congress_objects.append(c)

# {link : canonical_name_wiki_based, ... }
# Officials known to `all_officials` whose link never appears in the input.
# Built by filtering rather than `del`-ing every input link from a copy: `del`
# raises KeyError as soon as one input link is missing from `all_officials`,
# which aborts the cell before any of the summary lines below are printed.
all_officials_not_in_input = {
    link: name
    for link, name in all_officials.items()
    if link not in input_all_officials_link
}

print("Number of officials in input: {}".format(len(input_all_officials_link)))
print("Number of officials in input controlled: {}\n".format(ptr_utils.make_percent(len(input_all_officials_link), len(all_officials))))

print("Number of representatives in input: {}, {}".format(len(input_house_officials_link), ptr_utils.make_percent(len(input_house_officials_link), len(input_all_officials_link))))
print("Number of representatives in input controlled: {} \n".format(ptr_utils.make_percent(len(input_house_officials_link), len(house_officials))))

print("Number of senators in input: {}, {}".format(len(input_senate_officials_link), ptr_utils.make_percent(len(input_senate_officials_link), len(input_all_officials_link))))
print("Number of senators in input controlled: {} \n".format(ptr_utils.make_percent(len(input_senate_officials_link), len(senate_officials))))

print("Number of officials in total (from 112-117th congress): {}".format(ptr_utils.commify_str(len(all_officials))))
print("Number of representatives in total (from 112-117th congress): {}".format(ptr_utils.commify_str(len(house_officials))))
print("Number of senators in total (from 112-117th congress): {}".format(ptr_utils.commify_str(len(senate_officials))))

print("Number of officials from 112-117th congress who did NOT engage in the market: {}, {} \n".format(len(all_officials_not_in_input), ptr_utils.make_percent(len(all_officials_not_in_input), len(all_officials) ) ))

Number of transactions: 22,115 

Number of transactions by House Representatives: 13,074, 59.12%
Number of transactions by House Representatives controlled: 83.27 transactions per representative 

Number of transactions by Senators: 9,041, 40.88%
Number of transactions by Senators controlled: 143.51 transactions per senator 



In [None]:
def validate_dates():
    """Print every transaction whose dates fail ptr_utils.validate_date.

    Each row of the module-level ``input_df`` is matched to its official
    object through ``input_all_officials_name`` and
    ``input_officials_objects``; the transaction date and then the
    disclosure date are checked against that object.
    """
    for _, row in input_df.iterrows():
        who = official.get_name(row)

        wiki_link = input_all_officials_name[who]
        _name, official_obj = input_officials_objects[wiki_link]

        tdate_ok = ptr_utils.validate_date(official_obj, row[constants.TDATE])
        if not tdate_ok:
            print(who, row[constants.TDATE], row[constants.TDATE])

        ddate_ok = ptr_utils.validate_date(official_obj, row[constants.DDATE], ddate=True)
        if not ddate_ok:
            print(who, row[constants.DDATE], row[constants.DDATE])


validate_dates()

# Main

## Profile 

### Age (Lowest, Highest, Average)

In [None]:
def profile_age(group, type):
    """Return the youngest, average (rounded) and oldest age within ``group``.

    group: {link : (canonical_name_input_based, official_object), ...}; every
        official_object must provide ``get_age()``.
    type: label of the group; must be truthy.

    Raises AssertionError for a falsy ``type`` and IndexError for an empty
    ``group``.
    """
    assert type

    ages = sorted(member.get_age() for _name, member in group.values())

    return {
        "Youngest": ages[0],
        "Average": round(sum(ages) / len(ages)),
        "Oldest": ages[-1],
    }
    
# Age summaries for House officials, Senate officials and everyone in the input.
d1 = profile_age(input_house_officials_objects, constants.HOUSE)
d2 = profile_age(input_senate_officials_objects, constants.SENATE)
d3 = profile_age(input_officials_objects, constants.INPUT)

# dicts do not support `+` (it raises TypeError), so the summaries are passed
# as a list, one per column header.
# NOTE(review): assumes make_csv_multiple_dicts takes a sequence of dicts
# parallel to the header list — confirm against csv_utils.
csv_utils.make_csv_multiple_dicts(constants.PROFILE, "profile_age", [d1, d2, d3], [constants.HOUSE, constants.SENATE, constants.INPUT])

### Oldest and Most Recent Dates (transaction and disclosure)

In [None]:
def profile_dates(group, type):
    """Print the oldest and most recent transaction and disclosure dates.

    Scans every row of ``group`` once and remembers, for each of the four
    extremes, both the date and the row it came from, so the PTR link of
    that row can be printed next to the date.

    type: label of the group used in the printed lines; must be truthy.
    """
    assert type

    first_t = last_t = first_d = last_d = None
    first_t_row = last_t_row = first_d_row = last_d_row = None

    for _, row in group.iterrows():
        when = row[constants.TDATE]
        if not first_t or when < first_t:
            first_t, first_t_row = when, row
        if not last_t or when > last_t:
            last_t, last_t_row = when, row

        when = row[constants.DDATE]
        if not first_d or when < first_d:
            first_d, first_d_row = when, row
        if not last_d or when > last_d:
            last_d, last_d_row = when, row

    print(f"Oldest transaction_date for {type}: {first_t} \n {first_t_row[constants.PTR_LINK]}")
    print(f"Most recent transaction_date for {type}: {last_t} \n {last_t_row[constants.PTR_LINK]} ")

    print(f"Oldest disclosure_date for {type}: {first_d} \n {first_d_row[constants.PTR_LINK]}")
    print(f"Most recent disclosure_date for {type}: {last_d} \n {last_d_row[constants.PTR_LINK]}\n")


# Report the date range per chamber and for the whole input.
profile_dates(house_input_df, constants.HOUSE)
profile_dates(senate_input_df, constants.SENATE)
profile_dates(input_df, constants.INPUT)

### Gender

In [None]:
def get_gender(group, type):
    """Count the officials of ``group`` per gender.

    group: {link : canonical_name, ...}.
    type: label of the group; not used by the computation.

    Names whose gender comes back as 'unknown' are printed so they can be
    checked by hand. Returns {gender : #_of_officials, ...} as ordered by
    dict_utils.sort_dictionary_by_values.
    """
    # {'Female' : set(Officials), 'Male' : set(Officials), ...}
    members_by_gender = {}

    for wiki_link, person in group.items():
        found = official.get_gender(person, link=wiki_link)
        if found == 'unknown':
            print(person)
        members_by_gender = dict_utils.increment_set_in_dictionary(members_by_gender, found, person)

    # {'Female' : #_of_officials, 'Male' : #_of_officials, ...}
    counts = dict_utils.flatten_len(members_by_gender, inner_set=True)
    return dict_utils.sort_dictionary_by_values(counts)

# {link : canonical_name_input_based, ....}
d1 = get_gender(input_house_officials_link, constants.HOUSE)
d2 = get_gender(input_senate_officials_link, constants.SENATE)
d3 = get_gender(input_all_officials_link, constants.INPUT)

# {link : canonical_name_wiki_based, ... }
d4 = get_gender(all_officials, constants.TOTAL)

# dicts do not support `+` (it raises TypeError), so the four summaries are
# passed as a list, one per column header.
# NOTE(review): assumes make_csv_multiple_dicts takes a sequence of dicts
# parallel to the header list — confirm against csv_utils.
csv_utils.make_csv_multiple_dicts(constants.PROFILE, "profile_gender", [d1, d2, d3, d4], [constants.HOUSE, constants.SENATE, constants.INPUT, constants.TOTAL])

### Party

In [None]:
def profile_party(group, type):
    """Count the officials of ``group`` per party.

    group: {link : (canonical_name_input_based, official_object), ...}; the
        ``party`` and ``name`` attributes of each official_object are read.
    type: label of the group; must be truthy.

    Returns {party : #_of_officials, ...} as ordered by
    dict_utils.sort_dictionary_by_values.
    """
    assert type

    # {'Republican' : set(Officials), 'Democrat' : set(Officials), ...}
    members_by_party = {}
    for _name, member in group.values():
        members_by_party = dict_utils.increment_set_in_dictionary(members_by_party, member.party, member.name)

    # {'Republican' : #_of_officials, 'Democrat' : #_of_officials, ...}
    counts = dict_utils.flatten_len(members_by_party, inner_set=True)
    return dict_utils.sort_dictionary_by_values(counts)


# Party breakdown per chamber; both inputs are
# {link : (canonical_name_input_based, official_object), ... }
d1 = profile_party(input_house_officials_objects, constants.HOUSE)
d2 = profile_party(input_senate_officials_objects, constants.SENATE)

# Party breakdown over every official found in the input.
d3 = profile_party(input_officials_objects, constants.INPUT)


def profile_party_total():
    """Add up the per-party seat counts of every object in ``congress_objects``.

    For each congress the Senate counts are accumulated first, then the
    House counts. Returns {party : total, ...} as ordered by
    dict_utils.sort_dictionary_by_values.
    """
    # {'Republican' : #_of_officials, 'Democrat' : #_of_officials, ...}
    totals = {}
    for session in congress_objects:
        for chamber_counts in (session.get_senate_party, session.get_house_party):
            for party, seats in chamber_counts().items():
                totals = dict_utils.increment_dictionary(totals, party, seats)

    return dict_utils.sort_dictionary_by_values(totals)

# Party totals accumulated over all objects in congress_objects.
d4 = profile_party_total()

# dicts do not support `+` (it raises TypeError), so the four summaries are
# passed as a list, one per column header.
# NOTE(review): assumes make_csv_multiple_dicts takes a sequence of dicts
# parallel to the header list — confirm against csv_utils.
csv_utils.make_csv_multiple_dicts(constants.PROFILE, "profile_party", [d1, d2, d3, d4], [constants.HOUSE, constants.SENATE, constants.INPUT, constants.TOTAL])

### State

In [None]:
def profile_state(group, type):
    """Count the officials of ``group`` per state, with a controlled share.

    group: {link : (canonical_name_input_based, official_object), ...}; the
        ``state`` and ``name`` attributes of each official_object are read.
    type: label of the group; must be truthy.

    Returns {state : {"number_of_officials": n, "controlled": ratio}, ...}
    where ``ratio`` is ``n`` divided by the module-level
    ``all_officials_state_count[state]``. States follow the order produced by
    dict_utils.sort_dictionary_by_values on the plain counts.

    Raises KeyError if a state is missing from ``all_officials_state_count``.
    """
    assert type

    # {'Maryland' : set(Officials), 'California' : set(Officials), ...}
    members_by_state = {}
    for _name, off_obj in group.values():
        members_by_state = dict_utils.increment_set_in_dictionary(members_by_state, off_obj.state, off_obj.name)

    # {'Maryland' : #_of_officials, 'California' : #_of_officials, ...}
    counts = dict_utils.flatten_len(members_by_state, inner_set=True)

    # Sort while the values are still plain numbers, as profile_party does.
    counts = dict_utils.sort_dictionary_by_values(counts)

    # The previous code assigned d[state]["controlled"] on the integer count
    # itself, which raises TypeError; build the nested record explicitly.
    return {
        state: {
            "number_of_officials": num_of_officials,
            "controlled": num_of_officials / all_officials_state_count[state],
        }
        for state, num_of_officials in counts.items()
    }

# {link : (canonical_name_input_based, official_object), ... }
d1 = profile_state(input_house_officials_objects, constants.HOUSE)
d2 = profile_state(input_senate_officials_objects, constants.SENATE)

# {link : (canonical_name_input_based, official_object) ... }
d3 = profile_state(input_officials_objects, constants.INPUT)

# dicts do not support `+` (it raises TypeError), so the summaries are passed
# as a list, one per column header.
# NOTE(review): assumes make_csv_multiple_dicts takes a sequence of dicts
# parallel to the header list — confirm against csv_utils.
csv_utils.make_csv_multiple_dicts(constants.PROFILE, "profile_state", [d1, d2, d3], [constants.HOUSE, constants.SENATE, constants.INPUT])

### Seniority (Lowest, Highest, Average)

In [None]:
def profile_seniority(group, type):
    """Return the lowest, average (rounded) and highest seniority in ``group``.

    group: {link : (canonical_name_input_based, official_object), ...}; every
        official_object must provide ``get_seniority()``.
    type: label of the group; must be truthy.

    The keys are "Lowest", "Average" and "Highest". They were previously
    "Youngest"/"Oldest", copied from the age profile, which mislabelled the
    exported rows.

    Raises AssertionError for a falsy ``type`` and IndexError for an empty
    ``group``.
    """
    assert type

    years = sorted(off_obj.get_seniority() for _name, off_obj in group.values())

    return {
        "Lowest": years[0],
        "Average": round(sum(years) / len(years)),
        "Highest": years[-1],
    }
    
# Seniority summaries for House officials, Senate officials and the whole input.
d1 = profile_seniority(input_house_officials_objects, constants.HOUSE)
d2 = profile_seniority(input_senate_officials_objects, constants.SENATE)
d3 = profile_seniority(input_officials_objects, constants.INPUT)

# dicts do not support `+` (it raises TypeError), so the summaries are passed
# as a list, one per column header.
# NOTE(review): assumes make_csv_multiple_dicts takes a sequence of dicts
# parallel to the header list — confirm against csv_utils.
csv_utils.make_csv_multiple_dicts(constants.PROFILE, "profile_seniority", [d1, d2, d3], [constants.HOUSE, constants.SENATE, constants.INPUT])

### Congress (Lowest, Highest)

In [None]:
def profile_congress(group, type):
    """Return the lowest and highest congress number served by ``group``.

    group: {link : (canonical_name_input_based, official_object), ...}; every
        official_object must provide ``get_congress()``, whose first element
        is compared for the minimum and whose last element is compared for
        the maximum.
    type: label of the group; must be truthy.

    Both values stay None when ``group`` is empty.
    """
    assert type

    earliest = latest = None

    for _name, member in group.values():
        served = member.get_congress()

        first = served[0]
        if not earliest or first < earliest:
            earliest = first

        last = served[-1]
        if not latest or last > latest:
            latest = last

    return {"Lowest Congress": earliest, "Highest Congress": latest}
                        
# Congress range for House officials, Senate officials and the whole input.
d1 = profile_congress(input_house_officials_objects, constants.HOUSE)
d2 = profile_congress(input_senate_officials_objects, constants.SENATE)
d3 = profile_congress(input_officials_objects, constants.INPUT)

# dicts do not support `+` (it raises TypeError), so the summaries are passed
# as a list, one per column header.
# NOTE(review): assumes make_csv_multiple_dicts takes a sequence of dicts
# parallel to the header list — confirm against csv_utils.
csv_utils.make_csv_multiple_dicts(constants.PROFILE, "profile_congress", [d1, d2, d3], [constants.HOUSE, constants.SENATE, constants.INPUT])

### Number of Degrees (Lowest, Highest, Average)

In [None]:
def profile_degrees(group, type):
    """Return the lowest, average (rounded) and highest number of degrees.

    group: {link : (canonical_name_input_based, official_object), ...}; every
        official_object must provide ``get_num_of_degrees()``.
    type: label of the group; must be truthy.

    The keys are "Lowest", "Average" and "Highest". They were previously
    "Youngest"/"Oldest", copied from the age profile, which mislabelled the
    exported rows.

    Raises AssertionError for a falsy ``type`` and IndexError for an empty
    ``group``.
    """
    assert type

    degree_counts = sorted(off_obj.get_num_of_degrees() for _name, off_obj in group.values())

    return {
        "Lowest": degree_counts[0],
        "Average": round(sum(degree_counts) / len(degree_counts)),
        "Highest": degree_counts[-1],
    }
    
# Degree summaries for House officials, Senate officials and the whole input.
d1 = profile_degrees(input_house_officials_objects, constants.HOUSE)
d2 = profile_degrees(input_senate_officials_objects, constants.SENATE)
d3 = profile_degrees(input_officials_objects, constants.INPUT)

# dicts do not support `+` (it raises TypeError), so the summaries are passed
# as a list, one per column header.
# NOTE(review): assumes make_csv_multiple_dicts takes a sequence of dicts
# parallel to the header list — confirm against csv_utils.
csv_utils.make_csv_multiple_dicts(constants.PROFILE, "profile_degrees", [d1, d2, d3], [constants.HOUSE, constants.SENATE, constants.INPUT])

### JD

In [None]:
def profile_JD(group, type):
    """Return the share of officials in ``group`` for whom ``has_JD()`` is truthy.

    group: {link : (canonical_name_input_based, official_object), ...}.
    type: label of the group; not used by the computation.

    Returns {"Percent": ptr_utils.make_percent(with_jd, total)}.
    """
    members = [member for _name, member in group.values()]
    with_jd = sum(1 for member in members if member.has_JD())

    return {"Percent": ptr_utils.make_percent(with_jd, len(members))}
    
# JD share for House officials, Senate officials and the whole input.
d1 = profile_JD(input_house_officials_objects, constants.HOUSE)
d2 = profile_JD(input_senate_officials_objects, constants.SENATE)
d3 = profile_JD(input_officials_objects, constants.INPUT)

# dicts do not support `+` (it raises TypeError), so the summaries are passed
# as a list, one per column header.
# NOTE(review): assumes make_csv_multiple_dicts takes a sequence of dicts
# parallel to the header list — confirm against csv_utils.
csv_utils.make_csv_multiple_dicts(constants.PROFILE, "profile_JDs", [d1, d2, d3], [constants.HOUSE, constants.SENATE, constants.INPUT])

## Transaction Date (transaction_date)

### The most popular transaction_date for each sector.

In [None]:
def transaction_date_wrt_sector(group, type):
    """Export the most popular transaction_date of each sector.

    Rows of ``group`` with an invalid ticker are skipped; the others are
    counted per (sector, transaction_date). The flattened, sorted result is
    written to a CSV in the transaction-date subdirectory, the first five
    rows of that CSV are printed, and the dictionary is returned.

    type: non-empty label appended to the CSV file name.
    """
    assert type

    # {'sector' : {'date' : #_of_transactions, ....} , 'sector2' : .... }
    per_sector = {}
    for _, row in group.iterrows():
        ticker = row[constants.TICKER]
        if not ptr_utils.isvalid(ticker):
            continue
        sector = dir_utils.search_mapping(sector_df, ticker, sector=True)
        per_sector = dict_utils.increment_dictionary_in_dictionary(per_sector, sector, row[constants.TDATE])

    # {'sector' : {'best_date' : #_of_transactions}, .... }
    best = dict_utils.sort_dictionary_by_inner_values(dict_utils.flatten(per_sector), reverse=True)

    filename = "most_popular_td_fe_sector_" + type
    headers = [constants.SECTOR, constants.TDATE, constants.NUMT]

    out_dir = dir_utils.makesubdir(constants.path_csv, constants.TDATE)
    csv_path = csv_utils.make_csv(out_dir, filename, best, headers)
    print(pd.read_csv(csv_path).head(5))
    return best


_ = transaction_date_wrt_sector(house_input_df, constants.HOUSE)
_ = transaction_date_wrt_sector(senate_input_df, constants.SENATE)
_ = transaction_date_wrt_sector(input_df, constants.INPUT)


### The most popular transaction_date for each sector controlling for each official. 

In [None]:
def transaction_date_wrt_sector_controlled(group, type):
    """Export the most popular transaction_date of each sector, per official.

    Same as the uncontrolled variant, except that each (sector, date) pair is
    weighted by the number of distinct officials who traded on it rather than
    by the number of transactions.

    group: transaction table iterated with ``iterrows()``.
    type: non-empty label appended to the CSV file name.

    Fixes relative to the previous version: the roles of ``group`` and
    ``type`` were swapped (``assert group`` / ``type.iterrows()``), the
    increment call had an unbalanced parenthesis, and the official was looked
    up through the undefined name ``title``. It now uses
    ``official.get_name(t)`` like transaction_date_wrt_ticker_controlled.
    """
    assert type

    # {'sector' : {'date' : set(people_who_traded_on_that_day), ...}, ...}
    d_prime = {}

    for _, t in group.iterrows():
        if ptr_utils.isvalid(t[constants.TICKER]):
            sector = dir_utils.search_mapping(sector_df, t[constants.TICKER], sector=True)
            d_prime = dict_utils.increment_set_in_dictionary(d_prime, sector, t[constants.TDATE], official.get_name(t))

    # Collapse every set of officials into its size so flatten sees numbers.
    for sector in d_prime:
        for date in d_prime[sector]:
            d_prime[sector][date] = len(d_prime[sector][date])

    # {'sector' : {'best_date' : #_of_officials}, .... }
    d = dict_utils.flatten(d_prime)
    d = dict_utils.sort_dictionary_by_inner_values(d, reverse=True)

    filename = "most_popular_td_fe_sector_controlled_" + type
    key_header = constants.SECTOR
    value_header = constants.TDATE
    value_header2 = constants.NUMT

    dir = dir_utils.makesubdir(constants.path_csv, constants.TDATE)
    wd = csv_utils.make_csv(dir, filename, d, [key_header, value_header, value_header2])
    df = pd.read_csv(wd)
    print(df.head(5))
    return d


_ = transaction_date_wrt_sector_controlled(house_input_df, constants.HOUSE)
_ = transaction_date_wrt_sector_controlled(senate_input_df, constants.SENATE)
_ = transaction_date_wrt_sector_controlled(input_df, constants.INPUT)


### The most popular transaction_date for each industry.

In [None]:
def transaction_date_wrt_industry(group, type):
    """Export the most popular transaction_date of each industry.

    Rows of ``group`` with an invalid ticker are skipped; the others are
    counted per (industry, transaction_date). The flattened, sorted result
    is written to a CSV in the transaction-date subdirectory, the first five
    rows of that CSV are printed, and the dictionary is returned.

    type: non-empty label appended to the CSV file name.
    """
    assert type

    # {'industry' : {'date' : #_of_transactions, ....} , 'industry2' : .... }
    per_industry = {}
    for _, row in group.iterrows():
        ticker = row[constants.TICKER]
        if not ptr_utils.isvalid(ticker):
            continue
        industry = dir_utils.search_mapping(industry_df, ticker, industry=True)
        per_industry = dict_utils.increment_dictionary_in_dictionary(per_industry, industry, row[constants.TDATE])

    # {'industry' : {'best_date' : #_of_transactions}, .... }
    best = dict_utils.sort_dictionary_by_inner_values(dict_utils.flatten(per_industry), reverse=True)

    filename = "most_popular_td_fe_industry_" + type
    headers = [constants.INDUSTRY, constants.TDATE, constants.NUMT]

    out_dir = dir_utils.makesubdir(constants.path_csv, constants.TDATE)
    csv_path = csv_utils.make_csv(out_dir, filename, best, headers)
    print(pd.read_csv(csv_path).head(5))
    return best


_ = transaction_date_wrt_industry(house_input_df, constants.HOUSE)
_ = transaction_date_wrt_industry(senate_input_df, constants.SENATE)
_ = transaction_date_wrt_industry(input_df, constants.INPUT)


### The most popular transaction_date for each industry controlling for official. 

In [None]:
def transaction_date_wrt_industry_controlled(group, type):
    """Export the most popular transaction_date of each industry, per official.

    Each (industry, date) pair is weighted by the number of distinct
    officials who traded on it rather than by the number of transactions.

    group: transaction table iterated with ``iterrows()``.
    type: non-empty label appended to the CSV file name.

    Fixes relative to the previous version: the increment call had an
    unbalanced parenthesis (a SyntaxError) and looked the official up through
    the undefined name ``title``. It now uses ``official.get_name(t)`` like
    transaction_date_wrt_ticker_controlled.
    """
    assert type

    # {'industry' : {'date' : set(people_who_traded_on_that_day), ...}, ...}
    d_prime = {}

    for _, t in group.iterrows():
        if ptr_utils.isvalid(t[constants.TICKER]):
            industry = dir_utils.search_mapping(industry_df, t[constants.TICKER], industry=True)
            d_prime = dict_utils.increment_set_in_dictionary(d_prime, industry, t[constants.TDATE], official.get_name(t))

    # Collapse every set of officials into its size so flatten sees numbers.
    for industry in d_prime:
        for date in d_prime[industry]:
            d_prime[industry][date] = len(d_prime[industry][date])

    # {'industry' : {'best_date' : #_of_officials}, .... }
    d = dict_utils.flatten(d_prime)
    d = dict_utils.sort_dictionary_by_inner_values(d, reverse=True)

    filename = "most_popular_td_fe_industry_controlled_" + type
    key_header = constants.INDUSTRY
    value_header = constants.TDATE
    value_header2 = constants.NUMT

    dir = dir_utils.makesubdir(constants.path_csv, constants.TDATE)
    wd = csv_utils.make_csv(dir, filename, d, [key_header, value_header, value_header2])
    df = pd.read_csv(wd)
    print(df.head(5))
    return d


_ = transaction_date_wrt_industry_controlled(house_input_df, constants.HOUSE)
_ = transaction_date_wrt_industry_controlled(senate_input_df, constants.SENATE)
_ = transaction_date_wrt_industry_controlled(input_df, constants.INPUT)


### The most popular transaction_date for each ticker. 

In [None]:
def transaction_date_wrt_ticker(group, type):
    """Export the most popular transaction_date of each ticker.

    Rows of ``group`` with an invalid ticker are skipped; the others are
    counted per (ticker, transaction_date). The flattened, sorted result is
    written to a CSV in the transaction-date subdirectory, the first five
    rows of that CSV are printed, and the dictionary is returned.

    type: non-empty label appended to the CSV file name.
    """
    assert type

    # {'ticker' : {'date' : #_of_transactions, ....} , 'ticker2' : .... }
    per_ticker = {}
    for _, row in group.iterrows():
        ticker = row[constants.TICKER]
        if not ptr_utils.isvalid(ticker):
            continue
        per_ticker = dict_utils.increment_dictionary_in_dictionary(per_ticker, ticker, row[constants.TDATE])

    # {'ticker' : {'best_date' : #_of_transactions}, .... }
    best = dict_utils.sort_dictionary_by_inner_values(dict_utils.flatten(per_ticker), reverse=True)

    filename = "most_popular_td_fe_ticker_" + type
    headers = [constants.TICKER, constants.TDATE, constants.NUMT]

    out_dir = dir_utils.makesubdir(constants.path_csv, constants.TDATE)
    csv_path = csv_utils.make_csv(out_dir, filename, best, headers)
    print(pd.read_csv(csv_path).head(5))
    return best


_ = transaction_date_wrt_ticker(house_input_df, constants.HOUSE)
_ = transaction_date_wrt_ticker(senate_input_df, constants.SENATE)
_ = transaction_date_wrt_ticker(input_df, constants.INPUT)


### The most popular transaction_date for each ticker controlling for official.

In [None]:
def transaction_date_wrt_ticker_controlled(group, type):
    """Export the most popular transaction_date of each ticker, per official.

    Each (ticker, date) pair is weighted by the number of distinct officials
    who traded on it rather than by the number of transactions.

    group: transaction table iterated with ``iterrows()``.
    type: non-empty label appended to the CSV file name.

    Rows with an invalid ticker are now skipped, as in every other
    transaction_date_wrt_* function; previously they were grouped under
    whatever placeholder value the ticker column held.
    """
    assert type

    # {'ticker' : {'date' : set(people_who_traded_on_that_day), ...}, ...}
    d_prime = {}

    for _, t in group.iterrows():
        if not ptr_utils.isvalid(t[constants.TICKER]):
            continue
        name = official.get_name(t)
        d_prime = dict_utils.increment_set_in_dictionary(d_prime, t[constants.TICKER], (t[constants.TDATE]), name)

    # {'ticker' : {'best_date' : #_of_officials}, .... }
    d = dict_utils.flatten_len(d_prime)
    d = dict_utils.sort_dictionary_by_inner_values(d, reverse=True)

    filename = "most_popular_td_fe_ticker_controlled_" + type
    key_header = constants.TICKER
    value_header = constants.TDATE
    value_header2 = constants.NUMT

    dir = dir_utils.makesubdir(constants.path_csv, constants.TDATE)
    wd = csv_utils.make_csv(dir, filename, d, [key_header, value_header, value_header2])
    df = pd.read_csv(wd)
    print(df.head(5))
    return d


_ = transaction_date_wrt_ticker_controlled(house_input_df, constants.HOUSE)
_ = transaction_date_wrt_ticker_controlled(senate_input_df, constants.SENATE)
_ = transaction_date_wrt_ticker_controlled(input_df, constants.INPUT)


### The most popular transaction_date for each transaction type.

In [None]:
def transaction_date_wrt_type(group, type):
    """Export the most popular transaction_date of each transaction type.

    Rows of ``group`` with an invalid ticker are skipped; the others are
    counted per (formatted type, transaction_date). The result of
    dict_utils.flatten_best, sorted, is written to a CSV in the
    transaction-date subdirectory, the whole CSV is printed, and the
    dictionary is returned.

    type: non-empty label appended to the CSV file name.
    """
    assert type

    # {'type' : {'date' : #_of_transactions, ....} , 'type2' : .... }
    per_type = {}
    for _, row in group.iterrows():
        if not ptr_utils.isvalid(row[constants.TICKER]):
            continue
        kind = ptr_utils.format_type(row[constants.TYPE])
        per_type = dict_utils.increment_dictionary_in_dictionary(per_type, kind, row[constants.TDATE])

    # {'type' : {'best_date' : #_of_transactions}, .... }
    best = dict_utils.sort_dictionary_by_inner_values(dict_utils.flatten_best(per_type), reverse=True)

    filename = "most_popuar_td_for_type_" + type
    headers = [constants.TYPE, constants.TDATE, constants.NUMT]

    out_dir = dir_utils.makesubdir(constants.path_csv, constants.TDATE)
    csv_path = csv_utils.make_csv(out_dir, filename, best, headers)
    print(pd.read_csv(csv_path))
    return best


_ = transaction_date_wrt_type(house_input_df, constants.HOUSE)
_ = transaction_date_wrt_type(senate_input_df, constants.SENATE)
_ = transaction_date_wrt_type(input_df, constants.INPUT)


### The most popular transaction_date for each transaction type, controlling for official.

In [None]:
def transaction_date_wrt_type_controlled(group, type):
    """Export the most popular transaction_date of each type, per official.

    Each (type, date) pair is weighted by the number of distinct officials
    who traded on it rather than by the number of transactions.

    group: transaction table iterated with ``iterrows()``.
    type: non-empty label appended to the CSV file name.

    Fixes relative to the previous version: the official was looked up
    through the undefined name ``title`` (now ``official.get_name(t)``), and
    the transaction type is normalised with ``ptr_utils.format_type`` so the
    keys match those produced by transaction_date_wrt_type.
    """
    assert type

    # {'type' : {'date' : set(people_who_traded_on_that_day), ...}, ...}
    d_prime = {}

    for _, t in group.iterrows():
        if ptr_utils.isvalid(t[constants.TICKER]):
            d_prime = dict_utils.increment_set_in_dictionary(d_prime, ptr_utils.format_type(t[constants.TYPE]), (t[constants.TDATE]), official.get_name(t))

    # Collapse every set of officials into its size so flatten_best sees numbers.
    for kind in d_prime:
        for date in d_prime[kind]:
            d_prime[kind][date] = len(d_prime[kind][date])

    # {'type' : {'best_date' : #_of_officials}, .... }
    d = dict_utils.flatten_best(d_prime)
    d = dict_utils.sort_dictionary_by_inner_values(d, reverse=True)

    filename = "most_popular_td_fe_type_controlled_" + type
    key_header = constants.TYPE
    value_header = constants.TDATE
    value_header2 = constants.NUMT

    dir = dir_utils.makesubdir(constants.path_csv, constants.TDATE)
    wd = csv_utils.make_csv(dir, filename, d, [key_header, value_header, value_header2])
    df = pd.read_csv(wd)
    print(df.head(5))
    return d


_ = transaction_date_wrt_type_controlled(house_input_df, constants.HOUSE)
_ = transaction_date_wrt_type_controlled(senate_input_df, constants.SENATE)
_ = transaction_date_wrt_type_controlled(input_df, constants.INPUT)


### The most popular transaction_date for each amount.

In [None]:
def transaction_date_wrt_amount(group, type):
    """Export the most popular transaction_date of each amount.

    Rows of ``group`` with an invalid ticker are skipped; the others are
    counted per (amount, transaction_date). The flattened result is ordered
    through dict_utils.add_sort_key_for_amount / sort_dictionary_by_sort_key
    and written with csv_utils.make_csv_breakdown; the first five rows of the
    CSV are printed and the dictionary is returned.

    type: non-empty label appended to the CSV file name.

    The key column header is now constants.AMOUNT; it used to be
    constants.TYPE, which mislabelled the amount column (the controlled
    variant already uses constants.AMOUNT).
    """
    assert type

    # {'amount' : {'date' : #_of_transactions, ....} , 'amount1' : .... }
    d_prime = {}

    for _, t in group.iterrows():
        if ptr_utils.isvalid(t[constants.TICKER]):
            d_prime = dict_utils.increment_dictionary_in_dictionary(d_prime, t[constants.AMOUNT], (t[constants.TDATE]))

    # {'amount' : {'best_date' : #_of_transactions}, .... }
    d = dict_utils.flatten(d_prime)

    d = dict_utils.add_sort_key_for_amount(d)
    d = dict_utils.sort_dictionary_by_sort_key(d)

    filename = "most_popuar_td_for_amount_" + type
    key_header = constants.AMOUNT

    dir = dir_utils.makesubdir(constants.path_csv, constants.TDATE)
    wd = csv_utils.make_csv_breakdown(dir, filename, d, key_header)
    df = pd.read_csv(wd)
    print(df.head(5))
    return d


_ = transaction_date_wrt_amount(house_input_df, constants.HOUSE)
_ = transaction_date_wrt_amount(senate_input_df, constants.SENATE)
_ = transaction_date_wrt_amount(input_df, constants.INPUT)


### The most popular transaction_date for each amount, controlling for official.

In [None]:
def transaction_date_wrt_amount_controlled(group, type):
    """Export the most popular transaction_date of each amount, per official.

    Each (amount, date) pair is weighted by the number of distinct officials
    who traded on it rather than by the number of transactions.

    group: transaction table iterated with ``iterrows()``.
    type: non-empty label appended to the CSV file name.

    Fix relative to the previous version: the official was looked up through
    the undefined name ``title``; it now uses ``official.get_name(t)`` like
    transaction_date_wrt_ticker_controlled.
    """
    assert type

    # {'amount' : {'date' : set(people_who_traded_on_that_day), ...}, ...}
    d_prime = {}

    for _, t in group.iterrows():
        d_prime = dict_utils.increment_set_in_dictionary(d_prime, t[constants.AMOUNT], (t[constants.TDATE]), official.get_name(t))

    # Collapse every set of officials into its size so flatten_best sees numbers.
    for amount in d_prime:
        for date in d_prime[amount]:
            d_prime[amount][date] = len(d_prime[amount][date])

    # {'amount' : {'best_date' : #_of_officials}, .... }
    d = dict_utils.flatten_best(d_prime)

    d = dict_utils.add_sort_key_for_amount(d)
    d = dict_utils.sort_dictionary_by_sort_key(d)

    filename = "most_popular_td_fe_amount_controlled_" + type
    key_header = constants.AMOUNT

    dir = dir_utils.makesubdir(constants.path_csv, constants.TDATE)
    wd = csv_utils.make_csv_breakdown(dir, filename, d, key_header)
    df = pd.read_csv(wd)
    print(df.head(5))
    return d


_ = transaction_date_wrt_amount_controlled(house_input_df, constants.HOUSE)
_ = transaction_date_wrt_amount_controlled(senate_input_df, constants.SENATE)
_ = transaction_date_wrt_amount_controlled(input_df, constants.INPUT)

### The most popular transaction_date for each official.

In [None]:
def transaction_date_wrt_official():
    """Most popular transaction date for each official in ``input_df``.

    Only rows with a valid ticker are counted. The per-official date tallies
    are flattened with ``dict_utils.flatten``, sorted by their inner values in
    descending order, written to ``most_popular_td_fe_official`` in the
    transaction-date sub-directory, and the first five CSV rows are printed.

    Returns:
        The sorted dictionary that was written to the CSV.
    """
    # per_official = {official: {date: number_of_transactions}}
    per_official = {}
    for _, row in input_df.iterrows():
        if not ptr_utils.isvalid(row[constants.TICKER]):
            continue
        per_official = dict_utils.increment_dictionary_in_dictionary(
            per_official,
            official.get_canonical_name(row[title]),
            row[constants.TDATE],
        )

    # Original author's note: {'person': {'best_date': #_of_transactions}, ...}
    ranked = dict_utils.sort_dictionary_by_inner_values(
        dict_utils.flatten(per_official), reverse=True
    )

    headers = [title, constants.TDATE, constants.NUMT]
    out_dir = dir_utils.makesubdir(constants.path_csv, constants.TDATE)
    csv_path = csv_utils.make_csv(out_dir, "most_popular_td_fe_official", ranked, headers)
    print(pd.read_csv(csv_path).head(5))
    return ranked

transaction_date_wrt_official_res = transaction_date_wrt_official()

### The average amount size of transactions (i.e., activity) for each transaction_date

In [None]:
def dates_and_size_of_amount(group, type):
    """Summarise the size of the transactions made on each transaction date.

    For every date the rows with a valid amount are tallied by
    ``ptr_utils.average_amount``. Each (amount, count) pair becomes the product
    ``count * amount`` and the date's figure is the geometric mean (scipy
    ``gmean``) of those products, truncated to ``int``. Note that the CSV
    column is nevertheless labelled ``average_size_of_transactions``.

    Args:
        group: DataFrame of transactions (walked with ``iterrows``).
        type: Non-empty label appended to the output file name.

    Returns:
        The sorted, comma-formatted dictionary that was written to the CSV.
    """
    assert type

    # tally = {date: {average_amount: number_of_transactions}}
    tally = {}
    for _, row in group.iterrows():
        if not ptr_utils.isvalid(row[constants.AMOUNT]):
            continue
        tally = dict_utils.increment_dictionary_in_dictionary(
            tally,
            row[constants.TDATE],
            ptr_utils.average_amount(row[constants.AMOUNT]),
        )

    size_by_date = {
        day: int(gmean([per_amount[amt] * amt for amt in per_amount]))
        for day, per_amount in tally.items()
    }

    filename = "dates_and_size_of_amount_" + type
    size_by_date = dict_utils.commify(dict_utils.sort_dictionary_by_values(size_by_date))

    out_dir = dir_utils.makesubdir(constants.path_csv, constants.TDATE)
    csv_path = csv_utils.make_csv(
        out_dir, filename, size_by_date, ["date", "average_size_of_transactions"]
    )
    print(pd.read_csv(csv_path).head(5))

    return size_by_date

_ = dates_and_size_of_amount(house_input_df, constants.HOUSE)
_ = dates_and_size_of_amount(senate_input_df, constants.SENATE)
_ = dates_and_size_of_amount(input_df, constants.INPUT)


### Number of Transactions per Person by Date 

In [None]:
def num_of_trans_per_person_per_date(group, type):
    """Count the transactions made on each transaction date.

    Despite the name, the tally is keyed by date only; no per-person split is
    made here. The sorted tally is written to a CSV in the transaction-date
    sub-directory and the first five CSV rows are printed.

    Args:
        group: DataFrame of transactions (walked with ``iterrows``).
        type: Non-empty label appended to the output file name.

    Returns:
        The sorted ``{date: number_of_transactions}`` dictionary.
    """
    assert type

    per_date = {}
    for _, row in group.iterrows():
        per_date = dict_utils.increment_dictionary(per_date, row[constants.TDATE])

    filename = "num_of_trans_per_person_per_date_" + type
    per_date = dict_utils.sort_dictionary_by_values(per_date)

    out_dir = dir_utils.makesubdir(constants.path_csv, constants.TDATE)
    csv_path = csv_utils.make_csv(
        out_dir, filename, per_date, ["date", "number_of_transactions"]
    )
    print(pd.read_csv(csv_path).head(5))
    return per_date

_ = num_of_trans_per_person_per_date(house_input_df, constants.HOUSE)
_ = num_of_trans_per_person_per_date(senate_input_df, constants.SENATE)
_ = num_of_trans_per_person_per_date(input_df, constants.INPUT)


### Frequency of Transactions per Date Controlled
_Number of transactions per date controlled by official. E.g. if Ted Baker made 40 transactions on 1/1/02 and Sam Wall made 2 transactions on 1/1/02, we conclude that there were two transactions on 1/1/02._


In [None]:
def num_of_trans_per_date_controlled(group, type):
    """Count, for each transaction date, how many distinct officials traded.

    The per-date count of distinct officials is sorted, written to a CSV in the
    ``transaction_date`` sub-directory, and the first five CSV rows are printed.

    Args:
        group: DataFrame of transactions (walked with ``iterrows``).
        type: Non-empty label appended to the output file name.

    Returns:
        The nested tally keyed by date and then by official name -- not the
        per-date counts that are written to the CSV.
    """
    assert type

    # by_date = {date: {official: number_of_transactions}}
    by_date = {}
    for _, row in group.iterrows():
        who = official.get_name(row)
        by_date = dict_utils.increment_dictionary_in_dictionary(
            by_date, row[constants.TDATE], who
        )

    distinct_counts = dict_utils.sort_dictionary_by_values(
        {day: len(by_date[day]) for day in by_date}
    )

    filename = "num_of_trans_per_date_controlled_" + type
    out_dir = dir_utils.makesubdir(constants.path_csv, "transaction_date")
    csv_path = csv_utils.make_csv(
        out_dir, filename, distinct_counts, ["date", "number_of_transactions_unique"]
    )
    print(pd.read_csv(csv_path).head(5))
    return by_date
    
_ = num_of_trans_per_date_controlled(house_input_df, constants.HOUSE)
_ = num_of_trans_per_date_controlled(senate_input_df, constants.SENATE)
_ = num_of_trans_per_date_controlled(input_df, constants.INPUT)


# CODE ABOVE NEEDS TO BE RUN,

# HERE

### Tax

#### Number of Transactions Within 2 Weeks Prior to Quarterly Tax Date 

In [None]:
def num_of_trans_within_tax_date(group, type):
    """Print the share of transactions that fall close to a quarterly tax date.

    Each row's transaction date is tested with ``ptr_utils.within_tax_date``
    and the resulting percentage (``ptr_utils.make_percent``) is printed.

    Args:
        group: DataFrame of transactions.
        type: Non-empty label used in the printed message.

    Returns:
        None.
    """
    assert type

    total = within = 0

    # iterrows() yields (index, row). DataFrame.items() would yield
    # (column name, column Series) instead, so indexing the second element by
    # the date column would not address a single transaction.
    for _, t in group.iterrows():
        total += 1
        if ptr_utils.within_tax_date(t[constants.TDATE]):
            within += 1

    print("Percent of transactions posted within two weeks of quarterly tax deadline for {}: {}%".format(type, ptr_utils.make_percent(within, total)))

    return

# Print the near-deadline share for the House frame, the Senate frame and the
# combined frame. The function returns None, so `_` is only a placeholder.
_ = num_of_trans_within_tax_date(house_input_df, constants.HOUSE)
_ = num_of_trans_within_tax_date(senate_input_df, constants.SENATE)
_ = num_of_trans_within_tax_date(input_df, constants.INPUT)

#### Number of Transactions Within 2 Weeks Prior to Quarterly Tax Date Semi-Controlled 

_Given dict='09/03/2021': {'Thomas H Tuberville': 1, 'Cynthia M Lummis': 1, 'A. Mitchell Mcconnell, Jr.': 1}... I only increment the number of within (tax date) once per date per official. So, if an official does 100 transactions on a date within two weeks of a quarterly deadline, then I only count it as one transaction._

_A note: total === number of (official, date) pairs, not the number of all transactions. Someone could have made 60 transactions on one date, and those are counted only once in both total and within, if applicable._

In [None]:
def num_of_trans_within_tax_date_controlled(group, type):
    """Share of (official, date) pairs that fall close to a quarterly tax date.

    Every official is counted once per transaction date, no matter how many
    transactions they made that day. The percentage is printed and the
    officials with at least one near-deadline date are returned.

    Args:
        group: Either a DataFrame of transactions, or an already prepared
            mapping ``{date: {official: count}}`` (any mapping from a date to
            a collection of officials works).
        type: Non-empty label (checked, but not used in the message).

    Returns:
        Set of officials who traded on at least one date for which
        ``ptr_utils.within_tax_date`` is true.
    """
    assert type

    if isinstance(group, pd.DataFrame):
        # The notebook passes raw transaction frames. Iterating a DataFrame
        # with .items() walks its columns, not dates, so build the
        # date -> officials mapping from the rows first.
        per_date = {}
        for _, t in group.iterrows():
            per_date.setdefault(t[constants.TDATE], set()).add(official.get_name(t))
    else:
        per_date = group

    total = within = 0
    people = set()

    for date, officials_on_date in per_date.items():
        pairs = len(officials_on_date)
        total += pairs
        if ptr_utils.within_tax_date(date):
            within += pairs
            people.update(officials_on_date)

    # Guard the empty case, which would otherwise divide by zero.
    percent = str((within / total) * 100)[:5] if total else "0"
    print("Percent of transactions posted within two weeks of quarterly tax deadline: {percent}%".format(percent=percent))
    return people

# Keep the set of officials returned for each data set; the following cells
# consume these three results.
num_of_trans_within_tax_date_controlled_res_house = num_of_trans_within_tax_date_controlled(house_input_df, constants.HOUSE)
num_of_trans_within_tax_date_controlled_res_senate = num_of_trans_within_tax_date_controlled(senate_input_df, constants.SENATE)
num_of_trans_within_tax_date_controlled_res_input = num_of_trans_within_tax_date_controlled(input_df, constants.INPUT)

In [None]:
def people_and_within_tax_date(people, type=None):
    """List the officials who traded near a quarterly tax date and their party split.

    Writes two CSVs into ``transaction_date/tax``: the sorted list of
    officials, and the number of those officials per party. The head of each
    CSV and the number of officials are printed.

    Args:
        people: Sized iterable of official names.
        type: Optional label appended to both output file names. Without it
            every call uses the same two file names, so successive calls for
            different data sets target the same files.

    Returns:
        None.
    """
    suffix = "_" + type if type else ""

    # todo get number of senators.
    # todo is the monetary value of that equal!!!!
    d = {}
    for i in people:
        d[i] = ""

    d = dict_utils.sort_dictionary_by_keys(d)

    dir = dir_utils.makesubdir(constants.path_csv, "transaction_date/tax")
    wd = csv_utils.make_csv(dir, "people_and_within_tax_date_list" + suffix, d, ["Officials"])
    df = pd.read_csv(wd)
    print("People who posted transactions within two weeks of quarterly tax deadline:\n {}\n".format(df.head(5)))

    print("Number of people who posted transactions within two weeks of quarterly tax deadline: {}\n".format(len(people)))

    party = {}
    for p in people:
        link = search.get_wiki_link(p)
        # input_officials_objects is defined outside this cell; a comment
        # elsewhere in the notebook describes it as {link: (name, official_object)}.
        _, obj = input_officials_objects[link]
        party = dict_utils.increment_dictionary(party, obj.party)

    party = dict_utils.sort_dictionary_by_values(party)

    wd = csv_utils.make_csv(dir, "people_and_within_tax_date_list_w_aff" + suffix, party, ["party", "number_of_filing_within_tax_date"])
    df = pd.read_csv(wd)
    print("Party breakdown of people who posted transactions within two weeks of quarterly tax deadline:\n {}\n".format(df.head(5)))

people_and_within_tax_date(num_of_trans_within_tax_date_controlled_res_house, constants.HOUSE)
people_and_within_tax_date(num_of_trans_within_tax_date_controlled_res_senate, constants.SENATE)
people_and_within_tax_date(num_of_trans_within_tax_date_controlled_res_input, constants.INPUT)

In [None]:
def people_and_within_tax_date_how_often(people):
    """Count how often each listed official traded close to a quarterly tax date.

    Walks ``input_df`` and keeps the rows whose official (canonical name) is in
    ``people`` and whose transaction date satisfies
    ``ptr_utils.within_tax_date``. Two CSVs are written into
    ``transaction_date/tax``: the raw number of such transactions per official,
    and the number of distinct dates per official. The head of each is printed.

    Args:
        people: Collection of official names to report on.

    Returns:
        None.
    """
    d = {}
    d_controlled_by_dates = {}

    for _, t in input_df.iterrows():
        # NOTE(review): membership is tested on the canonical name while the
        # tallies below are keyed by the raw t[title] value -- confirm that
        # this asymmetry is intended.
        if official.get_canonical_name(t[title]) in people and ptr_utils.within_tax_date(t[constants.TDATE]):
            d = dict_utils.increment_dictionary(d, t[title])
            d_controlled_by_dates = dict_utils.increment_dictionary_in_dictionary(d_controlled_by_dates, t[constants.TDATE], t[title])

    # One increment per (date, official) pair: number of distinct dates per official.
    d_controlled_by_dates_res = {}
    for date in d_controlled_by_dates:
        for person in d_controlled_by_dates[date]:
            d_controlled_by_dates_res = dict_utils.increment_dictionary(d_controlled_by_dates_res, person)

    d = dict_utils.sort_dictionary_by_values(d)
    d_controlled_by_dates_res = dict_utils.sort_dictionary_by_values(d_controlled_by_dates_res)

    dir = dir_utils.makesubdir(constants.path_csv, "transaction_date/tax")
    wd = csv_utils.make_csv(dir, "people_and_within_tax_date_how_often", d, [title, "number_of_filing_within_tax_date"])
    df = pd.read_csv(wd)
    print("People who posted transactions within two weeks of quarterly tax deadline and the number of transactions posted:\n {}\n".format(df.head(5)))

    wd = csv_utils.make_csv(dir, "people_and_within_tax_date_how_often_date_controlled", d_controlled_by_dates_res, [title, "number_of_filing_within_tax_date_date_controlled"])
    df = pd.read_csv(wd)
    print("People who posted transactions within two weeks of quarterly tax deadline and the number of transactions posted controlled by date:\n {}\n".format(df.head(5)))


# The function scans the combined input_df, so it is given the result computed
# for the combined data set. The previously used name (without a suffix) is
# never assigned; only the _house, _senate and _input results exist.
people_and_within_tax_date_how_often(num_of_trans_within_tax_date_controlled_res_input)



## Owner (owner)

### Frequency Count of Owner

In [None]:
def freq_count_of_owner(group, type):
    """Count how many transactions list each owner value.

    Owner strings are normalised with ``str.capitalize`` before counting. The
    sorted tally is written to ``freq_count_of_owner_<type>`` in the owner
    sub-directory and the first five CSV rows are printed.

    Args:
        group: DataFrame of transactions (walked with ``iterrows``).
        type: Non-empty label appended to the output file name.

    Returns:
        The sorted ``{owner: number_of_transactions}`` dictionary.
    """
    assert type

    # d = {'Joint' : 5}
    d = {}

    for _, t in group.iterrows():
        owner = t[constants.OWNER]
        # A missing owner (e.g. a float NaN) has no .capitalize(); skip it.
        if not isinstance(owner, str):
            continue
        d = dict_utils.increment_dictionary(d, owner.capitalize())

    d = dict_utils.sort_dictionary_by_values(d)

    # Headers, file name and directory describe the owner tally; they were
    # previously copied from a per-state profile report.
    key_header = constants.OWNER
    value_header = "number_of_transactions"
    filename = "freq_count_of_owner_" + type

    dir = dir_utils.makesubdir(constants.path_csv, constants.OWNER)
    wd = csv_utils.make_csv(dir, filename, d, [key_header, value_header])
    df = pd.read_csv(wd)
    print(df.head(5))

    print("Number of distinct owner values: {}".format(len(d)))
    return d


_ = freq_count_of_owner(house_input_df, constants.HOUSE)
_ = freq_count_of_owner(senate_input_df, constants.SENATE)
_ = freq_count_of_owner(input_df, constants.INPUT)
    
    

## Ticker (ticker)

### Number of Tickers

In [None]:
def num_of_tickers(group, type):
    """Count how often each ticker appears and how many distinct tickers there are.

    Rows without a valid ticker are ignored. The tally is written to
    ``num_of_tickers_<type>`` in the ticker sub-directory; the number of
    distinct tickers and the first five CSV rows are printed.

    Args:
        group: DataFrame of transactions (walked with ``iterrows``).
        type: Non-empty label used in the message and the output file name.

    Returns:
        The sorted ticker tally that was written to the CSV.
    """
    assert type

    # d_prime = {'ticker' : #_of_times }
    d_prime = {}

    for _, t in group.iterrows():
        if ptr_utils.isvalid(t[constants.TICKER]):
            d_prime = dict_utils.increment_dictionary(d_prime, t[constants.TICKER])

    # NOTE(review): d_prime is a flat {ticker: count} mapping, while other
    # cells pass nested dictionaries to sort_dictionary_by_inner_values and use
    # sort_dictionary_by_values for flat ones -- confirm this helper is the
    # intended one.
    d = dict_utils.sort_dictionary_by_inner_values(d_prime, reverse=True)

    filename = "num_of_tickers_" + type
    key_header = constants.TICKER
    value_header = "frequency"

    print("Number of tickers in {}: {}".format(type, len(d_prime)))

    dir = dir_utils.makesubdir(constants.path_csv, constants.TICKER)
    wd = csv_utils.make_csv(dir, filename, d, [key_header, value_header])
    df = pd.read_csv(wd)
    print(df.head(5))
    return d

# Run the function defined in this cell (the driver previously invoked
# transaction_date_wrt_sector, leaving num_of_tickers unused).
_ = num_of_tickers(house_input_df, constants.HOUSE)
_ = num_of_tickers(senate_input_df, constants.SENATE)
_ = num_of_tickers(input_df, constants.INPUT)
    

### Frequency of Ticker per Year

In [None]:
def frequency_of_ticker_breakdown_ticker():
    """Break the number of transactions per ticker down by year, plus a total.

    Rows of ``input_df`` without a valid ticker are ignored. The result is
    written to ``trans_per_year_breakdown`` in the ``ticker`` sub-directory and
    the first two CSV rows are printed.

    Uses the namespaced helpers (``ptr_utils``, ``dict_utils``, ``dir_utils``,
    ``csv_utils``, ``constants``) that the notebook imports, instead of bare
    helper names that the notebook's imports do not provide.

    Returns:
        None.
    """
    # d = {ticker: {year: number_of_transactions, ..., "Total": number_of_transactions}}
    d = {}

    for _, t in input_df.iterrows():
        ticker = t[constants.TICKER]
        if ptr_utils.isvalid(ticker):
            d = dict_utils.increment_dictionary_in_dictionary(d, ticker, ptr_utils.get_year(t[constants.TDATE]))
            d = dict_utils.increment_dictionary_in_dictionary(d, ticker, "Total")

    d = dict_utils.sort_dictionary_by_keys(d)

    filename = "trans_per_year_breakdown"
    key_header = "ticker"

    dir = dir_utils.makesubdir(constants.path_csv, "ticker")
    wd = csv_utils.make_csv_breakdown(dir, filename, d, key_header)
    print(pd.read_csv(wd).head(2))

frequency_of_ticker_breakdown_ticker()

### Frequency of Ticker per Date

In [None]:
def frequency_of_ticker_by_date():
    """Break the number of transactions per ticker down by transaction date.

    Rows of ``input_df`` without a valid ticker are ignored. The result is
    written to ``frequency_of_ticker_by_date`` in the ``ticker`` sub-directory
    and the first two CSV rows are printed.

    Iterates ``input_df`` and calls the namespaced helpers the notebook
    imports; the former bare names (``rows``, ``isvalid``, ...) are not
    provided by those imports.

    Returns:
        None.
    """
    # d = {ticker: {transaction_date: number_of_transactions}}
    d = {}

    for _, t in input_df.iterrows():
        ticker = t[constants.TICKER]
        if ptr_utils.isvalid(ticker):
            d = dict_utils.increment_dictionary_in_dictionary(d, ticker, t[constants.TDATE])

    filename = "frequency_of_ticker_by_date"
    key_header = "ticker"

    dir = dir_utils.makesubdir(constants.path_csv, "ticker")
    wd = csv_utils.make_csv_breakdown(dir, filename, d, key_header)
    print(pd.read_csv(wd).head(2))

frequency_of_ticker_by_date()

### Industry

#### Number of Transactions per Industry
_Not controlled in any way._

In [None]:
def number_of_transactions_per_indusry():
    """Count the transactions that map to each industry.

    A transaction is counted when ``search_mapping`` returns a truthy industry
    for its ticker. The sorted tally is written to the ``ticker/industry``
    sub-directory and the first five CSV rows are printed.

    NOTE(review): ``rows``, ``get_mapping``, ``search_mapping`` and the other
    bare helper names are not provided by the imports at the top of the
    notebook -- confirm they are defined before this cell runs.

    Returns:
        The sorted ``{industry: number_of_transactions}`` dictionary.
    """
    mapping = get_mapping()

    tally = {}
    for _, transaction in rows:
        industry = search_mapping(mapping, transaction['ticker'])
        if not industry:
            continue
        tally = increment_dictionary(tally, industry)

    tally = sort_dictionary_by_values(tally)

    out_dir = makesubdir(path_csv, "ticker/industry")
    csv_path = make_csv(
        out_dir,
        "number_of_transactions_per_indusry",
        tally,
        ["industry", "number_of_transactions"],
    )
    print(pd.read_csv(csv_path).head(5))

    return tally

number_of_transactions_per_indusry_res = number_of_transactions_per_indusry()

#### Industry Breakdown per Official

In [None]:
def frequency_of_industry_breakdown_official():
    """Break each official's transactions down by industry.

    Transactions whose ticker has no truthy industry in the mapping are
    skipped. The result, sorted by official, is written to the
    ``ticker/industry`` sub-directory and the first two CSV rows are printed.

    NOTE(review): ``rows``, ``get_mapping``, ``search_mapping`` and the other
    bare helper names are not provided by the imports at the top of the
    notebook -- confirm they are defined before this cell runs.

    Returns:
        None.
    """
    mapping = get_mapping()

    # per_official = {official: {industry: number_of_transactions}}
    per_official = {}
    for _, transaction in rows:
        industry = search_mapping(mapping, transaction['ticker'])
        if not industry:
            continue
        per_official = increment_dictionary_in_dictionary(
            per_official, get_canonical_name(transaction[title]), industry
        )

    per_official = sort_dictionary_by_keys(per_official)

    out_dir = makesubdir(path_csv, "ticker/industry")
    csv_path = make_csv_breakdown(
        out_dir, "frequency_of_industry_per_official", per_official, title
    )
    print(pd.read_csv(csv_path).head(2))

frequency_of_industry_breakdown_official()

#### Frequency of Industry per Year

In [None]:
def frequency_of_industry_breakdown():
    """Break the number of transactions per industry down by year.

    Only transactions with a valid ticker that maps to a truthy industry are
    counted. The result, sorted by industry, is written to the
    ``ticker/industry`` sub-directory and the first two CSV rows are printed.

    NOTE(review): ``rows``, ``get_mapping``, ``search_mapping`` and the other
    bare helper names are not provided by the imports at the top of the
    notebook -- confirm they are defined before this cell runs.

    Returns:
        None.
    """
    mapping = get_mapping()

    # per_industry = {industry: {year: number_of_transactions}}
    per_industry = {}
    for _, transaction in rows:
        ticker = transaction['ticker']
        if not isvalid(ticker):
            continue
        industry = search_mapping(mapping, ticker)
        if not industry:
            continue
        per_industry = increment_dictionary_in_dictionary(
            per_industry, industry, get_year(transaction['transaction_date'])
        )

    per_industry = sort_dictionary_by_keys(per_industry)

    out_dir = makesubdir(path_csv, "ticker/industry")
    csv_path = make_csv_breakdown(
        out_dir, "frequency_of_industry_breakdown", per_industry, "industry"
    )
    print(pd.read_csv(csv_path).head(2))

frequency_of_industry_breakdown()

## Asset Description (asset_description) and Comment (comment)

### Number of Scanned PDFs.

In [None]:
def number_of_scanned_pdfs():
    """Print how many rows of ``input_df`` are marked as scanned PDFs.

    A row counts when its asset description equals ``constants.DISCLOSED``.
    The count is printed together with its share of all rows as computed by
    ``ptr_utils.make_percent``.
    """
    scanned = sum(
        1
        for _, row in input_df.iterrows()
        if row[constants.ASSET_DESCRIPTION] == constants.DISCLOSED
    )
    share = ptr_utils.make_percent(scanned, len(input_df.index))
    print("Number of scanned PDFS: {}, {}\n".format(scanned, share))

number_of_scanned_pdfs()
    

### Number of Transactions w/asset_description or comment

In [None]:
def freq_of_asset_description():
    """Print how many rows of ``input_df`` carry an asset description or a comment.

    A row counts when either field passes ``ptr_utils.isvalid``. Both the
    comma-formatted count and its share of all rows are printed.
    """
    described = 0
    for _, row in input_df.iterrows():
        has_description = ptr_utils.isvalid(row[constants.ASSET_DESCRIPTION])
        if has_description or ptr_utils.isvalid(row[constants.COMMENT]):
            described += 1

    print("Number of transactions with asset_description or comment: {}".format(ptr_utils.commify_str(described)))

    share = ptr_utils.make_percent(described, len(input_df.index))
    print("Number of transactions with asset_description or comment controlled: {} \n".format(share))

freq_of_asset_description()

## Asset Type (asset_type)

### Frequency of Asset Type

In [None]:
def frequency_of_asset_type():
    """Count the transactions of each asset type.

    Rows of ``input_df`` without a valid ``asset_type`` are ignored. The sorted
    tally is written to the ``asset_type`` sub-directory and the first five
    CSV rows are printed.

    Iterates ``input_df`` and calls the namespaced helpers the notebook
    imports; the former bare names (``rows``, ``isvalid``, ...) are not
    provided by those imports.

    Returns:
        None.
    """
    d = {}

    for _, t in input_df.iterrows():
        asset_type = t['asset_type']
        if ptr_utils.isvalid(asset_type):
            d = dict_utils.increment_dictionary(d, asset_type)

    d = dict_utils.sort_dictionary_by_values(d)

    filename = "frequency_of_asset_type"
    key_header = "asset_type"
    value_header = "number_of_transactions"

    dir = dir_utils.makesubdir(constants.path_csv, "asset_type")
    wd = csv_utils.make_csv(dir, filename, d, [key_header, value_header])
    df = pd.read_csv(wd)
    print(df.head(5))


frequency_of_asset_type()

## Amount

### Frequency of Amount by Person

In [None]:
def frequency_of_amount_by_persom():
    """Break each official's transactions down by amount.

    The result, sorted by official, is written to
    ``frequency_of_amount_by_persom`` in the ``amount`` sub-directory and the
    first two CSV rows are printed.

    Iterates ``input_df`` and calls the namespaced helpers the notebook
    imports; the former bare names (``rows``, ``makesubdir``, ...) are not
    provided by those imports.

    Returns:
        None.
    """
    # d = {official: {amount: number_of_transactions}}
    d = {}

    for _, t in input_df.iterrows():
        d = dict_utils.increment_dictionary_in_dictionary(d, official.get_canonical_name(t[title]), t[constants.AMOUNT])

    d = dict_utils.sort_dictionary_by_keys(d)

    filename = "frequency_of_amount_by_persom"
    # The outer key is the official, so the key column is labelled with
    # `title` (as the other per-official breakdowns do), not "amount".
    key_header = title

    dir = dir_utils.makesubdir(constants.path_csv, "amount")
    wd = csv_utils.make_csv_breakdown(dir, filename, d, key_header)
    print(pd.read_csv(wd).head(2))


frequency_of_amount_by_persom()

### Frequency of Transactions by Amount

In [None]:
def frequency_of_amount_total():
    """Count the transactions of each amount.

    The tally gets a sort key via ``dict_utils.add_sort_key_for_amount``, is
    sorted by it, written to ``frequency_of_amount_total`` in the ``amount``
    sub-directory, and the first two CSV rows are printed.

    Iterates ``input_df`` and calls the namespaced helpers the notebook
    imports; the former bare names (``rows``, ``makesubdir``, ...) are not
    provided by those imports.

    Returns:
        None.
    """
    d = {}

    for _, t in input_df.iterrows():
        d = dict_utils.increment_dictionary(d, t[constants.AMOUNT])

    d = dict_utils.add_sort_key_for_amount(d, normal_header="num_of_transactions", normal=True)
    d = dict_utils.sort_dictionary_by_sort_key(d)

    filename = "frequency_of_amount_total"
    key_header = "amount"

    dir = dir_utils.makesubdir(constants.path_csv, "amount")
    wd = csv_utils.make_csv_breakdown(dir, filename, d, key_header)
    print(pd.read_csv(wd).head(2))


frequency_of_amount_total()

### Frequency of Amount by Gender 

In [None]:
def frequency_of_amount_by_gender():
    """Break the number of transactions per amount down by the official's gender.

    Each official is looked up once with ``wiki_search`` and the gender is
    cached for later rows. The result is sorted by amount, written to the
    ``amount`` sub-directory, and the first two CSV rows are printed.

    NOTE(review): ``rows``, ``wiki_search`` and the other bare helper names are
    not provided by the imports at the top of the notebook -- confirm they are
    defined before this cell runs.

    Returns:
        None.
    """
    gender_of = {}   # official -> gender, filled on first sight
    breakdown = {}   # amount -> {gender: number_of_transactions}

    for _, transaction in rows:
        person = transaction[title]

        if person not in gender_of:
            gender_of[person] = wiki_search(person).get_gender()

        breakdown = increment_dictionary_in_dictionary(
            breakdown, transaction['amount'], gender_of[person]
        )

    breakdown = sort_dictionary_by_sort_key(add_sort_key_for_amount(breakdown))

    out_dir = makesubdir(path_csv, "amount")
    csv_path = make_csv_breakdown(
        out_dir, "frequency_of_amount_by_gender", breakdown, "amount"
    )
    print(pd.read_csv(csv_path).head(2))


frequency_of_amount_by_gender()

### Frequency of Transactions by Political Affiliation and Amount

In [None]:


def frequency_of_amount_by_aff():
    """Break the number of transactions per amount down by party affiliation.

    Each official is looked up once with ``wiki_search`` and the ``party``
    attribute is cached for later rows. The result is sorted by amount,
    written to the ``amount`` sub-directory, and the first two CSV rows are
    printed.

    NOTE(review): ``rows``, ``wiki_search`` and the other bare helper names are
    not provided by the imports at the top of the notebook -- confirm they are
    defined before this cell runs.

    Returns:
        None.
    """
    party_of = {}    # official -> party, filled on first sight
    breakdown = {}   # amount -> {party: number_of_transactions}

    for _, transaction in rows:
        person = transaction[title]

        if person not in party_of:
            party_of[person] = wiki_search(person).party

        breakdown = increment_dictionary_in_dictionary(
            breakdown, transaction['amount'], party_of[person]
        )

    breakdown = sort_dictionary_by_sort_key(add_sort_key_for_amount(breakdown))

    out_dir = makesubdir(path_csv, "amount")
    csv_path = make_csv_breakdown(
        out_dir, "frequency_of_amount_by_aff", breakdown, "amount"
    )
    print(pd.read_csv(csv_path).head(2))


frequency_of_amount_by_aff()

### Average For Buys and Sells per Official 

In [None]:
def average_per_person():
    """Summarise the size of each official's transactions.

    Rows of ``input_df`` with a valid amount are tallied per official by
    ``ptr_utils.average_amount``. Each (amount, count) pair becomes the product
    ``count * amount`` and the official's figure is the geometric mean (scipy
    ``gmean``) of those products, truncated to ``int``. The sorted,
    comma-formatted result is written to the ``amount`` sub-directory and the
    first five CSV rows are printed.

    Iterates ``input_df`` and calls the namespaced helpers the notebook
    imports; the former bare names (``rows``, ``isvalid``, ...) are not
    provided by those imports.

    Returns:
        The dictionary that was written to the CSV.
    """
    # d = {official: {average_amount: number_of_transactions}}
    d = {}

    for _, t in input_df.iterrows():
        if ptr_utils.isvalid(t[constants.AMOUNT]):

            # if 'Purchase' in transaction['type']:

            # if 'Sale' in transaction['type']:
            d = dict_utils.increment_dictionary_in_dictionary(d, official.get_canonical_name(t[title]), ptr_utils.average_amount(t[constants.AMOUNT]))

    d2 = {}
    for person in d:
        products = [d[person][amount] * amount for amount in d[person]]
        d2[person] = int(gmean(products))

    filename = "average_per_person"
    key_header = title
    value_header = "average_size_of_transactions"

    d2 = dict_utils.sort_dictionary_by_values(d2)
    d2 = dict_utils.commify(d2)

    dir = dir_utils.makesubdir(constants.path_csv, "amount")
    wd = csv_utils.make_csv(dir, filename, d2, [key_header, value_header])
    df = pd.read_csv(wd)
    print(df.head(5))

    return d2

average_per_person_res = average_per_person()

## Types (type)

### Types of Actions Total

In [None]:
def frequency_of_act():
    """Count the transactions of each transaction type.

    Rows of ``input_df`` without a valid ``type`` are ignored. The sorted tally
    is written to the ``type`` sub-directory and the first five CSV rows are
    printed.

    Iterates ``input_df`` and calls the namespaced helpers the notebook
    imports; the former bare names (``rows``, ``isvalid``, ...) are not
    provided by those imports.

    Returns:
        None.
    """
    d = {}

    for _, t in input_df.iterrows():
        action = t['type']
        if ptr_utils.isvalid(action):
            d = dict_utils.increment_dictionary(d, action)

    d = dict_utils.sort_dictionary_by_values(d)

    filename = "frequency_of_act"
    key_header = "type"
    value_header = "number_of_transactions"

    dir = dir_utils.makesubdir(constants.path_csv, "type")
    wd = csv_utils.make_csv(dir, filename, d, [key_header, value_header])
    df = pd.read_csv(wd)
    print(df.head(5))


frequency_of_act()

### Types of Transactions per Person

In [None]:
from utils import increment_dictionary_in_dictionary, sort_dictionary_by_keys, get_data, path_csv,makesubdir
from csv_utils import make_csv_breakdown
import pandas as pd 
from official import get_canonical_name


def types_of_transactions_per_person():
    """Break each official's transactions down by transaction type.

    The result, sorted by official, is written to
    ``types_of_transactions_per_person`` in the ``type`` sub-directory and the
    first five CSV rows are printed.

    Iterates ``input_df`` (the former ``rows`` name is neither imported nor
    defined in this cell) and calls the namespaced helpers imported at the top
    of the notebook, so the function does not rely on this cell's flat imports.

    Returns:
        The sorted ``{official: {type: number_of_transactions}}`` dictionary.
    """
    d = {}

    for _, t in input_df.iterrows():
        d = dict_utils.increment_dictionary_in_dictionary(d, official.get_canonical_name(t[title]), t['type'])

    filename = "types_of_transactions_per_person"

    d = dict_utils.sort_dictionary_by_keys(d)

    dir = dir_utils.makesubdir(constants.path_csv, "type")
    wd = csv_utils.make_csv_breakdown(dir, filename, d, title)
    df = pd.read_csv(wd)
    print(df.head(5))

    return d

types_of_transactions_per_person_res = types_of_transactions_per_person()

    

## Frequency

### Number of Transactions per Year

In [None]:

def num_of_trans_per_year():
    """Count the transactions made in each year.

    The year is taken from the transaction date with ``ptr_utils.get_year``.
    The sorted tally is written to the ``frequency`` sub-directory and the
    first five CSV rows are printed.

    Iterates ``input_df`` and calls the namespaced helpers the notebook
    imports; the former bare names (``rows``, ``get_year``, ...) are not
    provided by those imports.

    Returns:
        The sorted ``{year: number_of_transactions}`` dictionary.
    """
    d = {}

    for _, t in input_df.iterrows():
        d = dict_utils.increment_dictionary(d, ptr_utils.get_year(t[constants.TDATE]))

    filename = "num_of_trans_per_year"
    key_header = "year"
    value_header = "number_of_transactions"

    d = dict_utils.sort_dictionary_by_values(d)

    dir = dir_utils.makesubdir(constants.path_csv, "frequency")
    wd = csv_utils.make_csv(dir, filename, d, [key_header, value_header])
    df = pd.read_csv(wd)
    print(df.head(5))

    return d

num_of_trans_per_year_res = num_of_trans_per_year()

### Number of Transactions per Person 
_Not controlling for number of years in position or size of transaction._ For each official, we want their total number of transactions.  {'Sam': 5, 'Alex': 2424, ...}

In [None]:

def num_of_trans_per_person():
    """Count the transactions of each official.

    Officials are keyed by their canonical name. The sorted tally is written
    to the ``frequency`` sub-directory and the first five CSV rows are printed.

    Iterates ``input_df`` and calls the namespaced helpers the notebook
    imports; the former bare names (``rows``, ``get_canonical_name``, ...) are
    not provided by those imports.

    Returns:
        The sorted ``{official: number_of_transactions}`` dictionary.
    """
    d = {}

    for _, t in input_df.iterrows():
        d = dict_utils.increment_dictionary(d, official.get_canonical_name(t[title]))

    filename = "num_of_trans_per_person"
    key_header = title
    value_header = "number_of_transactions"

    d = dict_utils.sort_dictionary_by_values(d)

    dir = dir_utils.makesubdir(constants.path_csv, "frequency")
    wd = csv_utils.make_csv(dir, filename, d, [key_header, value_header])
    df = pd.read_csv(wd)
    print(df.head(5))

    return d

trans_per_person_res = num_of_trans_per_person()


### Number of Transactions per Person Controlled
_Divide number of transactions by number of years in official position.  Not controlling for size of transaction._

In [None]:
def num_of_trans_per_person_controlled(rows):
    """Average number of transactions per year in office for each official.

    Args:
        rows: Mapping ``{official: number_of_transactions}``.

    Returns:
        The sorted ``{official: ceil(transactions / years_in_office)}``
        dictionary, which is also written to the ``frequency`` sub-directory;
        the first five CSV rows are printed.
    """
    import math  # local import: `math` is not among the notebook's top-level imports

    d = {}

    for person, num_of_trans in rows.items():
        # NOTE(review): `wiki_search` is not provided by any import visible in
        # this notebook -- confirm it is defined before this cell runs.
        # The result is bound to `profile` so the imported `official` module
        # is not shadowed.
        profile = wiki_search(person)
        years_in_office = profile.get_num_of_years()
        # Zero (or missing) years would raise ZeroDivisionError/TypeError;
        # treat such officials as having served one year.
        years = years_in_office if years_in_office and years_in_office > 0 else 1
        d[person] = math.ceil(num_of_trans / years)

    filename = "num_of_trans_per_person_controlled"
    value_header = "avg_number_of_transactions"

    d = dict_utils.sort_dictionary_by_values(d)

    dir = dir_utils.makesubdir(constants.path_csv, "frequency")
    wd = csv_utils.make_csv(dir, filename, d, [title, value_header])
    print(pd.read_csv(wd).head(5))

    return d

num_of_trans_per_person_controlled_res = num_of_trans_per_person_controlled(trans_per_person_res)
    

### Number of Transactions per Person Controlled w/Number of Years in Congress

In [None]:
def num_of_trans_per_person_controlled_w_seniority(rows):
    """Per-year transaction rate of each official together with their years in office.

    Args:
        rows: Mapping ``{official: number_of_transactions}``.

    Returns:
        The sorted ``{official: (ceil(transactions / years), years)}``
        dictionary, which is also written to the ``frequency`` sub-directory;
        the first five CSV rows are printed.
    """
    import math  # local import: `math` is not among the notebook's top-level imports

    d = {}

    for person, num_of_trans in rows.items():
        # NOTE(review): `wiki_search` is not provided by any import visible in
        # this notebook -- confirm it is defined before this cell runs.
        # The result is bound to `profile` so the imported `official` module
        # is not shadowed.
        profile = wiki_search(person)
        years_in_office = profile.get_num_of_years()
        # Zero (or missing) years would raise ZeroDivisionError/TypeError;
        # the rate then uses one year while the reported seniority stays raw.
        years = years_in_office if years_in_office and years_in_office > 0 else 1
        d[person] = math.ceil(num_of_trans / years), years_in_office

    filename = "num_of_trans_per_person_controlled_w_seniority"
    value_header = "avg_number_of_transactions"

    d = dict_utils.sort_dictionary_by_values(d)

    dir = dir_utils.makesubdir(constants.path_csv, "frequency")
    wd = csv_utils.make_csv(dir, filename, d, [title, value_header, "seniority"])
    print(pd.read_csv(wd).head(5))

    return d

num_of_trans_per_person_controlled_w_seniority_res = num_of_trans_per_person_controlled_w_seniority(trans_per_person_res)
    

### Number of Transactions per Person Controlled w/Degrees

In [None]:
def num_of_trans_per_person_controlled_w_degrees(rows):
    """Per-year transaction rate of each official together with their number of degrees.

    Args:
        rows: Mapping ``{official: number_of_transactions}``.

    Returns:
        The sorted ``{official: (ceil(transactions / years), num_of_degrees)}``
        dictionary, which is also written to the ``frequency`` sub-directory;
        the first five CSV rows are printed.
    """
    import math  # local import: `math` is not among the notebook's top-level imports

    d = {}

    for person, num_of_trans in rows.items():
        # NOTE(review): `wiki_search` is not provided by any import visible in
        # this notebook -- confirm it is defined before this cell runs.
        # The result is bound to `profile` so the imported `official` module
        # is not shadowed.
        profile = wiki_search(person)
        years_in_office = profile.get_num_of_years()
        # Zero (or missing) years would raise ZeroDivisionError/TypeError;
        # treat such officials as having served one year.
        years = years_in_office if years_in_office and years_in_office > 0 else 1
        d[person] = math.ceil(num_of_trans / years), profile.get_num_of_degrees()

    filename = "num_of_trans_per_person_controlled_w_degrees"
    value_header = "avg_number_of_transactions"

    d = dict_utils.sort_dictionary_by_values(d)

    dir = dir_utils.makesubdir(constants.path_csv, "frequency")
    wd = csv_utils.make_csv(dir, filename, d, [title, value_header, "num_of_degrees"])
    print(pd.read_csv(wd).head(5))

    return d

num_of_trans_per_person_controlled_w_degrees_res = num_of_trans_per_person_controlled_w_degrees(trans_per_person_res)
    

### Number of Transactions per Person Controlled w/Affiliation 

In [None]:

def num_of_trans_per_person_controlled_w_aff(rows):
    """Pair each official's value with their party and export the result.

    Args:
        rows: Mapping ``{official: value}``; the notebook passes the per-year
            transaction rates.

    Returns:
        A tuple of the sorted ``{official: (value, party)}`` dictionary and the
        first ten rows of the CSV written to the ``frequency`` sub-directory.
        The first five CSV rows are printed.

    NOTE(review): ``wiki_search`` and the other bare helper names are not
    provided by the imports at the top of the notebook -- confirm they are
    defined before this cell runs.
    """
    with_party = {
        person: (val, wiki_search(person).get_party())
        for person, val in rows.items()
    }
    with_party = sort_dictionary_by_values(with_party)

    out_dir = makesubdir(path_csv, "frequency")
    csv_path = make_csv(
        out_dir,
        "num_of_trans_per_person_controlled_w_aff",
        with_party,
        [title, "avg_number_of_transactions", "party"],
    )
    exported = pd.read_csv(csv_path)
    print(exported.head(5))

    return with_party, exported.head(10)

num_of_trans_per_person_controlled_w_aff_res, top_10 = num_of_trans_per_person_controlled_w_aff(num_of_trans_per_person_controlled_res)
    

### Transaction Date (transaction_date)

#### Frequency of Differences between Transaction and Disclosure Date

In [None]:
def frequency_of_differences():
    """Histogram of the gap between disclosure date and transaction date.

    For every transaction the gap is ``difference_between_dates(disclosure,
    transaction)``. The number of transactions per integer gap is written,
    sorted by gap, to the ``disclosure_date/transaction_date`` sub-directory,
    and the floor-divided mean gap is printed.

    NOTE(review): ``rows``, ``difference_between_dates`` and the other bare
    helper names are not provided by the imports at the top of the notebook --
    confirm they are defined before this cell runs.

    Returns:
        None.
    """
    histogram = {}          # int(gap) -> number of transactions
    officials_by_gap = {}   # gap -> {official: count}; built but not exported
    seen = gap_sum = 0

    for _, transaction in rows:
        traded_on = transaction['transaction_date']
        disclosed_on = transaction['disclosure_date']

        # Original author's note: negative means X days BEFORE, positive
        # means Y days AFTER.
        gap = difference_between_dates(disclosed_on, traded_on)
        seen += 1
        gap_sum += gap

        officials_by_gap = increment_dictionary_in_dictionary(
            officials_by_gap, gap, transaction[title]
        )
        histogram = increment_dictionary(histogram, int(gap))

    histogram = sort_dictionary_by_keys(histogram)

    out_dir = makesubdir(path_csv, "disclosure_date/transaction_date")
    make_csv(
        out_dir,
        "frequency_of_differences",
        histogram,
        ["difference_in_days", "#_of_transactions_with_that_diff"],
    )
    # Plotting of the CSV (graph_csv) is disabled in the notebook.

    print("Average difference in days: {}".format(gap_sum // seen))

frequency_of_differences()

## Committee and Industry Check

In [None]:
def committee_and_industry():
    """Print every transaction whose ticker industry appears in one of the official's committees.

    For each row of ``input_df`` the ticker's industry is looked up with
    ``search.get_industry`` and tested with ``in`` against every entry of the
    official's ``asgts``; each hit prints ``industry, committee, name``.

    NOTE: a later cell defines a function with this same name; whichever cell
    ran last determines what the name refers to.

    Returns:
        None.
    """
    for _, t in input_df.iterrows():
        industry = search.get_industry(t[constants.TICKER])

        # An empty string is contained in every string, and a non-string
        # (e.g. None or NaN) cannot be tested with `in` against a string, so
        # rows without a usable industry are skipped.
        if not isinstance(industry, str) or not industry:
            continue

        name = official.get_name(t)
        link = input_all_officials_name[name]
        _, obj = input_officials_objects[link]

        committees = obj.asgts

        for comm in committees:
            if industry in comm:
                print(industry, comm, name)

# Actually run the check; the bare function name alone only evaluates to the
# function object.
committee_and_industry()

In [None]:
def committee_and_industry():
    """Collect, per official and committee, the tickers whose industry matches the committee.

    For each row of ``input_df`` the ticker's industry is looked up with
    ``search.get_industry`` and tested with ``in`` against the official's
    ``asgts`` entries; the ticker is recorded under the first matching
    committee only.

    NOTE: an earlier cell defines a function with this same name; this
    definition replaces it once the cell has run.

    Returns:
        Dictionary of the form ``{name: {committee: set(tickers)}}``.
    """
    # TODO (from the original author): "gotta check date" -- the transaction
    # date is not yet compared with the committee membership period.
    # d = {name : { commitee : set(tickers)}, }
    d = {}

    for _, t in input_df.iterrows():
        industry = search.get_industry(t[constants.TICKER])

        # An empty string is contained in every string, and a non-string
        # (e.g. None or NaN) cannot be tested with `in` against a string, so
        # rows without a usable industry are skipped.
        if not isinstance(industry, str) or not industry:
            continue

        name = official.get_name(t)
        link = input_all_officials_name[name]
        _, obj = input_officials_objects[link]

        committees = obj.asgts

        for comm in committees:
            if industry in comm:
                d = dict_utils.increment_set_in_inner_dictionary(d, name, comm, t[constants.TICKER])
                break

    return d

# Call the function (the bare name alone does not run it) and keep the result.
committee_and_industry_res = committee_and_industry()