In [41]:
!pip install domain-connect humanize validators pandas tabulate --exists-action i

Collecting tabulate
  Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Installing collected packages: tabulate
Successfully installed tabulate-0.9.0


In [3]:
import dc_scanner_test as scanner
import pandas as pd

In [32]:
# Scanner dumps to compare, newest first. Every entry pairs a 'date' label
# (carried along as the snapshot identifier by the cells below) with the path
# of the dump file that is handed to the scanner loader.
# NOTE(review): the '2023-06' label points at a file named '2023-07_...' —
# confirm which month is the intended one.
dump_files = [
    {'date': label, 'file': path}
    for label, path in (
        ('2024-02', './data/2024-02_last_dump_full_6700000.pckl'),
        ('2023-06', './data/2023-07_save_final_dump_full_159000000.pckl'),
        ('2022-04', './data/2022-04_with_templates_dump_full_9700000.pckl'),
        ('2021-06', './data/2021-06_results_82500000.pckl'),
        ('2019-03', './data/2019-03_result_138400000.pckl'),
        ('2018-08', './data/2018-08_result_94900000.pckl'),
    )
]

In [33]:
# Load each dump in turn and remember its provider map together with the
# dump's date label.
# NOTE(review): `scanner.load_api_providers` presumably fills the module-level
# `scanner.api_url_map` as a side effect (and, given the `.pckl` extension,
# likely unpickles the file — only load trusted dumps). Confirm in dc_scanner_test.
maps = []
for dump_entry in dump_files:
    scanner.load_api_providers(dump_entry['file'])
    maps.append({
        'date': dump_entry['date'],
        'api_url_map': scanner.api_url_map,
    })
    # Drop the module's reference so the map captured above is not shared
    # with whatever the next load puts into `scanner.api_url_map`.
    scanner.api_url_map = None

In [34]:
def _group_supported_templates(supported):
    """Regroup template pairs into a nested ``{first: {second: True}}`` dict.

    Args:
        supported: Iterable of indexable items whose element ``[0]`` becomes
            the outer key and element ``[1]`` the inner key (presumably
            ``(provider id, service id)`` pairs — TODO confirm), or a value
            that has already been regrouped by this function, or ``None``.

    Returns:
        dict: ``{item[0]: {item[1]: True, ...}, ...}``. An empty or missing
        input yields ``{}``; an already regrouped dict is returned unchanged.
    """
    if not supported:
        return {}
    # Idempotence guard: when this cell is re-run, the attribute already holds
    # the nested dict. Iterating it again would index into the characters of
    # its string keys and silently corrupt the data.
    if isinstance(supported, dict) and all(
        isinstance(key, str) and isinstance(value, dict)
        for key, value in supported.items()
    ):
        return supported
    grouped = {}
    for templ in supported:
        grouped.setdefault(templ[0], {})[templ[1]] = True
    return grouped


# Normalize every provider object in place so it flattens into one column per
# (outer key, inner key) template combination.
for entry in maps:
    for item in entry['api_url_map'].values():
        item.supported_templates = _group_supported_templates(
            getattr(item, 'supported_templates', None)
        )
        # Discard the nameserver list; it is replaced by an empty dict so it
        # contributes no columns when the object is flattened.
        item.nslist = {}

In [35]:
def flatten_object(obj):
    """Return a new flat dict describing ``obj``.

    Creates an empty dict, lets ``flatten_object_int`` populate it from
    ``obj`` (with no key prefix), and returns that dict.

    Args:
        obj: The object to flatten.

    Returns:
        dict: The dict populated by ``flatten_object_int``.
    """
    flat = dict()
    flatten_object_int(obj, flat)
    return flat

def flatten_object_int(obj, flat_dict, prefix=''):
    """Flatten every non-dunder attribute of ``obj`` into ``flat_dict``.

    Each name reported by ``dir(obj)`` that is not of the form ``__name__``
    is read with ``getattr`` and passed to ``flatten_value``. The key handed
    over is the lower-cased attribute name, preceded by ``prefix`` and a dot
    when ``prefix`` is not empty.

    Args:
        obj: Object whose attributes are flattened.
        flat_dict: Dict that receives the flattened entries (mutated in place).
        prefix: Key prefix for nested objects; ``''`` at the top level.

    Returns:
        dict: ``flat_dict`` itself.
    """
    visible_names = (
        name for name in dir(obj)
        if not (name.startswith('__') and name.endswith('__'))
    )
    for name in visible_names:
        value = getattr(obj, name)
        key = name.lower()
        if prefix != '':
            key = prefix + '.' + key
        flatten_value(value, flat_dict, key)
    return flat_dict

def flatten_value(attr_value, result_dict, prefix=''):
    """Flatten a single value into ``result_dict`` under the key ``prefix``.

    Dispatches on the type of ``attr_value``:

    * ``int`` / ``float`` / ``str`` / ``bool`` are stored as-is.
    * ``list`` / ``tuple`` items are flattened recursively under
      ``prefix[<index>]``.
    * ``dict`` values are flattened recursively under
      ``prefix.<lower-cased key>``.
    * objects that expose ``__dict__`` are handed to ``flatten_object_int``
      with the same prefix.
    * anything else is stored as ``str(attr_value)``; this includes ``None``,
      which ends up as the string ``'None'``.

    Empty lists, tuples and dicts add no entry.

    Args:
        attr_value: The value to flatten.
        result_dict: Dict that receives the flattened entries (mutated in place).
        prefix: Key under which the value (or its children) is stored.

    Returns:
        None. The result is written into ``result_dict``.
    """
    if isinstance(attr_value, (int, float, str, bool)):
        result_dict[prefix] = attr_value
    elif isinstance(attr_value, (list, tuple)):
        for i, item in enumerate(attr_value):
            flatten_value(item, result_dict, prefix + '[' + str(i) + ']')
    elif isinstance(attr_value, dict):
        for k, v in attr_value.items():
            # str() first: dict keys are not guaranteed to be strings, and
            # calling .lower() on e.g. an int key would raise AttributeError.
            flatten_value(v, result_dict, prefix + '.' + str(k).lower())
    elif hasattr(attr_value, '__dict__'):  # a more complex object: recurse into its attributes
        flatten_object_int(attr_value, result_dict, prefix)
    else:
        result_dict[prefix] = str(attr_value)  # Convert to string if unsure


In [36]:
# Flatten the data
# Flatten the data: one flat record per (dump, API endpoint). Each record
# starts with the dump's date label and the endpoint key, followed by the
# flattened attributes of the provider object stored under that key.
rows = [
    {'date': snapshot['date'], 'api': api_key, **flatten_object(provider)}
    for snapshot in maps
    for api_key, provider in snapshot['api_url_map'].items()
]
len(rows)

29717

In [37]:
# Build one DataFrame from all flattened records: one row per record in `rows`,
# with the keys of the flattened dicts as columns.
df = pd.DataFrame(rows)
# Bare last expression so the notebook renders the (truncated) frame.
df

Unnamed: 0,date,api,api_url,cnt,config.providername,config.domain,config.domain_root,config.host,config.providerdisplayname,config.providerid,...,supported_templates.getreadydigital.com.site-a-record,supported_templates.getreadydigital.com.text-dynamic-record,supported_templates.getreadydigital.com.site-a-txt-ssl-record,supported_templates.retroliste.com.domain-verification,supported_templates.mailchimp.com.email-signing,supported_templates.brevo.com.domain-authentication,supported_templates.maileon.com.hosting_and_email_ionos,supported_templates.owner.com.onboarding,supported_templates.megapowerani.com.domain.verification.template,supported_templates.nicesite.so.hosting
0,2024-02,None: NoAnswer,None: NoAnswer,1976484,,dummy.local,dummy.local,,,,...,,,,,,,,,,
1,2024-02,None: No valid URL in answers,None: No valid URL in answers,254693,,dummy.local,dummy.local,,,,...,,,,,,,,,,
2,2024-02,None: NXDOMAIN or YXDOMAIN,None: NXDOMAIN or YXDOMAIN,1947969,,dummy.local,dummy.local,,,,...,,,,,,,,,,
3,2024-02,https://api.cloudflare.com/dns/domainconnect,https://api.cloudflare.com/dns/domainconnect,597763,cloudflare,0-0job.com,0-0job.com,,Cloudflare,cloudflare.com,...,,,,,,,,,,
4,2024-02,https://domainconnect.api.godaddy.com,https://domainconnect.api.godaddy.com,1222205,GoDaddy,0--0------------------------------------------...,0--0------------------------------------------...,,GoDaddy,godaddy.com,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29712,2018-08,https://domainconnect.plesk.com/host/websitesi...,https://domainconnect.plesk.com/host/websitesi...,39,Plesk,uguricbak.com,uguricbak.com,,,plesk.com,...,,,,,,,,,,
29713,2018-08,https://domainconnect.plesk.com/host/plesk-v01...,https://domainconnect.plesk.com/host/plesk-v01...,1,Plesk,abbicuradite.com,abbicuradite.com,,,plesk.com,...,,,,,,,,,,
29714,2018-08,https://domainconnect.plesk.com/host/dns19679....,https://domainconnect.plesk.com/host/dns19679....,1,Plesk,scilevantechapter.com,scilevantechapter.com,,,plesk.com,...,,,,,,,,,,
29715,2018-08,https://dc.ispgateway.de,https://dc.ispgateway.de,148152,DomainFactory,mittwede.com,mittwede.com,,,,...,,,,,,,,,,


In [45]:
# Sum 'cnt' for every ('date', 'config.providername') combination.
provider_counts = (
    df.groupby(['date', 'config.providername'])['cnt']
    .sum()
    .reset_index()
)

# Total 'cnt' of each date, aligned row-by-row with `provider_counts`.
date_totals = provider_counts.groupby('date')['cnt'].transform('sum')

# Add the per-date total and each provider's share of it, in percent,
# rounded to one decimal place.
grouped_df = provider_counts.assign(
    total_cnt_by_date=date_totals,
    percentage=(provider_counts['cnt'] / date_totals * 100).round(1),
)

# Reshape: one row per provider name, one percentage column per date.
pivot_df = grouped_df.pivot(
    index='config.providername',
    columns='date',
    values='percentage',
)

# Order providers by their share in the '2023-06' dump, largest first.
sort_column = '2023-06'
if sort_column not in pivot_df.columns:
    print("Column '2023-06' does not exist in the DataFrame.")
else:
    pivot_df = pivot_df.sort_values(by=sort_column, ascending=False)

# Plain-text table; missing (provider, date) combinations are left blank.
print(pivot_df.reset_index().to_string(na_rep=""))

date             config.providername  2018-08  2019-03  2021-06  2022-04  2023-06  2024-02
0                               None     73.7     73.5     66.2     68.4     65.4     67.5
1                            GoDaddy     22.2     22.8     23.1     20.3     22.0     18.2
2                         cloudflare                        3.5      5.5      4.8      8.9
3                     Google Domains                        2.4      2.1      3.5      2.2
4                              IONOS                                 1.4      1.7      1.3
5                      Secure Server      1.2      1.1      0.9      0.8      0.8      0.6
6                              iPage                                 0.6      0.7      0.2
7                      WordPress.com      0.0      0.1      0.4      0.3      0.4      0.2
8                            123 Reg      0.0      0.0      0.3      0.2      0.3      0.2
9                         hosteurope      0.1      0.1      0.1      0.0      0.1      0.0