In [1]:
import os
from bs4 import BeautifulSoup
import re
import json
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

data_folder_path =  "../../src/data/datasets/cckm_3700/v3/raw/"

def get_tbls_length(html_content):
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html_content, 'lxml')
    # Find all tables in the HTML
    tables = soup.find_all('table')
    table_wtage_len = [0]
    table_wotage_len = [0]

    for table in tables:
        table_wotage_len.append(len(table.get_text()))
        table_wtage_len.append(len(str(table)))
    return table_wtage_len, table_wotage_len, len(str(soup)), len(str(soup.get_text()))

In [2]:
all_data_info = []
for filename in os.listdir(data_folder_path):
    file_path = os.path.join(data_folder_path, filename)
    with open(file_path) as f:
        data = json.load(f)
    table_wtage_len, table_wotage_len, article_wtag_len, article_notag_len = get_tbls_length(data['BW_Article_Details__c'])
    all_data_info.append([filename, data['ArticleNumber'], data['Title'], data['UrlName']
                          , data['BW_TXNMY_GRP_1'], data['BW_TXNMY_GRP_2'], data['BW_TXNMY_GRP_3']
                           , article_wtag_len, article_notag_len, table_wtage_len, table_wotage_len, len(table_wotage_len)])


  soup = BeautifulSoup(html_content, 'lxml')


In [3]:
df = pd.DataFrame(all_data_info, columns=['filename', 'ArticleNumber', 'Title', 'UrlName',
                                           'BW_TXNMY_GRP_1', 'BW_TXNMY_GRP_2', 'BW_TXNMY_GRP_3', 'article_wtag_len', 'article_notag_len',
                                             'table_wtage_len', 'table_wotage_len', '#tables'])
df.to_csv('./3700ccmk.csv')
df.head(2)
df_copy = df.copy()

#### The distributions for each for the groups over the full list of articles:

In [4]:
df['g1_2'] = df['BW_TXNMY_GRP_1'] + '_' + df['BW_TXNMY_GRP_2']
for col in ["BW_TXNMY_GRP_1", "g1_2", "BW_TXNMY_GRP_2", "BW_TXNMY_GRP_3"]:
    fig = px.histogram(df, x=col, nbins=10, histnorm='percent')
    fig.show()

In [5]:
print ('Count of G2:', len(df['BW_TXNMY_GRP_2'].unique()))
fig = px.histogram(df, x='BW_TXNMY_GRP_2',color='BW_TXNMY_GRP_1', nbins=10, histnorm='percent')
fig.show()

Count of G2: 34


	• 8 GPR_1. Good coverage on majority except Decision_Flow
	• 34 GPR_2 groups. 
		○ GPR_1 has a different range of GPR_2 (starting with 2 up to 7).
		○ Not well distributed. We have 1/2/3 dominated GPR_2 in each of the GPR_1.

#### The distributions of the table counts (overall and across groups):

In [6]:
col = '#tables'
fig = px.histogram(df, x=col, nbins=50, histnorm='percent')
mean = df['#tables'].mean()
percentile_90 = np.percentile(df[col], 90)

# Add vertical lines for mean and percentile
fig.add_trace(go.Scatter(x=[mean, mean], y=[0, 1], mode='lines', name='mean'))
fig.add_trace(go.Scatter(x=[percentile_90, percentile_90], y=[0, 1], mode='lines', name='90th percentile'))
fig.show()

fig = px.histogram(df, x=col, color = 'BW_TXNMY_GRP_1', nbins=40, histnorm='percent')
fig.show()


    ○ 60% < 6 tables 
    ○ Equipment & Offers have many articles with > 14 tables (maybe a lot of nested tables, we need to validate if we have a lot of failure retrieval here)

#### The distributions of the article length w/o tags (overall and across groups):

In [7]:
for col in ['article_wtag_len', 'article_notag_len']:
    fig = px.histogram(df, x=col, nbins=50, histnorm='percent')
    # Calculate mean and percentile
    mean = df[col].mean()
    percentile_90 = np.percentile(df[col], 90)

    # Add vertical lines for mean and percentile
    fig.add_trace(go.Scatter(x=[mean, mean], y=[0, 1], mode='lines', name='mean'))
    fig.add_trace(go.Scatter(x=[percentile_90, percentile_90], y=[0, 1], mode='lines', name='90th percentile'))
    fig.show()

    fig = px.histogram(df, x=col, color = 'BW_TXNMY_GRP_1', nbins=50, histnorm='percent')
    fig.show()

    ○ ~25% < 2k (can fit in 2 chunks with 1500 chars)
    ○ ~50% < 4k
    ○ ~67% < 6k
    ○ Per G1:
        § Few articles from Equipment are really long (50k, 77k, 83k) - is the failure Qs from the very long articles??
        § Support/Systems  ~75% < 4k. Longest is ~50k (maybe just one/two articles.).
        § Operations/ Bill & Account / Offers ~47% < 4k. Longest is ~50k (maybe just one/two articles.).
        § Services: ~60%<10k but many articles much longer (up to 40k)
        § Equipment very wide range (up to 83k) majority < 10k.

#### The distributions of the article length w/o tags (overall and across groups), focusing on outlier (>90%):

In [8]:
for section in [True, False]:
    if section:
        print ("Majority:")
    else:
        print ("last 10%")
    for col in ['article_wtag_len', 'article_notag_len']:
        tmp_df = df.explode(col).copy()
        percentile_90 = np.percentile(tmp_df[col], 90)
        if section:
            tmp_df = tmp_df[tmp_df[col] < percentile_90 ]
        else:
            tmp_df = tmp_df[tmp_df[col] >= percentile_90 ]
        fig = px.histogram(tmp_df, x=col, color = 'BW_TXNMY_GRP_1', nbins=40, histnorm='percent')
        # fig.update_layout(barmode='group')

        # Calculate mean and percentile
        mean = tmp_df[col].mean()
        

        # Add vertical lines for mean and percentile
        fig.add_trace(go.Scatter(x=[mean, mean], y=[0, 1], mode='lines', name='mean'))
        # fig.add_trace(go.Scatter(x=[percentile_90, percentile_90], y=[0, 1], mode='lines', name='90th percentile'))

        fig.show()

Majority:


last 10%


#### The distributions of the tables' length w/o tags (overall and across groups):

In [9]:
for col in ['table_wotage_len', 'table_wtage_len']:
    tmp_df = df.explode(col).copy()
    fig = px.histogram(tmp_df, x=col, nbins=80, histnorm='percent')
    # Calculate mean and percentile
    mean = tmp_df[col].mean()
    percentile_90 = np.percentile(tmp_df[col], 90)

    # Add vertical lines for mean and percentile
    fig.add_trace(go.Scatter(x=[mean, mean], y=[0, 1], mode='lines', name='mean'))
    fig.add_trace(go.Scatter(x=[percentile_90, percentile_90], y=[0, 1], mode='lines', name='90th percentile'))

    fig.show()

    fig = px.histogram(tmp_df, x=col, color = 'BW_TXNMY_GRP_1', nbins=80, histnorm='percent')
    fig.show()


	• Table Length (plain text - no tags):
		○ ~78% of the tables <=1k (one chunk).
		○ ~91% of the tables <=2k.
        ○ For each of the G1, there is very few outliers (1/2/3 very long tables between 10k to 50k length). 

In [11]:
for section in [True, False]:
    if section:
        print ("Majority:")
    else:
        print ("last 10%")
    for col in ['table_wotage_len', 'table_wtage_len']:
        tmp_df = df.explode(col).copy()
        percentile_90 = np.percentile(tmp_df[col], 90)
        if section:
            tmp_df = tmp_df[tmp_df[col] < percentile_90 ]
        else:
            tmp_df = tmp_df[tmp_df[col] >= percentile_90 ]
        fig = px.histogram(tmp_df, x=col, color = 'BW_TXNMY_GRP_1', nbins=40, histnorm='percent')
        # fig.update_layout(barmode='group')

        # Calculate mean and percentile
        mean = tmp_df[col].mean()
        

        # Add vertical lines for mean and percentile
        fig.add_trace(go.Scatter(x=[mean, mean], y=[0, 1], mode='lines', name='mean'))
        # fig.add_trace(go.Scatter(x=[percentile_90, percentile_90], y=[0, 1], mode='lines', name='90th percentile'))

        fig.show()

Majority:


last 10%
