In [2]:
import pandas as pd
from pprint import pprint as pprint

def read_prefixes(filename):
    """ 
    :param filename: path to a CSV file with columns "ns", "uri"
    :return: 
    """
    return pd.read_csv(filename, header=0, names=["ns", "uri"])

def read_fp_distinct(filename):
    """ 
    :param filename: path to a CSV file with columns "s", "p", "o"
    :return: 
    """
    return pd.read_csv(filename, header=0, names=["s", "p", "o"])

def read_fp_count(filename):
    """ 
    :param filename: path to a CSV file with columns "s", "p", "o", "c"
    :return: 
    """
    return pd.read_csv(filename, header=0, names=["s", "p", "o", "c"])

def replace_ns(triples, ns_dataframe):
    """ 
    :param triples: a dataframe with columns (s,p,o)
    :param ns_dataframe: a dataframe with columns (ns,uri)
    :return: triples replaced with ns
    """
    d = dict(zip(ns_dataframe.uri, ns_dataframe.ns))
    return triples.replace(d, regex=True)

def df_to_set_of_tuples(df, list_of_desired_columns=["s", "p", "o"]):
    """ 
    :param df: the fingerprint dataframe 
    :param list_of_desired_columns: in case it is a fingerprint with counts then this is an option 
            to remove the count column, or select any combination of columns
    :return: set of tuples
ns = read_prefixes("test/prefix.csv")

fp1 = replace_ns(fp1,ns)
fp2 = replace_ns(fp2,ns)

print "Done."
    """
    return set([tuple(line) for line in df[list_of_desired_columns].values.tolist()])

def df_diff(alpha, beta):
    """ 
    provides the set difference between two dataframes
    :param alpha: first dataframe
    :param beta: second dataframe
    :return: (a^b, a - b, b - a)
    """
    a = df_to_set_of_tuples(alpha)
    b = df_to_set_of_tuples(beta)
    return a.intersection(b), a.difference(b), b.difference(a)


fp1 = read_fp_count("test/dataset_figerprint_for_count.rq_eurovoc44.log")
fp2 = read_fp_count("test/dataset_figerprint_for_count.rq_EV45OLD.log")

Done.


The next section defines functions generating parts of latex documents

In [8]:
from pylatex import Package
from pylatex import Section, Subsection, Document
from pylatex.base_classes import Environment
from pylatex.utils import NoEscape


class LandscapeEnvironment(Environment):
    _latex_name = 'landscape'
    packages = [Package('pdflscape')]

def diff_to_latex_section(tex_doc, alpha, alpha_description, beta, beta_description,
                          cols=["Subject", "Predicate", "Object"]):
    """ 
    :param tex_doc: the pylatex document
    :param alpha: the first fingerprint
    :param alpha_description: the first fingerprint description
    :param beta: the second fingerprint
    :param beta_description: the second fingerprint description
    :return: returns a section of latex document with deltas
    """
    cmn_s, adb_s, bda_s = df_diff(alpha, beta)

    cmn_df = pd.DataFrame(list(cmn_s), columns=cols, )
    cmn_df.sort_values(by=cols, inplace=True)

    adb_df = pd.DataFrame(list(adb_s), columns=cols, )
    adb_df.sort_values(by=cols, inplace=True)

    bda_df = pd.DataFrame(list(bda_s), columns=cols, )
    bda_df.sort_values(by=cols, inplace=True)

    ref_alpha = alpha_description["title"]
    ref_beta = beta_description["title"]

    section_title = 'Difference between ' + ref_alpha + ' and ' + ref_beta
    with tex_doc.create(LandscapeEnvironment()):
        with tex_doc.create(Section(section_title)):
            # tex_doc.append(alpha_description)
            # tex_doc.append(beta_description)
            with tex_doc.create(Subsection("Common parts")) as subsec:
                subsec.append("The table below represents the elements common to both datasets.")
                # with tex_doc.create(Table(position='H')) as tbl:
                tex_doc.append(NoEscape(cmn_df.to_latex(longtable=True, index=False)))

            with tex_doc.create(Subsection("Unique to " + ref_alpha)) as subsec:
                subsec.append(
                    "The table below represents the elements present in " + ref_alpha + " but missing in " + ref_beta + ".")
                # with tex_doc.create(Table(position='H')) as tbl:
                tex_doc.append(NoEscape(adb_df.to_latex(longtable=True, index=False)))

            with tex_doc.create(Subsection("Unique to " + ref_beta)) as subsec:
                subsec.append(
                    "The table below represents the elements present in " + ref_beta + " but missing in " + ref_alpha + ".")
                # with tex_doc.create(Table(position='H')) as tbl:
                tex_doc.append(NoEscape(bda_df.to_latex(longtable=True, index=False)))

                # diff_to_latex_table(fp1, "Bla Bla", fp2, "Blu blu")


configuration_dict = {
    "type": "difference between two dataset fingerprints",
    "alpha": {"title": "EuroVoc 4.4",
              "filename": "test/dataset_figerprint_for_count.rq_eurovoc44.log",
              "desc": "EuroVoc 4.4 was released a long time ago using EuroVoc Ontology"},
    "beta": {"title": "EuroVoc 4.5",
             "filename": "test/dataset_figerprint_for_count.rq_EV45OLD.log",
             "desc": "EuroVoc 4.5 was released in July 2016 with SKOS-AP-EU and then converted to fit also the old EuroVoc Ontology."},
}

def generate_document(filename, config=configuration_dict):
    """ 
    :param filename: filename for the tex document
    :param config: 
    :return: None
    """
    geometry_options = {
        "head": "40pt",
        "margin": "0.5in",
        "bottom": "0.6in",
        "includeheadfoot": True
    }
    doc = Document('basic', geometry_options=geometry_options)
    
    doc.packages.append(Package('longtable'))
    doc.packages.append(Package('booktabs'))
    doc.packages.append(Package('float'))
    doc.packages.append(Package('ltablex'))
    doc.packages.append(Package('pdflscape'))
    
    first_page = PageStyle("firstpage")
    
    
    # Add document title
    with first_page.create(Head("R")) as right_header:
        with right_header.create(MiniPage(width=NoEscape(r"0.49\textwidth"),
                                 pos='c', align='r')) as title_wrapper:
            title_wrapper.append(LargeText(bold("Bank Account Statement")))
            title_wrapper.append(LineBreak())
            title_wrapper.append(MediumText(bold("Date")))

    # Add footer
    with first_page.create(Foot("C")) as footer:
        message = "Important message please read"
        with footer.create(Tabularx(
                "X X X X",
                width_argument=NoEscape(r"\textwidth"))) as footer_table:

            footer_table.add_row(
                [MultiColumn(4, align='l', data=TextColor("blue", message))])
            footer_table.add_hline(color="blue")
            footer_table.add_empty_row()

            branch_address = MiniPage(
                width=NoEscape(r"0.25\textwidth"),
                pos='t')
            branch_address.append("960 - 22nd street east")
            branch_address.append("\n")
            branch_address.append("Saskatoon, SK")

            document_details = MiniPage(width=NoEscape(r"0.25\textwidth"),
                                        pos='t', align='r')
            document_details.append("1000")
            document_details.append(LineBreak())
            document_details.append(simple_page_number())

            footer_table.add_row([branch_address, branch_address,
                                  branch_address, document_details])

    doc.preamble.append(first_page)
    # End first page style
    


    diff_to_latex_section(doc, fp1, config["alpha"], fp2, config["beta"])
    doc.generate_tex(filepath=filename)
    doc.generate_pdf(clean_tex=False, filepath=filename)

generate_document("temp/diff_report", configuration_dict)

In [54]:
from df_io import read_prefixes, read_fp_spo_count, replace_ns

ns = read_prefixes("test/prefix.csv")

df = replace_ns(read_fp_spo_count("test/test_fingerprint_spo.csv"), ns)

df = df[df['p'] == 'rdf:type'][['stype', 'scnt']].sort_values(by='scnt', ascending=False)

total_scnt = df['scnt'].sum()
df['rel_stype_scnt'] = df['scnt'] / total_scnt * 100

print df['rel_stype_scnt'].sum()
df.rename(columns={'stype': 'Class', 'scnt': 'Unique instances', 'rel_stype_scnt': '% from scaled total'}, inplace=True)

df

100.0


Unnamed: 0,Class,Unique instances,% from scaled total
199,skoxl:Label,41099,80.709713
48,euvoc:XlNotation,3244,6.370527
3,ato:MappedCode,1340,2.631476
151,skos:Concept,1176,2.309414
82,dct:Agent,1139,2.236754
231,org:Organisation,1139,2.236754
290,foaf:Agent,1139,2.236754
52,euvoc:XlNote,618,1.213621
32,euvoc:LabelType,11,0.021602
14,euvoc:ConceptStatus,9,0.017674


In [5]:
from df_desc_stats import *
from df_io import read_prefixes, read_fp_spo_count, replace_ns
import re

ns = read_prefixes("resources/prefix.csv")

df = replace_ns(read_fp_spo_count("resources/test_fingerprint_spo.csv"), ns)

"""
 for each group of ['stype','p'] aim at reducing duplicates to one record bu following method.
 If the 'propType' is 'data' then sum the 'scnt', 'ocnt', 'cnt' and average the 'min_sp', 
 'max_sp' and 'avg_sp'.  If the 'propType' is 'object' then expect the numbers to be exactly the 
 same on every row and then just copy the values of the first row.
"""

df_reduced = pd.DataFrame(columns=df.columns)

# reducing the groups to single row groups
# iterate over the groups of ['stype', 'p']
for name, group in df.groupby(['stype', 'p']):
    if len(group) > 1:
        if len(group[group['propType'] == 'data']['propType']) > 0:
            # process data rows
            aggregated = {'stype': name[0], 'p': name[1], 'propType': 'data', 'ootype': '*'}
            aggregated['scnt'] = [group['scnt'].sum()]
            aggregated['ocnt'] = [group['ocnt'].sum()]
            aggregated['cnt'] = [group['cnt'].sum()]
            aggregated['min_sp'] = [group['min_sp'].min()]
            aggregated['max_sp'] = [group['max_sp'].max()]
            aggregated['avg_sp'] = [group['avg_sp'].mean()]
            aggregated['ootype'] = ", ".join(group['ootype'])
            df_reduced = df_reduced.append(pd.DataFrame(aggregated, columns=df.columns))
        elif len(group[group['propType'] == 'object']['propType']) > 0:
            # process object rows
            g = group[0:1]
            g['ootype'].iloc[0] = ", ".join(group['ootype'])
            df_reduced = df_reduced.append(g)
        else:
            # serios offense this palce should never be reached
            raise Exception("Is the query distinguishing more then two types of properties?")
    else:
        # simple group of single row
        df_reduced = df_reduced.append(group)

# Next step
df_class_st = df_class_stats(df)
df_stats = None

card = lambda x: str(x['min_sp']) + " .. " + str(x['max_sp']) + "(" + str(x['avg_sp']) + ")"
# calculating the averages and relatives per class for each property
# iterate over the groups of ['stype']
for name, group in df.groupby(['stype']):
    type_scnt = group[group['p'] == 'rdf:type']['scnt'].iloc[0]
    type_cnt = group[group['p'] == 'rdf:type']['cnt'].iloc[0]
    group["scnt/type-scnt"] = group['scnt'] / type_scnt * 100
    group["cnt/type-cnt"] = group['cnt'] / type_cnt * 100
    group["caard"] = group['min_sp'].astype(str) + " .. " + group['max_sp'].round().astype(str) + "(" + group[
        'avg_sp'].astype(int).astype(str) + ")"
    if df_stats is None:
        df_stats = pd.DataFrame(columns=group.columns)
    df_stats = df_stats.append(group)


df_stats['min_ap'] = df_stats[ df_stats['scnt/type-scnt'] > 80 ]['min_sp'].astype(int).astype(str)
df_stats['min_ap'].fillna(0, inplace=True)

df_stats['max_ap'] = df_stats[ (df_stats['scnt/type-scnt'] > 80) & (df_stats['max_sp'] ==1)]['max_sp'].astype(int).astype(str)
df_stats['max_ap'].fillna("*", inplace=True)

df_stats[['stype','p','scnt','cnt','min_ap','max_ap']]


def url_local_split(url_string):
    """
    splits an url into base and local strings where the local is the lat segment after a # or /
    :param url_string:
    :return: a tuple of base and local string, if the delimiter char is not found returns (None,url_string) meaning
    that the whole string is a local segment
    """
    l = [i.start() for i in re.finditer("[/#]", url_string)]
    if l:
        return url_string[:l[-1]+1], url_string[l[-1] + 1:]
    return "", url_string


def generate_missing_ns(df, structural_columns=['stype', 'p', 'ootype']):
    """
    given a dataframe detect unique namespaces
    :param df:
    :return:
    """
    unique_uris = set()
    for col in structural_columns:
        uris = set(url_local_split(i)[0] for i in df[col] if url_local_split(i)[0])
        unique_uris = unique_uris.union(uris)
        
    print unique_uris

generate_missing_ns(df_stats)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


TypeError: expected string or buffer

('werwefds/#', '4333')