# Unique Identifier Analysis

In [2]:
import pypyodbc 
import pandas as pd
import ast
import glob
import textwrap
import numpy as np
from pathlib import Path
from datetime import date
import warnings
import re
import os
import networkx as nx
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')


In [3]:
## Connection settings. The values look like placeholders -- presumably to be replaced
## with the real server/database names before running (confirm).
SERVER = 'SERVERNAME'
## DATABASE is also used as the prefix of the result file names written by later cells
DATABASE = 'DATABASE_NAME'
## Trusted connection: no username or password is embedded in the connection string
connectionString = f'DRIVER={{SQL Server Native Client 11.0}};SERVER={SERVER};DATABASE={DATABASE};TRUSTED_CONNECTION=yes'
## Shared connection handle; the query helpers in later cells read it as a module-level global
cnxn = pypyodbc.connect(connectionString)
## NOTE(review): `type` shadows the Python builtin and is not referenced in the visible cells
type = 'view' ## 'table' or 'view'
## Name prefix of the form <database>.dbo. ; not referenced in the visible cells
qualification = DATABASE+'.dbo.'

In [45]:
def create_query_string(sql_file, encoding='utf-16'):
    """Read a SQL file and return its text with common leading whitespace removed.

    Args:
        sql_file: Path of the file to read.
        encoding: Text encoding used to decode the file (defaults to UTF-16).

    Returns:
        The file contents after ``textwrap.dedent``.
    """
    with open(sql_file, 'r', encoding=encoding) as handle:
        raw_text = handle.read()
    ## Strip the indentation shared by every line; relative indentation is kept
    return textwrap.dedent(raw_text)

## Discover Primary Keys

Find columns which do not have duplicate values

In [21]:
## TODO: Populate list of tables/views to be assessed
## Each name is concatenated directly onto 'Select * from ' by uniqueness_check,
## so it must be a valid object name for the connected database.
table_list = ['table1','table2']

In [13]:
def uniqueness_check(tablename):
    """Load a whole table and report the columns whose values never repeat.

    The table is read with ``Select * from <tablename>`` over the module-level
    ``cnxn`` connection.  For each column the size of its largest group of
    equal values is computed; a column counts as unique when that size is 1.
    Every unique column is appended to the file at the module-level ``uq_path``
    as ``tablename<TAB>column`` and announced with ``print``.

    Args:
        tablename: Name of the table or view to query.

    Returns:
        A ``(data_df, unique_cols)`` tuple: the loaded DataFrame and the list
        of column names found to be unique.
    """
    data_df = pd.read_sql_query('Select * from '+tablename, cnxn)
    unique_cols = []
    for column in data_df.columns:
        ## Size of the biggest bucket of identical values in this column
        largest_group = data_df.groupby(column).size().max()
        if largest_group != 1:
            continue
        with open(uq_path, 'a') as out_file:
            out_file.write('\t'.join([tablename, column]) + '\n')
        print(column + ' in '+tablename+' is unique')
        unique_cols.append(column)
    return data_df, unique_cols

In [None]:
##LOAD PREVIOUS##
## Results are cached in a tab-separated file (one "tablename<TAB>columnname" row per
## unique column) so that tables already listed there are not queried again.
uq_path = DATABASE+'_UniqueColumns.csv'
uq_names = ['tablename','columnname']
if os.path.isfile(uq_path):
    unique_cols = pd.read_csv(uq_path, names=uq_names, sep='\t')
else:
    ## First run: no cache file yet. Start from an empty frame so the membership
    ## test below works instead of raising NameError.
    unique_cols = pd.DataFrame(columns=uq_names)

## Build the lookup once rather than scanning the column for every table.
## Note: a table with no unique column is never written to the file, so it is
## re-checked on every run.
already_checked = set(unique_cols['tablename'])
for tablename in table_list:
    if tablename not in already_checked:
        data_df, u_cols = uniqueness_check(tablename)

##LOAD UPDATED##
## The file only exists once at least one unique column has been recorded;
## otherwise keep the (empty) frame from above instead of failing on a missing file.
if os.path.isfile(uq_path):
    unique_cols = pd.read_csv(uq_path, names=uq_names, sep='\t')

In [None]:
## Display the (tablename, columnname) pairs recorded as unique
unique_cols

## Discover composite keys

Discover which tables have duplicate values for candidate unique identifier (UID) field(s)

For those tables, identify which other fields take more than one distinct value for the same UID

For those fields, determine which ones vary together

Validate final composite key choice

In [20]:
## TODO: Give the names and suggested primary key for the tables to be analyzed for composite keys
## aka, those that did not return any viable unique columns
## Maps table name -> candidate ID column; the ID name is lower-cased before it is used.
cpk_dict = {'table1':'id1'
            ,'table2':'id1'
}

In [49]:
def cpk_fields(data_df, tablename, idname):
    """Return the columns that take more than one distinct value for some ID.

    ``data_df`` is grouped by ``idname`` and the number of distinct values of
    each column is counted per group.  Columns whose largest per-group count
    exceeds 1 are returned, and the same list is appended to the file at the
    module-level ``nuc_path`` as ``tablename<TAB>str(list)``.

    Args:
        data_df: DataFrame holding the table's rows.
        tablename: Table name written to the output file.
        idname: Column of ``data_df`` to group by.

    Returns:
        List of column names that vary within at least one ``idname`` group.
    """
    distinct_per_id = data_df.groupby(idname).nunique()
    ## Worst case across all IDs, one entry per column
    worst_case = distinct_per_id.max()
    non_unique_cols = [col for col, n_distinct in worst_case.items() if n_distinct > 1]
    with open(nuc_path, 'a') as out_file:
        out_file.write('\t'.join([tablename, str(non_unique_cols)]) + '\n')
    return non_unique_cols

def analyze_cpks(data_df, tablename, idname, groupcol):
    """Measure how every other column varies within each (idname, groupcol) pair.

    For each column of ``data_df`` other than the two key columns, the number
    of distinct values is counted per ``(idname, groupcol)`` group and the
    largest count is recorded.  A largest count of 1 means the column never
    varies within a pair.

    Both key names are lower-cased before being looked up in ``data_df``, so
    the frame's column names are expected to be lower-case already
    (presumably because of how the DB driver returns them -- confirm).

    One line per analysed column is appended to the file at the module-level
    ``cka_path``: ``tablename<TAB>idname<TAB>groupcol<TAB>column<TAB>count``.
    ``idname`` and ``groupcol`` are written exactly as passed in.

    Args:
        data_df: DataFrame holding the table's rows.
        tablename: Table name written to the output.
        idname: First key column (case-insensitive).
        groupcol: Second key column (case-insensitive).

    Returns:
        DataFrame with columns ``Table``, ``ID1``, ``ID2``, ``Unique Column``
        and ``Unique Count``, one row per analysed column.
    """
    ## Normalise both keys once so the skip test and the groupby agree;
    ## previously a mixed-case idname was not skipped and produced a spurious row.
    id_key = idname.lower()
    group_key = groupcol.lower()
    result_columns = ['Table','ID1','ID2','Unique Column','Unique Count']

    ## The grouping is the same for every column, so build it once
    grouped = data_df.groupby([id_key, group_key])

    rows = []
    for uniquecol in data_df.columns:
        if uniquecol in (id_key, group_key):
            continue
        count = grouped[uniquecol].nunique().max()
        rows.append({'Table':tablename,'ID1':idname,'ID2':groupcol,'Unique Column':uniquecol,'Unique Count':count})

    ## Single append instead of reopening the file for every column;
    ## nothing is written (and no file is created) when there are no rows.
    if rows:
        with open(cka_path, 'a') as f:
            f.writelines(
                tablename+'\t'+idname+'\t'+groupcol+'\t'+row['Unique Column']+'\t'+str(row['Unique Count'])+'\n'
                for row in rows
            )
    return pd.DataFrame(rows, columns=result_columns)


In [22]:
##LOAD PREVIOUS##
## Two tab-separated cache files:
##   nuc_path: tablename<TAB>list of columns that vary within the candidate ID
##   cka_path: tablename<TAB>ID1<TAB>ID2<TAB>column<TAB>max distinct count
nuc_path = DATABASE+'_CompositeKeyCandidates.csv'
cka_path = DATABASE+'_CompositeKeyAnalysis.csv'
nuc_names = ['tablename','non-unique columns']
## Same column names for the initial load and the reload (they previously differed)
cka_names = ['tablename', 'ID1','ID2','Unique Column','Unique Count']
if os.path.isfile(nuc_path):
    non_unique_cols_df = pd.read_csv(nuc_path, names=nuc_names, sep='\t')
else:
    ## First run: empty frame so the membership test below does not raise NameError
    non_unique_cols_df = pd.DataFrame(columns=nuc_names)
if os.path.isfile(cka_path):
    cpk_results_df = pd.read_csv(cka_path, header=None, names=cka_names, sep='\t')
else:
    cpk_results_df = pd.DataFrame(columns=cka_names)

already_analyzed = set(non_unique_cols_df['tablename'])
for tablename, idname in cpk_dict.items():
    idname = idname.lower()
    if tablename not in already_analyzed:
        ## Load THIS table's rows. Reusing the data_df left over from the
        ## uniqueness cell would analyse whichever table happened to be loaded last.
        data_df = pd.read_sql_query('Select * from '+tablename, cnxn)
        cols_to_analyze = cpk_fields(data_df, tablename, idname)
        for col in cols_to_analyze:
            result_df = analyze_cpks(data_df, tablename, idname, col)

##LOAD Updated##
## Each file exists only once something has been written to it; otherwise keep
## the frames from above rather than failing on a missing file.
if os.path.isfile(nuc_path):
    non_unique_cols_df = pd.read_csv(nuc_path, names=nuc_names, sep='\t')
if os.path.isfile(cka_path):
    cpk_results_df = pd.read_csv(cka_path, header=None, names=cka_names, sep='\t')

In [24]:
## One row per (table, ID1, ID2) candidate pair, carrying the largest
## 'Unique Count' seen for any other column under that pair.
double_candidates = (
    cpk_results_df
    .groupby(['tablename','ID1','ID2'])['Unique Count']
    .max()
    .reset_index()
)


In [None]:
## Keep the pairs under which no other column ever takes two or more distinct values
double_cpks = double_candidates.loc[double_candidates['Unique Count'].lt(2)]
## TODO: Pick one option for ID2 for each table
double_cpks

In [27]:
## For those that did not produce viable second IDs, analyze for 3 or more part composite keys
## Step 1: per (table, ID1, ID2) maximum, restricted to tables absent from double_cpks
multi_candidates = (
    cpk_results_df[~cpk_results_df['tablename'].isin(double_cpks['tablename'].values.tolist())]
    .groupby(['tablename','ID1','ID2'])['Unique Count']
    .max()
    .reset_index()
)
## Step 2: keep only the pairs that still leave some column varying
multi_candidates = multi_candidates.loc[multi_candidates['Unique Count'].gt(1)]
## Step 3: the individual columns that still vary under those pairs
multi_candidate_fields = cpk_results_df[
    (cpk_results_df['tablename'].isin(multi_candidates['tablename'].values.tolist()))
    & (cpk_results_df['ID2'].isin(multi_candidates['ID2'].values.tolist()))
    & (cpk_results_df['Unique Count']>1)
]



In [None]:
## TODO: select groups of IDs which appear to always vary together
## Rows shown are the columns whose 'Unique Count' is still above 1 for a candidate (ID1, ID2) pair
multi_candidate_fields

### Validation

In [None]:
## TODO: Input final choices for composite key
## Maps table name -> list of column names forming the proposed composite key;
## the column names are joined into the GROUP BY clause of the validation query.
val_cpk_dict = {
    'table1':['id1','id2']
    ,'table2':['id1','id2','id3']
}
## Validate uniqueness
def cpk_validate(tablename, col_list):
    """Check in the database that ``col_list`` uniquely identifies rows of ``tablename``.

    Runs a ``GROUP BY`` over the proposed key columns on the module-level
    ``cnxn`` connection and looks at the size of the largest group.  If any
    group holds more than one row, a warning is printed.

    ``tablename`` and the column names are concatenated into the SQL text as
    identifiers (they cannot be bound as query parameters), so they must come
    from a trusted source.

    Args:
        tablename: Table or view to query.
        col_list: Non-empty list of column names forming the proposed key.

    Returns:
        ``True`` when no key combination occurs more than once (including the
        case of an empty table), ``False`` otherwise.

    Raises:
        ValueError: If ``col_list`` is empty.
    """
    if not col_list:
        ## An empty list would otherwise produce an invalid 'GROUP BY ' clause
        raise ValueError('col_list must name at least one column for ' + str(tablename))
    col_string = ', '.join(col_list)
    query_string = 'SELECT max(a.objectCount) from (SELECT count(*) as objectCount FROM '+tablename+' GROUP BY '+col_string+' )a'
    count = pd.read_sql_query(query_string, cnxn).values[0][0]
    ## An empty table has no groups, so max() comes back as NULL (None/NaN);
    ## comparing that with 1 directly would raise TypeError for None.
    if pd.isna(count) or count <= 1:
        return True
    print('Please examine choice of keys for ', tablename)
    return False

## Run the uniqueness validation for every proposed composite key in val_cpk_dict
for tablename, comp_key_list in val_cpk_dict.items():    
    cpk_validate(tablename, comp_key_list)
