# Unique Identifier Analysis

In [None]:
import pypyodbc 
import pandas as pd
import textwrap
import warnings
import os

## Suppress every warning for the rest of the session (presumably to keep the
## notebook output uncluttered). NOTE(review): this also hides deprecation
## warnings raised by pandas or the database driver.
warnings.filterwarnings('ignore')


In [None]:
## Connection settings: replace the placeholders with the target server and database.
SERVER = 'SERVERNAME'
DATABASE = 'databasename'
## ODBC connection string: SQL Server Native Client 11.0 driver with TRUSTED_CONNECTION=yes.
connectionString = f'DRIVER={{SQL Server Native Client 11.0}};SERVER={SERVER};DATABASE={DATABASE};TRUSTED_CONNECTION=yes'
## Shared connection object used by the queries in the cells below.
cnxn = pypyodbc.connect(connectionString)
## NOTE(review): `type` shadows the Python builtin and is not referenced in the
## visible cells -- consider renaming it.
type = 'view' ## 'table' or 'view'
## Database-qualified prefix; not referenced in the visible cells -- presumably
## intended to be prepended to table names (TODO confirm).
qualification = DATABASE+'.dbo.'

## Output location for the result files. It is concatenated directly with
## DATABASE when the file names are built, so the value needs a trailing path
## separator for the files to land inside the folder.
save_path = 'path\\to\\where\\you\\want\\results\\saved'

In [None]:
def create_query_string(sql_file, encoding='utf-16'):
    """Read a SQL script from disk and return its text without common indentation.

    Parameters:
        sql_file: path of the SQL file to read.
        encoding: text encoding used to decode the file (default 'utf-16').

    Returns:
        The file contents with the leading whitespace shared by all lines removed.
    """
    with open(sql_file, 'r', encoding=encoding) as handle:
        raw_sql = handle.read()
    # strip the indentation that every line has in common
    return textwrap.dedent(raw_sql)

## Discover Primary Keys

Find columns which do not have duplicate values

In [None]:
## TODO: List every table/view to be assessed, mapped to its proposed unique-id column
tables_dict = {
    'tablename': 'uid',
    'tablename2': 'uid',
}


In [None]:
def uniqueness_check(uq_file, tablename, connection=None):
    """Load a whole table and report which of its columns contain no duplicates.

    NULL/NaN entries are counted as values: a column holding two or more NULLs
    is treated as duplicated. (Grouping on the column would silently drop the
    NULL rows and could report such a column as unique.)

    Parameters:
        uq_file: path of the CSV that accumulates results; one
            ``tablename,column`` line is appended per unique column. The file
            is only touched when at least one unique column is found.
        tablename: name of the table or view to load; it is concatenated into
            the SELECT statement as-is.
        connection: optional DB connection to query; when omitted, the
            notebook-level ``cnxn`` is used.

    Returns:
        A tuple ``(data_df, unique_cols)``: the loaded DataFrame and the list
        of column names without duplicate values. An empty table yields an
        empty list.
    """
    conn = cnxn if connection is None else connection
    data_df = pd.read_sql_query('Select * from '+tablename, conn)
    # An empty table cannot demonstrate uniqueness for any column.
    if data_df.empty:
        return data_df, []
    unique_cols = [column for column in data_df.columns if data_df[column].is_unique]
    if unique_cols:
        # Single append for the whole table instead of reopening per column.
        with open(uq_file, 'a') as f:
            f.writelines(tablename+','+column+'\n' for column in unique_cols)
    return data_df, unique_cols

In [None]:
##LOAD PREVIOUS##
## Result files from earlier runs are reused so finished tables are not re-queried
uq_file = save_path+ DATABASE+'_UniqueColumns.csv'
cpk_file = save_path+ DATABASE+'_CompositeKeyTables.txt'

unique_cols = (
    pd.read_csv(uq_file, names=['tablename','columnname'], sep=',')
    if os.path.isfile(uq_file)
    else pd.DataFrame(columns=['tablename','columnname'])
)

cpk_list = []
if os.path.isfile(cpk_file):
    with open(cpk_file, 'r') as f:
        cpk_list = f.read().splitlines()

## Update with new results
for tablename in tables_dict:
    ## Skip tables already recorded in either results file
    if tablename in unique_cols['tablename'].values or tablename in cpk_list:
        continue
    data_df, u_cols = uniqueness_check(uq_file, tablename)
    if u_cols:
        continue
    ## No single column is unique: queue the table for composite-key analysis
    cpk_list.append(tablename)
    with open(cpk_file, 'a') as f:
        f.write('\n'+tablename)


In [None]:

##LOAD UPDATED##
## The unique-columns file is written as comma-separated "tablename,column"
## lines, so it must be read back with sep=','. Reading it with a tab separator
## puts the whole line in 'tablename' and leaves 'columnname' empty.
## Either file may be absent when no table fell into that category, so fall
## back to empty results instead of raising.
if os.path.isfile(uq_file):
    unique_cols = pd.read_csv(uq_file, names=['tablename','columnname'], sep=',')
else:
    unique_cols = pd.DataFrame(columns=['tablename','columnname'])
if os.path.isfile(cpk_file):
    with open(cpk_file, 'r') as f:
        ## Entries are appended with a leading newline, so drop blank lines
        cpk_list = [line for line in f.read().splitlines() if line]
else:
    cpk_list = []

In [None]:
## TODO Pick one UID for each table with unique columns
## Display the (tablename, columnname) rows read from the unique-columns file for manual review
unique_cols

## Discover composite keys

Identify which tables have duplicate values in their candidate unique identifier (UID) field(s).

For those tables, find which other fields take more than one distinct value for the same UID.

For those fields, determine which ones vary together.

Validate the final composite key choice.

In [None]:
def ckc_fields(ckc_file, data_df, tablename, idname):
    """Return the columns that take more than one distinct value under a single UID.

    Rows are grouped by ``idname`` and the distinct values of every other
    column are counted per group; a column is reported when its largest
    per-group count exceeds one.

    Parameters:
        ckc_file: path of the results file; one line is appended per call,
            holding the table name, a tab, and the list of reported columns.
        data_df: DataFrame holding the table's rows.
        tablename: table name written to the results file.
        idname: name of the candidate UID column to group by.

    Returns:
        The list of column names with more than one distinct value for some UID.
    """
    distinct_per_uid = data_df.groupby(idname).nunique()
    worst_case = distinct_per_uid.max()
    varying = worst_case.loc[worst_case.gt(1)]
    non_unique_cols = varying.index.tolist()
    # NOTE: the column list is stored as its Python list representation
    with open(ckc_file, 'a') as f:
        f.write(f'{tablename}\t{non_unique_cols}\n')
    return non_unique_cols

def analyze_cpks(cka_file, data_df, tablename, idname, groupcol):
    """Measure how much every other column varies within each (idname, groupcol) pair.

    For each remaining column, rows are grouped by the two key columns and the
    largest number of distinct values found in any group is recorded. A count
    of 1 means the column never varies once both keys are fixed.

    Both key names are lower-cased once and used consistently for grouping and
    for skipping the key columns, so a mixed-case ``idname`` no longer produces
    a spurious row for the id column itself. This assumes ``data_df`` column
    names are lower-case (the grouping would raise ``KeyError`` otherwise).

    Parameters:
        cka_file: path of the CSV that accumulates results; one
            ``table,id1,id2,column,count`` line is appended per analysed
            column. The file is only touched when there is something to write.
        data_df: DataFrame holding the table's rows.
        tablename: table name written to the results.
        idname: name of the first key column (written as passed).
        groupcol: name of the second key column (written as passed).

    Returns:
        A DataFrame with columns 'Table', 'ID1', 'ID2', 'Unique Column' and
        'Unique Count', one row per analysed column.
    """
    result_columns = ['Table','ID1','ID2','Unique Column','Unique Count']
    id_key = idname.lower()
    group_key = groupcol.lower()
    # Build the grouping once; it is the same for every analysed column.
    grouped = data_df.groupby([id_key, group_key])

    rows = []
    lines = []
    for uniquecol in data_df.columns:
        if uniquecol in (id_key, group_key):
            continue
        count = grouped[uniquecol].nunique().max()
        lines.append(tablename+','+idname+','+groupcol+','+uniquecol+','+str(count)+'\n')
        rows.append({'Table':tablename,'ID1':idname,'ID2':groupcol,'Unique Column':uniquecol,'Unique Count':count})

    if lines:
        with open(cka_file, 'a') as f:
            f.writelines(lines)
    return pd.DataFrame(rows, columns=result_columns)


In [None]:
## Restrict the composite-key search to the tables that returned no viable
## unique column, keeping the suggested primary key for each
cpk_dict = {name: uid for name, uid in tables_dict.items() if name in cpk_list}

##LOAD PREVIOUS##
ckc_file = save_path+DATABASE+'_CompositeKeyCandidates.csv'
cka_file = save_path+DATABASE+'_CompositeKeyAnalysis.csv'

non_unique_cols_df = (
    pd.read_csv(ckc_file, names=['tablename','non-unique columns'], sep='\t')
    if os.path.isfile(ckc_file)
    else pd.DataFrame(columns=['tablename','non-unique columns'])
)
cpk_results_df = (
    pd.read_csv(cka_file, header=None, names= ['Table', 'ID1','ID2','Unique Column','Unique Count'], sep=',')
    if os.path.isfile(cka_file)
    else pd.DataFrame(columns=['Table', 'ID1','ID2','Unique Column','Unique Count'])
)

for tablename, idname in cpk_dict.items():
    ## presumably the loaded column names are lower-case -- TODO confirm
    idname = idname.lower()
    ## Tables already present in the candidates file are not analysed again
    if tablename in non_unique_cols_df['tablename'].values:
        continue
    print('Analyzing '+ tablename + ' for non-unique columns')
    data_df = pd.read_sql_query('Select * from '+tablename, cnxn)
    cols_to_analyze = ckc_fields(ckc_file, data_df, tablename, idname)
    for col in cols_to_analyze:
        result_df = analyze_cpks(cka_file, data_df, tablename, idname, col)


In [None]:

##LOAD Updated##
## Either results file may be absent when no table needed composite-key
## analysis, so fall back to empty frames (same handling as the load-previous
## step) instead of raising FileNotFoundError.
if os.path.isfile(ckc_file):
    non_unique_cols_df = pd.read_csv(ckc_file, names=['tablename','non-unique columns'], sep='\t')
else:
    non_unique_cols_df = pd.DataFrame(columns=['tablename','non-unique columns'])
if os.path.isfile(cka_file):
    cpk_results_df = pd.read_csv(cka_file, header=None, names=['Table', 'ID1','ID2','Unique Column','Unique Count'], sep=',')
else:
    cpk_results_df = pd.DataFrame(columns=['Table', 'ID1','ID2','Unique Column','Unique Count'])

In [None]:
## Display, per table, the columns recorded as varying within a single UID
non_unique_cols_df

In [None]:
## Display the per-column distinct-value counts for each (ID1, ID2) pairing
cpk_results_df

In [None]:
## Largest 'Unique Count' seen across the analysed columns for every (Table, ID1, ID2) pairing
double_candidates = cpk_results_df.groupby(['Table','ID1','ID2'])['Unique Count'].max().to_frame()


In [None]:
## Display the worst-case count for every candidate two-part key
double_candidates

In [None]:
## Keep the pairings whose worst-case count is below 2, i.e. no analysed
## column varies once both IDs are fixed
double_cpks = double_candidates.loc[double_candidates['Unique Count'].lt(2)]
## TODO: Pick an option for ID2 from each table
double_cpks

In [None]:
## For those that did not produce viable second IDs, analyze for 3 or more part composite keys
## Step 1: drop every table that already has a workable two-part key
_solved_tables = double_cpks.reset_index()['Table'].values.tolist()
_unsolved_rows = cpk_results_df[~cpk_results_df['Table'].isin(_solved_tables)]
## Step 2: worst-case count per remaining (Table, ID1, ID2) pairing, keeping those above 1
multi_candidates = _unsolved_rows.groupby(['Table','ID1','ID2'])['Unique Count'].max().to_frame().reset_index()
multi_candidates = multi_candidates[multi_candidates['Unique Count']>1]
## Step 3: the individual columns that still vary under those pairings
_in_candidate_table = cpk_results_df['Table'].isin(multi_candidates['Table'].values.tolist())
_in_candidate_id2 = cpk_results_df['ID2'].isin(multi_candidates['ID2'].values.tolist())
_still_varies = cpk_results_df['Unique Count']>1
multi_candidate_fields = cpk_results_df[(_in_candidate_table)&(_in_candidate_id2)&(_still_varies)]



In [None]:
## TODO: select one column from each group of columns which appear to always vary together to be ID2 (ID3, ID4...)
## Display the rows whose 'Unique Count' is above 1 for the remaining candidate pairings
multi_candidate_fields

## Validation

In [None]:
## TODO: Enter the final composite key chosen for each table (table name -> list of key columns)
val_cpk_dict = {
    'table1': ['id1', 'id2'],
    'table2': ['id1', 'id2', 'id3'],
}

## Validate uniqueness
def cpk_validate(tablename, col_list, connection=None):
    """Check in the database that a set of columns identifies rows uniquely.

    Runs a GROUP BY over ``col_list`` and looks at the largest group size. A
    message is printed when any combination of key values occurs more than once.

    Parameters:
        tablename: table or view to check; concatenated into the SQL as-is, so
            it must come from a trusted source.
        col_list: column names forming the proposed composite key. A single
            column may be given as a plain string.
        connection: optional DB connection to query; when omitted, the
            notebook-level ``cnxn`` is used.

    Returns:
        True when no key combination repeats (including an empty table),
        False otherwise.

    Raises:
        ValueError: if ``col_list`` is empty.
    """
    # A bare string would otherwise be joined character by character.
    if isinstance(col_list, str):
        col_list = [col_list]
    if not col_list:
        raise ValueError('col_list must name at least one column')
    conn = cnxn if connection is None else connection
    col_string = ', '.join(col_list)
    query_string = 'SELECT max(a.objectCount) from (SELECT count(*) as objectCount FROM '+tablename+' GROUP BY '+col_string+' )a'
    count = pd.read_sql_query(query_string, conn).values[0][0]
    # max() over zero groups is NULL; comparing that with 1 would raise TypeError.
    if pd.isna(count):
        return True
    if count > 1:
        print('Please examine choice of keys for ', tablename)
        return False
    return True

## Run the duplicate check for every table's chosen composite key
for tablename, comp_key_list in val_cpk_dict.items():    
    cpk_validate(tablename, comp_key_list)
