## Import libraries and get username/password

In [None]:
import mysql.connector
from mysql.connector import errorcode
import pandas as pd
import difflib
from difflib import SequenceMatcher as SM

#Reads in your input for user name and password
#getpass is used for the password so that it is not echoed back
#while typing and does not end up in the saved cell output
from getpass import getpass

myuser = input("User Name: ")
mypassw = getpass("Password: ")

## Connect to the database

In [None]:
#Attempt to open the connection to the local MySQL server
#NOTE: when the connection fails, cnx is not assigned by this cell,
#so later cells that use cnx will not work until this one succeeds
try:
    cnx = mysql.connector.connect(
        user = myuser,
        password = mypassw,
        host = '127.0.0.1',
        database = 'datacleandb',
    )
    print("connected")
except mysql.connector.Error as err:
    #Known error codes get a friendlier message, anything else is printed as-is
    friendly_messages = {
        errorcode.ER_ACCESS_DENIED_ERROR: "Something is wrong with your user name or password",
        errorcode.ER_BAD_DB_ERROR: "Database does not exist",
    }
    print(friendly_messages.get(err.errno, err))

## Create lookup and mistake tables

In [None]:
#Open the cursor used by the setup cells
cursor = cnx.cursor()

#Lookup table: one row per valid vendor name
lookup_ddl = """CREATE TABLE IF NOT EXISTS mydata_lookup(
                Valid_Sub_Vendor VARCHAR(255) PRIMARY KEY,
                is_Supplier_Name BOOL)"""

#Mistake table: known misspellings, each pointing at a row of the lookup table
mistake_ddl = """CREATE TABLE IF NOT EXISTS mydata_lookup_mistakes(
                Correct_Vendor VARCHAR(255),
                Mistake_Vendor VARCHAR(255),
                FOREIGN KEY(Correct_Vendor) REFERENCES mydata_lookup(Valid_Sub_Vendor),
                PRIMARY KEY(Correct_Vendor, Mistake_Vendor))"""

#The lookup table has to be created first because the mistake table references it
for sqlcommand in (lookup_ddl, mistake_ddl):
    cursor.execute(sqlcommand, multi = False)
cnx.commit()

## TEMP: add values to Lookup and Mistake tables

In [None]:
#---SAFE TO RE-RUN: an already existing row is left untouched---
#TEMP ADDING TO THE LOOKUP TABLE
#The no-op ON DUPLICATE KEY UPDATE makes a repeated insert of the same
#primary key do nothing instead of failing with a duplicate-key error.
#Columns are named explicitly so the statement does not depend on column order.
sqlcommand = """INSERT INTO mydata_lookup (Valid_Sub_Vendor, is_Supplier_Name)
                VALUES ('Stars Prime', FALSE)
                ON DUPLICATE KEY UPDATE Valid_Sub_Vendor = Valid_Sub_Vendor"""
cursor.execute(sqlcommand, multi = False)
cnx.commit()

In [None]:
#---SAFE TO RE-RUN: an already existing row is left untouched---
#TEMP ADDING TO THE MISTAKE TABLE
#The no-op ON DUPLICATE KEY UPDATE makes a repeated insert of the same
#(Correct_Vendor, Mistake_Vendor) pair do nothing instead of failing.
#NOTE(review): Correct_Vendor is a foreign key into mydata_lookup, so
#'Claridge' must already exist there; the lookup insert in this notebook
#only adds 'Stars Prime' - confirm 'Claridge' is populated elsewhere.
sqlcommand = """INSERT INTO mydata_lookup_mistakes (Correct_Vendor, Mistake_Vendor)
                VALUES ('Claridge', 'Claridges')
                ON DUPLICATE KEY UPDATE Mistake_Vendor = Mistake_Vendor"""
cursor.execute(sqlcommand, multi = False)
cnx.commit()

# Start of Program Code

## import lookup and mistake from MySQL

In [None]:
#Function to import the lookup table
def importLookupTable(connection):
    """Load the whole mydata_lookup table into a DataFrame.

    connection: open database connection handed to pandas.read_sql.

    Returns the table contents. If the query fails, the error is printed
    and an empty DataFrame with the columns 'Valid_Sub_Vendor' and
    'is_Supplier_Name' is returned instead, so callers can carry on.
    """
    sqlcommand = 'SELECT * FROM mydata_lookup'
    try:
        return pd.read_sql(sqlcommand, con = connection)
    except Exception as err:
        #An empty table does not raise; getting here means the read itself
        #failed, so report the real reason instead of guessing
        print("mydata_lookup table could not be read:", err)
        return pd.DataFrame(columns = ['Valid_Sub_Vendor','is_Supplier_Name'])

#Function to import the mistake table
def importMistakeTable(connection):
    """Load the whole mydata_lookup_mistakes table into a DataFrame.

    connection: open database connection handed to pandas.read_sql.

    Returns the table contents. If the query fails, the error is printed
    and an empty DataFrame with the columns 'Correct_Vendor' and
    'Mistake_Vendor' is returned instead, so callers can carry on.
    """
    sqlcommand = 'SELECT * FROM mydata_lookup_mistakes'
    try:
        return pd.read_sql(sqlcommand, con = connection)
    except Exception as err:
        #An empty table does not raise; getting here means the read itself
        #failed, so report the real reason instead of guessing
        print("mydata_lookup_mistakes table could not be read:", err)
        return pd.DataFrame(columns = ['Correct_Vendor','Mistake_Vendor'])

## Finds % similarity in lookup

In [None]:
#Name matching in lookup table
def nameMatchingLookup(val, lookup_df):
    """Score val against every 'Valid_Sub_Vendor' in lookup_df.

    val:       the string to compare (compared case-insensitively).
    lookup_df: DataFrame with a 'Valid_Sub_Vendor' column.

    Returns a DataFrame with the columns 'Ratio' (SequenceMatcher ratio
    rounded to 3 decimals), 'LookupVal' (the vendor name as stored) and
    'MistakeVal' (always None here), indexed like lookup_df and sorted by
    'Ratio' in descending order.
    """
    #Makes the value to compare lowercase
    val = val.lower()

    #Collect all rows first and build the frame once; growing a DataFrame
    #one .loc row at a time copies the data on every insertion
    rows = [
        [round(SM( a = val, b = vendor.lower() ).ratio(), 3), vendor, None]
        for vendor in lookup_df['Valid_Sub_Vendor']
    ]
    ratio_df = pd.DataFrame(rows,
                            columns = ['Ratio','LookupVal','MistakeVal'],
                            index = lookup_df.index)

    #Returns the ratio dataframe sorted
    return ratio_df.sort_values(by = ['Ratio'], ascending = False)

## Finds % similarity in mistake

In [None]:
#Name matching in mistake table
def nameMatchingMistake(val, mistake_df):
    """Score val against every 'Mistake_Vendor' in mistake_df.

    val:        the string to compare (compared case-insensitively).
    mistake_df: DataFrame with 'Correct_Vendor' and 'Mistake_Vendor' columns.

    Returns a DataFrame with the columns 'Ratio' (SequenceMatcher ratio
    against the mistake spelling, rounded to 3 decimals), 'LookupVal'
    (the row's 'Correct_Vendor') and 'MistakeVal' (the row's
    'Mistake_Vendor'), indexed like mistake_df and sorted by 'Ratio' in
    descending order.
    """
    #Makes the value to compare lowercase
    val = val.lower()

    #Collect all rows first and build the frame once; growing a DataFrame
    #one .loc row at a time copies the data on every insertion
    rows = [
        [round(SM( a = val, b = wrong.lower() ).ratio(), 3), correct, wrong]
        for correct, wrong in zip(mistake_df['Correct_Vendor'], mistake_df['Mistake_Vendor'])
    ]
    ratio_df = pd.DataFrame(rows,
                            columns = ['Ratio','LookupVal','MistakeVal'],
                            index = mistake_df.index)

    #Returns the ratio dataframe sorted
    return ratio_df.sort_values(by = ['Ratio'], ascending = False)

## Merges lookup and mistake similarity ratios

In [None]:
#Gets all the ratio's, combines and sorts
def nameMatching(rowVal, lookup_df, mistake_df):
    """Combine the lookup and mistake similarity scores for rowVal.

    rowVal:     the string being cleaned.
    lookup_df:  lookup table, passed on to nameMatchingLookup.
    mistake_df: mistake table, passed on to nameMatchingMistake.

    Returns a DataFrame ('Ratio', 'LookupVal', 'MistakeVal') holding, for
    each distinct 'LookupVal', the single row with the highest 'Ratio',
    sorted by 'Ratio' descending with a fresh 0..n-1 index. An empty
    frame is returned when there is nothing to compare against.
    """
    #Compare the value to the ones in the tables
    ratio_l_df = nameMatchingLookup(rowVal, lookup_df)
    ratio_m_df = nameMatchingMistake(rowVal, mistake_df)

    #Merge the two tables together
    ratio_df = pd.concat([ratio_l_df, ratio_m_df], ignore_index = True)
    if ratio_df.empty:
        #Both tables were empty: nothing to group or rank
        return ratio_df.reset_index(drop = True)

    #An empty input frame has object-typed columns; make sure Ratio is
    #numeric after the concat so that idxmax below can work on it
    ratio_df['Ratio'] = ratio_df['Ratio'].astype(float)

    #Remove 'duplicates': keep the best scoring row per LookupVal
    #(ignore_index above already gives the 0..n-1 labels idxmax returns)
    best_rows = ratio_df.groupby(['LookupVal'])['Ratio'].idxmax()
    ratio_df = ratio_df.loc[best_rows].sort_values(by = ['Ratio'], ascending = False)
    return ratio_df.reset_index(drop = True)

##### Checks to see if a value is a number

In [None]:
#Checks if it is a number
def isNumber(s):
    """Return True when int(s) succeeds, False when it raises ValueError."""
    try:
        int(s)
    except ValueError:
        #Not parseable as a whole number
        return False
    else:
        return True

## Makes the User Interface

In [None]:
#Fake GUI - NOTE: customValue() also uses this. BEWARE OF CHANGES 
def fakeGUI(ratio, name, show = 10, ask = None):
    """Page through the candidate rows and return what the user picked.

    ratio: DataFrame of candidates; rows are shown and chosen by position.
    name:  the value currently being cleaned (only printed).
    show:  number of rows displayed per page (default 10).
    ask:   callable that takes the prompt text and returns the typed
           answer; defaults to the built-in input().

    Returns the typed text unchanged (a string; callers convert it with
    int()) when it is the number of a row on the current page, or the int
    -2 when the user typed 'c' to enter a custom value. Any other
    non-numeric answer shows the next page, wrapping around to the first
    page after the last one. A number that is not on the current page is
    asked again.
    """
    if show < 1:
        raise ValueError("show must be at least 1")
    if ask is None:
        ask = input

    topindex = len(ratio) - 1
    x = 0
    print("++++c: custom, <num>: the row, else: see next row++++")
    print("Current: ", name)
    while True:
        #Makes sure to bound our index
        top = min(x + show - 1, topindex)

        #Print the current rows in the result
        print(ratio.iloc[x:top + 1])

        #Loops until we get a valid user entry
        choice = None
        while choice is None:
            val = ask("Which (if any) is correct: ")
            try:
                picked = int(val)
            except ValueError:
                #'c' asks for a custom value (-2), anything else pages on (-1)
                choice = -2 if val == 'c' else -1
            else:
                #Only accept a row that is currently displayed
                if x <= picked <= top:
                    choice = val

        #Anything other than -1 is a final answer
        if choice != -1:
            return choice

        #Continue with the row right after the last one shown
        #(previously top + show, which skipped show - 1 rows per page)
        x = top + 1
        if x > topindex:
            #Past the end: start over from the first page
            x = 0

## Adds mistake to local and MySQL

In [None]:
#Insert the new mistake into the mistake table
def addMistake(mistake, actual, df):
    """Return a copy of the mistake table with one more mapping appended.

    mistake: the misspelled vendor name ('Mistake_Vendor').
    actual:  the correct vendor name it maps to ('Correct_Vendor').
    df:      the local mistake table; it is not modified.

    The returned DataFrame has a fresh 0..n-1 index.
    """
    #Add the mistake to the server
    print("TODO: add mistake to server")

    #Adds the mistake locally then returns it
    #(pd.concat replaces DataFrame.append, which pandas 2.0 removed)
    new_row = pd.DataFrame([{'Correct_Vendor' : actual, 'Mistake_Vendor' : mistake}])
    return pd.concat([df, new_row], ignore_index = True)

## Adds lookup value to local and MySQL

In [None]:
#Insert the new value into the lookup table
def addLookup(val, df, isSupplier):
    """Return a copy of the lookup table with one more vendor appended.

    val:        the vendor name to add ('Valid_Sub_Vendor').
    df:         the local lookup table; it is not modified.
    isSupplier: truthy to store 1 in 'is_Supplier_Name', otherwise 0.

    The returned DataFrame has a fresh 0..n-1 index.
    """
    #Add the lookup value to the server
    print("TODO: add lookup value to server")

    sup = 1 if isSupplier else 0

    #Adds the lookup locally then returns it
    #(pd.concat replaces DataFrame.append, which pandas 2.0 removed)
    new_row = pd.DataFrame([{'Valid_Sub_Vendor' : val, 'is_Supplier_Name' : sup}])
    return pd.concat([df, new_row], ignore_index = True)

## Run when a user wants to use a custom value

In [None]:
def customValue(l_df, m_df, name):
    """Ask for a custom vendor name and offer existing matches for it.

    l_df: lookup table, m_df: mistake table (both passed to nameMatching).
    name: the original value being cleaned.

    Returns a 3-item list [type, lookupVal, mistakeVal] where type is
      0: use the typed custom value            -> [0, custom, name]
      1: use an existing value, ratio is not 1 -> [1, existing, name]
      2: use an existing value, ratio is 1     -> [2, existing, None]
    """
    #Keep asking until the user confirms the typed value with 'y'
    confirmed = False
    while not confirmed:
        cust = input("Custom Sub_Vendor: ")
        confirmed = input("Keep value (y/n): ") == 'y'

    #Offer the existing entries that resemble the typed value
    matches = nameMatching(cust, l_df, m_df)

    print("-----------------------------------------------------")
    #GUI: let the user pick an existing row or insist on the custom value
    picked = int(fakeGUI(matches, cust))

    if picked == -2:
        #Use the custom typed one
        return [0, cust, name]

    #Use the one already there
    if matches.iloc[picked,0] != 1:
        #not a previously made mistake: caller inserts it into the mistake table
        return [1, matches.iloc[picked,1], name]
    return [2, matches.iloc[picked,1], None]

# Main Program Loop

In [None]:
print("TODO: populate lookup and mistake tables from old operations")

#Import the lookup and mistake tables
lookup_df = importLookupTable(cnx)
mistake_df = importMistakeTable(cnx)

print("TODO: make the NULL Sub_Vendor rows auto complete")

print("TODO: loop through the rows in MySQL by 100 at a time")
#Get the non-null and non-completed rows
sqlcommand = """SELECT Sub_Vendor 
                FROM mydata 
                WHERE Audit_Dashboard_Vendor IS NULL AND Sub_Vendor IS NOT NULL
                LIMIT 10"""
mydata_df = pd.read_sql(sqlcommand, con = cnx)

#Loop through the rows to clean (non NULL)
for index, row in mydata_df.iterrows():
    #Gets all the ratios
    ratio_df = nameMatching(row['Sub_Vendor'], lookup_df, mistake_df)
    
    #GUI: Do something with the information you have received
    result = int(fakeGUI(ratio_df, row['Sub_Vendor']))
    
    if result >= 0: #User chose a row
        if ratio_df.iloc[result,0] != 1:
            #not a previously made mistake: insert into local and mysql mistake table
            mistake_df = addMistake(row['Sub_Vendor'], ratio_df.iloc[result,1], mistake_df)
    elif result == -2: #User chose to make custom result
        #choice format ["type", "lookupVal", "mistakeVal"]
        choice = customValue(lookup_df, mistake_df, row['Sub_Vendor'])

        #0: using custom, 1: using prev & update mistake table, 2: using prev
        if choice[0] == 0: #Use the custom typed one
            #Brand new vendor name: add it to the lookup table
            lookup_df = addLookup(choice[1], lookup_df, False)
            #Add it to the mistake table
            mistake_df = addMistake(row['Sub_Vendor'], choice[1], mistake_df)
        elif choice[0] == 1: #Use one already in lookup
            #The vendor was picked from the existing matches, so it must not
            #be added to the lookup table a second time; only record the mistake
            mistake_df = addMistake(row['Sub_Vendor'], choice[1], mistake_df)
        #choice[0] == 2: existing vendor, nothing to add to either table

    #Insert the change into mydata.Audit_Dashboard_Vendor
    print("TODO: insert change into mydata")
#END LOOP

#END OF THE ACTUAL PROGRAM-----------------------------------------------------------

#### Close connections when you are done

In [None]:
#Release the database resources: the cursor first, then the connection
for resource in (cursor, cnx):
    resource.close()

In [None]:
#Shows the local mistake table, including any rows added by the main loop
print(mistake_df)
