In [None]:
### import dependencies 

# data mgmt
import pandas as pd
from pandas import DataFrame
import numpy as np
import re

# dates
from datetime import date

# google sheets
import pygsheets

# SQL
from sqlalchemy import create_engine
import pymysql.cursors 
import cryptography
import os

In [None]:
### pull associate ids (THIS WILL BE REPLACED BY A QUERY TO THE AF DEV REDSHIFT)

# environ variables for db access
# NOTE(review): 'USER' and 'HOST' are names many shells already export for the
# login name / machine name - confirm these really carry the db credentials
db_user = os.environ.get('USER')
db_password = os.environ.get('PASS')
db_host = os.environ.get('HOST')
db_port = 25060
db = 'database'

# connect to ts database
connection = pymysql.connect(host=db_host,
                             user=db_user,
                             password=db_password,
                             port=db_port,
                             database=db,
                             charset='utf8mb4',
                             cursorclass=pymysql.cursors.DictCursor)

# the try/finally guarantees the connection is released even when the query fails
try:
    # create cursor (closed automatically when the with-block ends)
    with connection.cursor() as cursor:
        # select all associate ID data
        # NOTE(review): the table is spelled "medadata" - confirm this is not a typo for "metadata"
        sql = "SELECT * FROM database.medadata;"
        cursor.execute(sql)
        # DictCursor rows are dicts, so the keys become the DataFrame columns
        meta_df = pd.DataFrame(cursor.fetchall())
finally:
    # read-only query: nothing to commit, just close the connection
    connection.close()

In [None]:
### SET snapFile TO THE NAME OF THE FILE YOU WANT TO USE

# today's date, rendered in the filename through str(date) (ISO format, YYYY-MM-DD)
today = date.today()

# build the snapshot filename from today's date
# NOTE(review): the name built here has no underscore before the date, while the
# note below speaks of file_list_[date].csv - confirm which one the other notebook writes
snapFile = f'file_list{today}.csv'
# IF YOU ARE RUNNING gdrive_metadata_request.ipynb WITH A file_list_[date].csv FROM A
# DATE OTHER THAN today(), YOU WILL NEED TO MANUALLY EDIT THE VALUE OF snapFile
# this step is required to ensure that you are working with the right data set

In [None]:
### read snapshot file meta-data from csv

# this file is created using the gdrive_metadata_request.ipynb script
# it includes a number of manual steps to identify sheets for import
# and to attach employee IDs - open to a more automated approach but
# did not develop one given the inconsistencies in the data

# read snapFile
# a missing file must stop the notebook here: every later cell needs `df`, and
# carrying on would either fail with a NameError or reuse a stale `df` left in the kernel
try:
    df = pd.read_csv(snapFile)
except FileNotFoundError as error:
    print(f'check your snapFile \nFileNotFoundError: {error}')
    raise

# df.shape # check your work

In [None]:
### select only the files we want to read

# use the 'USE' column to create a boolean mask of files we want (rows flagged with 1)
dfmask = df['USE'].isin([1])

# keep the flagged rows and renumber them 0..n-1 so that createDf can iterate
# through them; drop=True discards the old index instead of materialising it as
# an 'index' column (which fails when the csv already has a column with that name)
snapDf = df[dfmask].reset_index(drop=True)

# snapDf.head(2) # check your work

In [None]:
### function to pull data from google sheets specified in snapDf

# in order to work, the service account referenced in service_file
# will need to be permissioned to each google sheet listed in snapDf
def pullData(fileName,sheet,headers,values):
    """Read one worksheet range from Google Sheets into a DataFrame.

    fileName -- spreadsheet name handed to pygsheets `open`
    sheet    -- worksheet title to look up inside that spreadsheet
    headers  -- range holding the column headers
    values   -- range holding the data rows
                (presumably both are A1-style range strings - confirm against snapFile)

    Returns a DataFrame built from the `values` range, with the `headers`
    range passed as its columns. Authorizes with 'pygsheets.json' on every call.
    """
    # authorization
    client = pygsheets.authorize(service_file='pygsheets.json')

    # open the spreadsheet, then pick the worksheet by its title
    worksheet = client.open(fileName).worksheet('title', sheet)

    # fetch the data rows first, then the header row(s), both in matrix form
    body_rows = worksheet.range(values, returnas='matrix')
    header_rows = worksheet.range(headers, returnas='matrix')

    # pull data into df
    return DataFrame(body_rows, columns=header_rows)

In [None]:
### function to standardize the size/ shape of data

# generally, the scorecard data should be in columns A:E with
# headers in row 4. however for older files, and possibly for
# new file formats, this will not be the case
def shapeData(data):
    """Return `data` in the five-column scorecard layout when it has three columns.

    A three-column frame is read as (module, score, source_dep); it gains a
    'competency' column copied from 'source_dep' and an empty-string 'evidence'
    column, and is returned ordered as
    module, competency, score, source_dep, evidence.

    Any other width (including the expected five columns) is returned
    unchanged - the very same object, not a copy.

    The input frame is never modified.
    """
    if data.shape[1] != 3:
        return data

    # relabel a copy so the caller's frame keeps its original column labels
    reshaped = data.copy()
    reshaped.columns = ['module','score','source_dep']
    reshaped = reshaped.assign(competency=reshaped['source_dep'], evidence="")
    return reshaped[['module','competency','score','source_dep','evidence']]

In [None]:
### function to reformat snapshot data

def cleanData(dataVal,evalDate,candidateName,assocId,mileStone):#,asc_ids):
    """Turn one five-column scorecard frame into one row per scored competency.

    dataVal       -- frame with exactly five columns, read positionally as
                     module, competency, score, source, evidence
    evalDate      -- passed to pd.to_datetime and stored in 'eval_date'
    candidateName -- stored in 'name'
    assocId       -- stored in 'employee_id'
    mileStone     -- stored in 'milestone_id'

    Rows whose score is the empty string are dropped. A module or source label
    is written only on the first row of its group in the sheet, so each
    non-blank label is carried down to the blank rows that follow it.
    Rows that come before the first non-blank module label find no match in
    the module lookup and end up with NaN module, name, employee_id and
    milestone_id; likewise rows before the first non-blank source get NaN source.

    'evaluator' and 'eval_employee_id' are filled with placeholder strings and
    'source_id' with the constant 'S00-CS-0007'.

    The input frame is left untouched.
    """
    # relabel a copy so the caller's frame keeps its own column labels
    sheet = dataVal.copy()
    sheet.columns = ['module_dep','competency','score','source_dep','evidence']#,'metric_id']

    # clear rows for which there is no 'score'
    scored = sheet[~sheet['score'].isin([""])].copy()

    # metadata for spreading 'module' across all relevant rows:
    # flag the rows that start a module, then number the groups with a running sum
    scored['mod_name'] = scored['module_dep'].apply(lambda x: 0 if x == "" else 1)
    scored['join'] = scored['mod_name'].cumsum()

    # temp table mapping each group number to its 'module' plus the candidate fields
    module_map = scored.loc[scored['mod_name'].isin([1]), ['join','module_dep']].copy()
    module_map.columns = ['index','module']
    module_map = module_map.assign(name=candidateName,
                                   employee_id=assocId,
                                   milestone_id=mileStone)

    # merge the scored rows with the temp table to add 'module', 'name', 'employee_id' and 'milestone_id'
    labelled = scored.merge(module_map, how='left', left_on='join', right_on='index')
    labelled = labelled[['employee_id','name','module','competency','score','source_dep','evidence','milestone_id']].copy()#'metric_id',

    # metadata for spreading 'source' across all relevant rows (same flag + running-sum trick)
    labelled['source_name'] = labelled['source_dep'].apply(lambda x: 0 if x == "" else 1)
    labelled['join'] = labelled['source_name'].cumsum()

    # temp table to map 'source' names to join
    source_map = labelled.loc[labelled['source_name'].isin([1]), ['join','source_dep']].copy()
    source_map.columns = ['index','source']

    # merge in the source names, then add dummy columns for evaluator data (we may or may not be able to include these stats)
    result = labelled.merge(source_map, how='left', left_on='join', right_on='index')
    result = result.assign(eval_date=pd.to_datetime(evalDate),
                           evaluator='evaluator',
                           eval_employee_id='eval_employee_id',
                           source_id='S00-CS-0007')
    return result[['eval_date','employee_id','name','module','competency','score','source','evidence','evaluator','eval_employee_id','milestone_id','source_id']]

In [None]:
###function to combine all dataframes

def createDf(snapDf):
    """Pull, reshape and clean every sheet listed in snapDf and stack the results.

    snapDf is read by column position: 1 file name, 4 sheet title, 5 evaluation
    date, 7 milestone, 9 candidate name, 10 associate id, 12 header range,
    13 value range. Rows are visited by position, so the result does not depend
    on how snapDf happens to be indexed.

    Each row is passed through pullData -> shapeData -> cleanData and the
    per-sheet frames are concatenated once at the end; every frame keeps its
    own row index. An empty snapDf yields an empty DataFrame.
    """
    frames = []
    for item_count in range(len(snapDf)):
        # file specific variables for pullData and cleanData
        fileName = snapDf.iloc[item_count, 1]
        sheet = snapDf.iloc[item_count, 4]
        evalDate = snapDf.iloc[item_count, 5]
        mileStone = snapDf.iloc[item_count, 7]
        candidateName = snapDf.iloc[item_count, 9]
        assocId = snapDf.iloc[item_count, 10]
        headers = snapDf.iloc[item_count, 12]
        values = snapDf.iloc[item_count, 13]

        # print metadata for diagnosing errors
        print(item_count, fileName)
        print(sheet, assocId, headers, values)

        # run functions
        data = pullData(fileName,sheet,headers,values)
        dataVal = shapeData(data)
        frames.append(cleanData(dataVal,evalDate,candidateName,assocId,mileStone))

    # pd.concat refuses an empty list, so hand back an empty frame directly
    if not frames:
        return pd.DataFrame()

    # one concat at the end instead of re-copying the growing frame on every pass
    return pd.concat(frames)

# build the combined score table, then cast to object and blank out NaN values
scoreDf = createDf(snapDf).astype(object).replace(np.nan, '')

# scoreDf.head() # check your work

In [None]:
### clean and standardize data for 'metric_id' join

# work on a copy so that scoreDf itself is not rewritten by this cell
scoreDf2 = scoreDf.copy()

# collapse any module label that contains one of the canonical names down to
# that name; the names are applied in this order, so the first one found wins
for module_name in ("module1", "module2", "module3", "module4"):
    scoreDf2['module'] = scoreDf2['module'].apply(
        lambda x, name=module_name: name if name in str(x) else x)

In [None]:
### pull in metric ids from external source

# bring in metric_id mapping to strings in candidate snapshots
dfMetId = pd.read_csv('metric_id_to_string_map.csv')

# merge metric ids into snapshot data (left join: rows without a mapping keep a NaN metric_id)
mergeDf = scoreDf2.merge(dfMetId[['module','competency','metric_id']],how='left', on=['module','competency'])

# reorder columns and drop unwanted; .copy() so the assignment below writes to
# an independent frame rather than to a slice of mergeDf
mergeDf2 = mergeDf[['eval_date', 'employee_id', 'name', 'module',
                'competency', 'score', 'source', 'evidence', 'evaluator',
                'eval_employee_id','metric_id', 'milestone_id', 'source_id']].copy()

# fix milestone_ids and clean NaN values for DB
# ids whose text contains a "0" become "0" + their first character, all others
# get a "0" prefix; str(x) in the else branch keeps numeric ids from raising TypeError
# NOTE(review): presumably this targets floats such as 1.0 -> "01"; note that
# "01" becomes "00" and 10 becomes "01" under this rule - confirm that is intended
mergeDf2['milestone_id'] = mergeDf2['milestone_id'].apply(lambda x: "0" + str(x)[:1] if "0" in str(x) else "0" + str(x))
mergeDf3 = mergeDf2.astype(object).replace(np.nan, '')

### check your work
# mergeDf3.head()
# snapCheck = 'snapshot_data_' + str(today) + '.csv'
# mergeDf3.to_csv(snapCheck)

In [None]:
### import values to MySQL db - to be replaced with AF Redshift

# connect to ts database
connection = pymysql.connect(host=db_host,
                             user=db_user,
                             password=db_password,
                             port=db_port,
                             database=db,
                             charset='utf8mb4',
                             cursorclass=pymysql.cursors.DictCursor)

# the try/finally guarantees the connection is released even when an insert fails
try:
    # create column list for insertion
    cols = "`,`".join([str(i) for i in mergeDf3.columns.tolist()])

    # the statement is the same for every row, so build it once;
    # values are bound as parameters, one %s per column
    sql = "INSERT INTO `interview_data` (`" +cols + "`) VALUES (" + "%s,"*(len(mergeDf3.columns)-1) + "%s)"

    # one tuple of values per DataFrame record
    rows = [tuple(row) for _, row in mergeDf3.iterrows()]

    # create cursor and insert all records
    with connection.cursor() as cursor:
        cursor.executemany(sql, rows)

    # connection is not autocommit by default; committing once, after every row
    # went through, avoids leaving a half-loaded table behind when a row fails
    connection.commit()
finally:
    connection.close()