In [7]:
import pandas as pd
# Step 1: read the two input tables used by the access audit.

# Classified data catalog (each row carries a dataset name, a column name,
# a sample value and the data classification assigned to that column).
tagged_catalog = pd.read_csv('tagged_catalog.csv')

# Employee access grants (employee, role, dataset and access level).
employees = pd.read_csv('employees.csv')

# Sanity check: print the column names and the first rows of each table so
# the expected schema can be verified by eye before auditing.
for heading, frame in (
    ("Tagged catalog columns:", tagged_catalog),
    ("employees columns:", employees),
):
    print(heading, frame.columns.tolist())
    print(frame.head())

Tagged catalog columns: ['dataset_name', 'Column_name', 'sample_name', 'data_classification']
  dataset_name  Column_name            sample_name data_classification
0    customers  customer_id               CUST1000          identifier
1    customers         name           Allison Hill                 PII
2    customers        email  jillrhodes@miller.com                 PII
3    customers          dob             1960-05-08                 PII
4    customers      country              Macedonia             General
employees columns: ['employee_id', 'name', 'role', 'dataset_name', 'access_level']
  employee_id               name                role  dataset_name  \
0      EMP200  Michael Valentine       data_engineer     customers   
1      EMP201    Susan Davis DDS       data_engineer     customers   
2      EMP202      Alicia Parker       data_engineer  transactions   
3      EMP203     Gloria Miranda  compliance_officer     customers   
4      EMP204      Zachary Moore  compliance_of

In [21]:
# Step 2: classifications that require restricted access.
sensitive_categories = ['PII', 'Financial']

# Step 3: collect the datasets that contain at least one sensitive column.
# Build the row mask first, then pull the distinct dataset names from the
# matching rows.
is_sensitive_column = tagged_catalog['data_classification'].isin(sensitive_categories)
sensitive_datasets = tagged_catalog.loc[is_sensitive_column, 'dataset_name'].unique()
print('Sensitive datasets:', sensitive_datasets)

# Step 4: example role-based access rules for sensitive datasets.
# NOTE(review): audit_access lowercases the employee role before looking it
# up, so keep these entries lowercase or make sure the lookup is
# case-insensitive.

# Roles permitted to work with sensitive data.
allowed_roles = ['Manager', 'data_engineer', 'compliance_officer']

# Roles that must not access sensitive datasets (or only in a limited way).
restricted_roles = ['intern', 'analyst']



# Step 5: Audit employee access
def audit_access(row):
    """Classify one employee access record against the sensitive-data rules.

    Args:
        row: A mapping-like record (e.g. a DataFrame row) providing the keys
            'dataset_name', 'role' and 'access_level'; the latter two must
            be strings.

    Returns:
        'RISK - Unauthorized access to sensitive data' when a restricted
        role touches a sensitive dataset,
        'Warning -Limited access to sensitive data' when an allowed role has
        'read' access to a sensitive dataset, and 'OK' otherwise.

    Relies on the module-level ``sensitive_datasets``, ``allowed_roles`` and
    ``restricted_roles`` collections.
    """
    dataset = row['dataset_name']
    role = row['role'].lower()
    access = row['access_level'].lower()

    # Records for datasets without sensitive columns are never flagged.
    if dataset not in sensitive_datasets:
        return 'OK'

    # The role was lowercased above, so the configured role names must be
    # lowercased too; otherwise an entry such as 'Manager' can never match.
    restricted = {name.lower() for name in restricted_roles}
    allowed = {name.lower() for name in allowed_roles}

    if role in restricted:
        return 'RISK - Unauthorized access to sensitive data'
    if role in allowed and access == 'read':
        return 'Warning -Limited access to sensitive data'

    # NOTE(review): roles that appear in neither list fall through to 'OK'
    # even on sensitive datasets - confirm this is the intended policy.
    return 'OK'

# Run the audit on every employee record and store the verdict per row.
employees['access_audit'] = employees.apply(audit_access, axis=1)

# Keep only the records whose verdict is something other than 'OK'.
is_flagged = employees['access_audit'].ne('OK')
risk_report = employees.loc[is_flagged]

print(risk_report)

# Step 6: write the flagged records to disk (without the index column).
risk_report.to_csv('access_risk_report.csv', index=False)
print("Access risk report saved as access_risk_report.csv")

Sensitive datasets: ['customers' 'employees' 'reports' 'transactions']
   employee_id                  name                role  dataset_name  \
0       EMP200     Michael Valentine       data_engineer     customers   
1       EMP201       Susan Davis DDS       data_engineer     customers   
2       EMP202         Alicia Parker       data_engineer  transactions   
3       EMP203        Gloria Miranda  compliance_officer     customers   
9       EMP209              Lori Orr              intern  transactions   
10      EMP210           Amanda Diaz             analyst  transactions   
11      EMP211          Tracy Bishop             analyst       reports   
12      EMP212          Rachel Young  compliance_officer  transactions   
15      EMP215       Jessica Hammond              intern       reports   
16      EMP216        Kathryn Barnes  compliance_officer       reports   
17      EMP217         Curtis Jacobs  compliance_officer  transactions   
18      EMP218          Amy Castillo     

Sensitive datasets: ['customers' 'employees' 'reports' 'transactions']
