# Supervised model for Audit Scoring

In [43]:
#%load_ext autoreload

In [2]:
# Mode 2: re-import modules automatically before each cell is executed.
# These magics require the autoreload extension to be loaded first.
%autoreload 2
# Import `audit_functions` and register it with autoreload.
# NOTE(review): no cell visible in this notebook references audit_functions - confirm it is still needed.
%aimport audit_functions

In [102]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from dstools.mlutils.corp_tax_audit_unsupervised import add_abs_diffs
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import balanced_accuracy_score, confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import precision_score, recall_score

In [2]:
# Restore the two DataFrames persisted with %store by another notebook/session.
# NOTE(review): this is a hidden dependency - the notebook only runs if that earlier
# %store step has been executed on this machine; confirm where these frames are produced.
%store -r q_industries
%store -r q_audit_data_combined

In [3]:
# Preview the first rows of the restored industries frame.
# NOTE(review): the rendered output includes business ids and legal names; consider
# clearing or redacting it before sharing the notebook.
q_industries.head()

Unnamed: 0_level_0,index,naics2,business_id,bus_loc_id,naics_code,business_legal_name,maxNumLoc,tax_period_cd,obl_type_id,sumsum_gross,...,4D_eff_tax%_perc_glob,4D_deduc2income_perc_glob,4D_eff_tax_perc_ind,4D_deduc2income_perc_ind,k-cluster_ind,k-cluster_g,sumsum_gross_perc_glob,sumsum_gross_perc_ind,is_ccluster_ind,is_ccluster_g
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2001Q1,0,22,109874,110874,221122,ISLAND UTILITY COMPANY,1,Q,10,16193.58,...,0.500045,0.5,0.545455,0.545455,0.0,1,0.085914,0.090909,1,1
2001Q2,1,22,109874,110874,221122,ISLAND UTILITY COMPANY,1,Q,10,158968.92,...,0.500184,0.499954,0.545455,0.545455,0.0,1,0.586863,0.636364,1,1
2001Q3,2,22,109874,110874,221122,ISLAND UTILITY COMPANY,1,Q,10,95335.05,...,0.500093,0.500047,0.545455,0.545455,0.0,1,0.467638,0.545455,1,1
2001Q4,3,22,109874,110874,221122,ISLAND UTILITY COMPANY,1,Q,10,36138.27,...,0.499573,0.50019,0.545455,0.545455,1.0,2,0.241137,0.181818,0,0
2002Q1,4,22,109874,110874,221122,ISLAND UTILITY COMPANY,1,Q,10,7334.92,...,0.80425,0.498629,0.545455,0.590909,0.0,1,0.0236,0.090909,1,1


In [8]:
# List the available columns; the feature and label columns used below are picked from this set.
q_audit_data_combined.columns

Index(['date', 'original_index', 'business_id', 'assessment_amount', 'naics_2',
       'entity_name', 'first_period', 'last_period', 'change', 'change+',
       'change-', 'nquarters', 'index', 'naics2', 'bus_loc_id', 'naics_code',
       'business_legal_name', 'maxNumLoc', 'tax_period_cd', 'obl_type_id',
       'sumsum_gross', 'sumsum_deduc', 'sumsum_taxable', 'sumsum_paid',
       'eff_tax_rate%', 'deduc_to_income_ratio', 'eff_tax%_perc_glob',
       'eff_tax%_perc_ind', 'deduc2income_perc_glob', 'deduc2income_perc_ind',
       '4Delta_abs_eff_tax_rate%', '4Delta_abs_deduc_to_income_ratio',
       '4D_eff_tax%_perc_glob', '4D_deduc2income_perc_glob',
       '4D_eff_tax_perc_ind', '4D_deduc2income_perc_ind', 'k-cluster_ind',
       'k-cluster_g', 'sumsum_gross_perc_glob', 'sumsum_gross_perc_ind',
       'ischange+_outside_ccluster', 'ischange+OR-_outside_ccluster',
       'is_ccluster_ind', 'is_ccluster_g', 'ischange+_outside_ccluster_ind',
       'ischange+OR-_outside_ccluster_ind'],

# Naive Positive Change

In [53]:
# (rows, columns) of the combined audit frame that every model below is trained on.
q_audit_data_combined.shape

(6989, 46)

In [9]:
# Feature columns for the naive models: number of locations plus the
# `*_perc_glob` / `*_perc_ind` columns. 'date' is carried along only so that the
# quarter can be derived from it afterwards.
naive_feature_cols = [
    'date',
    'maxNumLoc',
    'eff_tax%_perc_glob',
    'eff_tax%_perc_ind',
    'deduc2income_perc_glob',
    'deduc2income_perc_ind',
    '4D_eff_tax%_perc_glob',
    '4D_deduc2income_perc_glob',
    '4D_eff_tax_perc_ind',
    '4D_deduc2income_perc_ind',
    'sumsum_gross_perc_glob',
    'sumsum_gross_perc_ind',
]

# Work on a copy so the source frame is never modified by later feature edits.
X = q_audit_data_combined[naive_feature_cols].copy()

In [10]:
# Swap the 'date' column for a 'quarter' feature taken from its .dt.quarter accessor.
X = X.assign(quarter=X['date'].dt.quarter).drop(columns=['date'])

In [52]:
# Constructing y
# Binary target: 1 where 'change+' is truthy, 0 everywhere else.
# The previous nested np.where on 'change-' returned 0 on both of its branches, so it
# never influenced the result; it is dropped here without changing the labels.
y = np.where(q_audit_data_combined['change+'], 1, 0)
y

6989

In [12]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=19,
)



In [13]:
# Logistic regression baseline: build the estimator and fit it on the training split
# in one step (fit returns the fitted estimator itself).
logreg = LogisticRegression(max_iter=200).fit(X_train, y_train)

# Hard class predictions for the held-out rows.
y_pred = logreg.predict(X_test)

# Fraction of held-out rows whose predicted label matches the true label.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.5836909871244635


In [14]:
# Per-class precision / recall / f1 for the logistic regression predictions.
report = classification_report(y_test, y_pred)

# Print the two heading lines followed by the report, one item per line.
print(
    "Classification Report",
    "Logistic Regression - Naive Model - Positive Change",
    report,
    sep="\n",
)

Classification Report
Logistic Regression - Naive Model - Positive Change
              precision    recall  f1-score   support

           0       0.60      0.74      0.66       770
           1       0.55      0.40      0.46       628

    accuracy                           0.58      1398
   macro avg       0.58      0.57      0.56      1398
weighted avg       0.58      0.58      0.57      1398



In [15]:
# Support-vector classifier with default settings, trained on the same split for comparison.
svm = SVC()
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)

# Generate the classification report.
# zero_division=0 reports 0.0 for a class that is never predicted (as happened in the
# recorded run, where class 1 has recall 0.00) without raising an
# UndefinedMetricWarning for every affected metric.
report = classification_report(y_test, y_pred, zero_division=0)

# Display the classification report
print(report)

              precision    recall  f1-score   support

           0       0.55      1.00      0.71       770
           1       0.00      0.00      0.00       628

    accuracy                           0.55      1398
   macro avg       0.28      0.50      0.36      1398
weighted avg       0.30      0.55      0.39      1398



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Naive - Change

In [16]:
# Binary target for "any change": 1 where either 'change+' or 'change-' is truthy, else 0.
has_positive_change = q_audit_data_combined['change+']
has_negative_change = q_audit_data_combined['change-']

y = np.where(
    has_positive_change,
    1,
    np.where(has_negative_change, 1, 0),
)
y

array([1, 1, 1, ..., 1, 1, 1])

In [17]:
# Same 80/20 split and seed as before, now against the "any change" target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=19,
)

In [18]:
# Fit the logistic regression on the training split (fit returns the fitted estimator).
logreg = LogisticRegression(max_iter=200).fit(X_train, y_train)

# Hard class predictions for the held-out rows.
y_pred = logreg.predict(X_test)

# Fraction of held-out rows whose predicted label matches the true label.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.6809728183118741


In [19]:
# Per-class precision / recall / f1 for the logistic regression predictions.
report = classification_report(y_test, y_pred)

# Print the two heading lines followed by the report, one item per line.
print(
    "Classification Report",
    "Logistic Regression - Naive Model - Change",
    report,
    sep="\n",
)

Classification Report
Logistic Regression - Naive Model - Change
              precision    recall  f1-score   support

           0       0.68      0.25      0.37       514
           1       0.68      0.93      0.79       884

    accuracy                           0.68      1398
   macro avg       0.68      0.59      0.58      1398
weighted avg       0.68      0.68      0.63      1398



In [20]:
# Default support-vector classifier on the same split (fit returns the fitted estimator).
svm = SVC().fit(X_train, y_train)
y_pred = svm.predict(X_test)

# Build and show the per-class metrics for the SVC predictions.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       1.00      0.04      0.07       514
           1       0.64      1.00      0.78       884

    accuracy                           0.65      1398
   macro avg       0.82      0.52      0.43      1398
weighted avg       0.77      0.65      0.52      1398



**Conclusions**

Adding the sales percentiles in this second round of training increased the accuracy from 65% to 68% for recognizing change, and from 55% to 58% for recognizing positive change.

This could improve significantly if we change our way of labeling, and add the clustering.

Specifically, positive changes would be labeled as 1 only in the periods that fall outside the compliant cluster.

# Adding cluster as Feature

**First Attempt: global data**

All industries, 
First case: mark success (1) only for a POSITIVE CHANGE outside the compliant cluster. If this improves the results, we can add it to all industries.

In [115]:
# Same feature set as the naive models, extended with the two k-means cluster ids.
# 'date' is carried along only so that the quarter can be derived from it afterwards.
cluster_feature_cols = [
    'date',
    'maxNumLoc',
    'sumsum_gross_perc_ind',
    'sumsum_gross_perc_glob',
    'eff_tax%_perc_glob',
    'eff_tax%_perc_ind',
    'deduc2income_perc_glob',
    'deduc2income_perc_ind',
    '4D_eff_tax%_perc_glob',
    '4D_deduc2income_perc_glob',
    '4D_eff_tax_perc_ind',
    '4D_deduc2income_perc_ind',
    'k-cluster_g',
    'k-cluster_ind',
]
X = q_audit_data_combined[cluster_feature_cols].copy()

# Target column taken directly from the frame.
# NOTE(review): the column name suggests the label is derived from cluster membership,
# while the cluster ids are also used as features - check for label leakage.
y = q_audit_data_combined['ischange+_outside_ccluster_ind']



In [116]:
# Swap the 'date' column for a 'quarter' feature taken from its .dt.quarter accessor.
X = X.assign(quarter=X['date'].dt.quarter).drop(columns=['date'])

In [117]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=19,
)

In [118]:
# Fit the logistic regression on the (not yet rebalanced) training split;
# fit returns the fitted estimator.
logreg = LogisticRegression(max_iter=300).fit(X_train, y_train)

# Hard class predictions for the held-out rows.
y_pred = logreg.predict(X_test)

# Fraction of held-out rows whose predicted label matches the true label.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8247496423462088


In [119]:
# Per-class precision / recall / f1 for the logistic regression predictions.
report = classification_report(y_test, y_pred)

# Print the heading lines followed by the report, one item per line.
print(
    "Classification Report",
    "Classification Report on Positive Change Applying Clustering.",
    "Changes labeled as 1 only if outside the compliant cLuster",
    "No imbalance added",
    report,
    sep="\n",
)

Classification Report
Classification Report on Positive Change Applying Clustering.
Changes labeled as 1 only if outside the compliant cLuster
No imbalance added
              precision    recall  f1-score   support

           0       0.84      0.97      0.90      1149
           1       0.53      0.14      0.23       249

    accuracy                           0.82      1398
   macro avg       0.68      0.56      0.56      1398
weighted avg       0.78      0.82      0.78      1398



In [120]:
# Support-vector classifier with default settings, trained on the same split for comparison.
svm = SVC()
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)

# Generate the classification report.
# zero_division=0 reports 0.0 for a class that is never predicted (as happened in the
# recorded run, where class 1 has recall 0.00) without raising an
# UndefinedMetricWarning for every affected metric.
report = classification_report(y_test, y_pred, zero_division=0)

# Display the classification report
print(report)

              precision    recall  f1-score   support

           0       0.82      1.00      0.90      1149
           1       0.00      0.00      0.00       249

    accuracy                           0.82      1398
   macro avg       0.41      0.50      0.45      1398
weighted avg       0.68      0.82      0.74      1398



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**Observations**

The failure to recognize 1s may be due to the class imbalance. We will apply random oversampling to the training data.

## Correcting the class imbalance (random oversampling)

In [121]:
# Rebalance the training split only: the held-out test rows are left untouched.
# The fixed seed makes the resampling repeatable.
oversampler = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = oversampler.fit_resample(
    X_train,
    y_train,
)

# Class counts after resampling (displayed as the cell output).
pd.Series(y_train_resampled).value_counts()

0    4617
1    4617
Name: ischange+_outside_ccluster_ind, dtype: int64

In [122]:
# # Split the resampled data into training and validation sets
# X_train_new, X_testval, y_train_new, y_val = train_test_split(X_train_resampled, y_train_resampled, test_size=0.2, random_state=42)

# # Model training and evaluation using the resampled data
# # Replace X_train and y_train with X_train_new and y_train_new for training
# # Replace X_val and y_val with the original validation sets for evaluation

In [123]:
# Creating an instance of the LogisticRegression model
logreg = LogisticRegression(max_iter=300)

# Fitting the model on the oversampled training data
logreg.fit(X_train_resampled, y_train_resampled)

# Predicting on the original (not resampled) test data
y_pred = logreg.predict(X_test)

# Calculating the accuracy of the model.
# Arguments are passed as (y_true, y_pred), matching the sklearn signature and every
# other metric call in this notebook; accuracy is symmetric, so the value is unchanged.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7911301859799714


In [124]:
# Per-class precision / recall / f1 for the model trained on the oversampled data.
report = classification_report(y_test, y_pred)

# Print the heading lines followed by the report, one item per line.
print(
    "Classification Report on Positive Changes Applying Clustering.",
    "Positive changes labeled as 1 only if outside the compliant cLuster",
    "Imbalance added",
    report,
    sep="\n",
)

Classification Report on Positive Changes Applying Clustering.
Positive changes labeled as 1 only if outside the compliant cLuster
Imbalance added
              precision    recall  f1-score   support

           0       1.00      0.75      0.85      1149
           1       0.46      1.00      0.63       249

    accuracy                           0.79      1398
   macro avg       0.73      0.87      0.74      1398
weighted avg       0.90      0.79      0.81      1398



In [125]:
# Heading lines; the last one names the cell order of the matrix displayed below.
print(
    "Confusion Matrix on Positive Changes Applying Clustering.",
    "Positive changes labeled as 1 only if outside the compliant cLuster",
    "tn, fp, fn, tp",
    sep="\n",
)

# Confusion matrix of true vs. predicted labels (displayed as the cell output).
confusion_matrix(y_test, y_pred)

Confusion Matrix on Positive Changes Applying Clustering.
Positive changes labeled as 1 only if outside the compliant cLuster
tn, fp, fn, tp


array([[858, 291],
       [  1, 248]])

In [126]:
# imbalanced-learn's report for the same predictions.
report = classification_report_imbalanced(y_test, y_pred)

# Print the heading and the report on consecutive lines.
print(
    "Imbalanced classification Report on Positive Changes Applying Clustering. Positive Changes are labeled as succesful only on Clusters Outside the Compliant CLuster",
    report,
    sep="\n",
)

Imbalanced classification Report on Positive Changes Applying Clustering. Positive Changes are labeled as succesful only on Clusters Outside the Compliant CLuster
                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.75      1.00      0.85      0.86      0.73      1149
          1       0.46      1.00      0.75      0.63      0.86      0.76       249

avg / total       0.90      0.79      0.95      0.81      0.86      0.73      1398



# Looking for the best probability threshold to improve precision

In [129]:
# Trade recall for precision by requiring a higher predicted probability for class 1.

# Class probabilities for the held-out rows; column 1 is taken as the positive class.
y_proba = logreg.predict_proba(X_test)

# Rows at or above the threshold are predicted as 1, all others as 0.
threshold = 0.75
positive_proba = y_proba[:, 1]
y_pred = (positive_proba >= threshold).astype(int)

# Per-class metrics at this threshold.
report = classification_report(y_test, y_pred)

# Print the heading lines, the report, a blank line and the matrix heading.
print(
    "Classification Report - Positive Change",
    "Clustering Applied",
    "Changes labeled as 1 only if outside the compliant cLuster",
    f"Imbalance added - Threshold proba > {threshold}",
    report,
    "",
    "Confuction Matrix",
    sep="\n",
)

# Confusion matrix at this threshold (displayed as the cell output).
confusion_matrix(y_test, y_pred)

Classification Report - Positive Change
Clustering Applied
Changes labeled as 1 only if outside the compliant cLuster
Imbalance added - Threshold proba > 0.75
              precision    recall  f1-score   support

           0       0.93      0.86      0.89      1149
           1       0.52      0.70      0.60       249

    accuracy                           0.83      1398
   macro avg       0.73      0.78      0.75      1398
weighted avg       0.86      0.83      0.84      1398


Confuction Matrix


array([[989, 160],
       [ 74, 175]])

In [128]:
# Changing threshold to improve precision
# Sweep the decision threshold from 50% to 95% in 5-point steps and record
# precision / accuracy / recall on the held-out rows for each value.

# The predicted probabilities do not depend on the threshold: compute them once.
y_proba = logreg.predict_proba(X_test)
positive_proba = y_proba[:, 1]

# Loop-local names are used on purpose so the sweep does not overwrite the
# `threshold` / `y_pred` values produced by the fixed-threshold cell.
sweep_rows = []
for threshold_pct in range(50, 99, 5):
    sweep_pred = (positive_proba >= threshold_pct / 100).astype(int)
    sweep_rows.append({
        'Threshold(%)': threshold_pct,
        'Precision': precision_score(y_test, sweep_pred),
        'Accuracy': accuracy_score(y_test, sweep_pred),
        'Recall': recall_score(y_test, sweep_pred),
    })

# Build the frame once from the collected rows (no per-iteration concat, and no
# stray empty 'Threshold' column next to 'Threshold(%)').
metrics_df = pd.DataFrame(
    sweep_rows,
    columns=['Threshold(%)', 'Precision', 'Accuracy', 'Recall'],
)

# NOTE(review): the .hvplot accessor is only available after `import hvplot.pandas`
# has run; that import is not in the visible imports cell - confirm.
metrics_df.hvplot(y=['Precision', 'Accuracy', 'Recall'], x='Threshold(%)',
                 title="Metrics depending on the Probability Threshold to Set the Success")

## Power to Identify Change (Positive or Negative)

In [131]:
# Target for "any change" (positive or negative) outside the compliant cluster.
# NOTE(review): this uses the column without the `_ind` suffix, whereas the
# positive-change target above used the `_ind` variant - confirm this is intended.
y = q_audit_data_combined['ischange+OR-_outside_ccluster']

In [132]:
# Same 80/20 split and seed as before, now against the "any change" target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=19,
)

In [133]:
# Rebalance the training split only: the held-out test rows are left untouched.
# The fixed seed makes the resampling repeatable.
oversampler = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = oversampler.fit_resample(
    X_train,
    y_train,
)


In [141]:
# Fit the logistic regression on the oversampled training data
# (fit returns the fitted estimator).
logreg = LogisticRegression(max_iter=350).fit(X_train_resampled, y_train_resampled)

# Class probabilities for the held-out rows; column 1 is taken as the positive class.
y_proba = logreg.predict_proba(X_test)

# Trade recall for precision: rows at or above the threshold are predicted as 1.
threshold = 0.75
positive_proba = y_proba[:, 1]
y_pred = (positive_proba >= threshold).astype(int)


In [142]:
# Per-class metrics for the thresholded predictions.
report = classification_report(y_test, y_pred)

# Print the heading lines, the threshold in use, and the report, one item per line.
print(
    "Classification Report on Change (Positive or Negative) Applying Clustering.",
    "Changes labeled as 1 only if outside the compliant cLuster",
    f"Probability Threshold: {threshold}",
    report,
    sep="\n",
)

Classification Report on Change (Positive or Negative) Applying Clustering.
Changes labeled as 1 only if outside the compliant cLuster
Probability Threshold: 0.75
              precision    recall  f1-score   support

           0       0.94      0.87      0.90      1053
           1       0.68      0.82      0.74       345

    accuracy                           0.86      1398
   macro avg       0.81      0.85      0.82      1398
weighted avg       0.87      0.86      0.86      1398



In [140]:
# Changing threshold to improve precision
# Sweep the decision threshold from 50% to 95% in 5-point steps and record
# precision / accuracy / recall on the held-out rows for each value.

# The predicted probabilities do not depend on the threshold: compute them once.
y_proba = logreg.predict_proba(X_test)
positive_proba = y_proba[:, 1]

# Loop-local names are used on purpose: a later cell prints `threshold` and builds the
# confusion matrix from `y_pred`, so the sweep must not overwrite the values set by
# the fixed-threshold cell.
sweep_rows = []
for threshold_pct in range(50, 99, 5):
    sweep_pred = (positive_proba >= threshold_pct / 100).astype(int)
    sweep_rows.append({
        'Threshold(%)': threshold_pct,
        'Precision': precision_score(y_test, sweep_pred),
        'Accuracy': accuracy_score(y_test, sweep_pred),
        'Recall': recall_score(y_test, sweep_pred),
    })

# Build the frame once from the collected rows (no per-iteration concat, and no
# stray empty 'Threshold' column next to 'Threshold(%)').
metrics_df = pd.DataFrame(
    sweep_rows,
    columns=['Threshold(%)', 'Precision', 'Accuracy', 'Recall'],
)

# NOTE(review): the .hvplot accessor is only available after `import hvplot.pandas`
# has run; that import is not in the visible imports cell - confirm.
metrics_df.hvplot(y=['Precision', 'Accuracy', 'Recall'], x='Threshold(%)',
                 title="Metrics depending on the Probability Threshold to Set the Success")

In [144]:
# This cell reads `threshold` and `y_pred` from the kernel; they must be the values
# produced by the fixed-threshold prediction cell for the printed threshold and the
# matrix below to agree.
print(
    "Confusion Matrix on Change (Positive or Negative) Applying Clustering.",
    "Changes labeled as 1 only if outside the compliant cLuster",
    f"Probability Threshold: {threshold}",
    sep="\n",
)

# Confusion matrix of true vs. predicted labels (displayed as the cell output).
confusion_matrix(y_test, y_pred)

Confusion Matrix on Change (Positive or Negative) Applying Clustering.
Changes labeled as 1 only if outside the compliant cLuster
Probability Threshold: 0.75


array([[917, 136],
       [ 61, 284]])

**Conclusion**: it is hard to improve precision. I tried to do so by increasing the probability threshold above which a prediction is set to 1.
I tried several values, from 0.50 up to 0.95 in steps of 0.05, and found that a threshold of around 0.75 increases precision, but only by a few percentage points.