In [1]:
#IMPORTATION OF LIBRARIES
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest # we are using it to get data that has high supplier risk

In [2]:
# Data loading: read the procurement CSV (expected in the working directory)
# into the DataFrame `df` that every later cell builds on.
df = pd.read_csv(filepath_or_buffer="government-procurement-via-gebiz.csv")

In [5]:
# 2. Feature engineering: supplier frequency
# Count how many records each (agency, supplier_name) pair has, i.e. how many
# times each supplier was awarded a contract by each agency.
supplier_counts = (
    df.groupby(['agency', 'supplier_name'])
    .size()
    .reset_index(name='supplier_count')
)

# Attach the count to every record.
# - Any 'supplier_count' left over from a previous run of this cell is dropped
#   first, so re-running the cell does not produce supplier_count_x /
#   supplier_count_y columns (which would make the next line raise KeyError).
# - how='left' keeps every record: groupby skips NaN keys, so an inner merge
#   would silently discard rows with a missing agency or supplier_name. Such
#   rows now get a NaN count, which shows up in any later null check.
# - validate='many_to_one' asserts that supplier_counts has one row per key pair.
df = df.drop(columns=['supplier_count'], errors='ignore').merge(
    supplier_counts,
    on=['agency', 'supplier_name'],
    how='left',
    validate='many_to_one',
)

# supplier_risk = log(1 + supplier_count).
# Counts vary a lot (some suppliers appear once, others hundreds of times); the
# logarithm compresses that range and reduces the impact of very high counts.
# np.log1p(x) computes log(1 + x) directly.
df['supplier_risk'] = np.log1p(df['supplier_count'])

In [7]:
# 3. Feature engineering: amount anomaly
# Mean and (sample) standard deviation of awarded amounts per agency.
agency_stats = df.groupby('agency')['awarded_amt'].agg(['mean', 'std']).reset_index()

# Merge the per-agency statistics back onto every record.
# - Stale 'mean' / 'std' columns from a previous run of this cell are dropped
#   first so the merge never produces mean_x / mean_y (re-run safety).
# - how='left' keeps records whose agency is missing (their statistics are NaN)
#   instead of silently discarding them as an inner merge would.
# - validate='many_to_one' asserts that agency_stats has one row per agency.
df = df.drop(columns=['mean', 'std'], errors='ignore').merge(
    agency_stats,
    on='agency',
    how='left',
    validate='many_to_one',
)

# Z-score of each awarded amount within its agency: how many standard
# deviations the amount lies from the agency mean.
# A standard deviation that is zero (all amounts in the agency identical) or
# NaN (agency with a single record) is masked to NaN, so the division yields
# NaN rather than +/-inf; infinite values would make a downstream model fit fail.
positive_std = df['std'].where(df['std'] > 0)
df['amount_risk'] = (df['awarded_amt'] - df['mean']) / positive_std


In [9]:
#4. Feature Engineering: Status Risk (Conditional)

if 'tender_detail_status' in df.columns:  # only runs when the status column exists
    # Keywords that mark a tender status as suspicious.
    suspicious_statuses = ['cancelled', 'direct award', 'negotiated']
    # The keywords are joined into one regex alternation; they contain no regex
    # metacharacters, so they match literally.
    status_pattern = '|'.join(suspicious_statuses)
    # Lower-case the status text and flag rows containing any keyword
    # (1 = risky, 0 = normal).
    # na=False makes a missing status count as "not risky"; without it
    # str.contains returns NaN for those rows and .astype(int) raises.
    df['status_risk'] = (
        df['tender_detail_status']
        .str.lower()
        .str.contains(status_pattern, regex=True, na=False)
        .astype(int)
    )

In [11]:
# 5. Risk aggregation
# total_risk = supplier_risk + |amount_risk|, one score per procurement record.
# The absolute value is used so that amounts unusually far below the agency
# mean count as much as amounts unusually far above it.
df['total_risk'] = df['supplier_risk'].add(df['amount_risk'].abs())


In [17]:
# Report how many values are missing in each model feature, then keep only the
# records where both features are present.
risk_features = ['supplier_risk', 'amount_risk']
print(df.loc[:, risk_features].isna().sum())
df = df.dropna(subset=risk_features)

supplier_risk    0
amount_risk      2
dtype: int64


In [19]:
# 6. Anomaly detection
# Isolation Forest settings:
#   contamination=0.1 -> expect 10% of the records to be anomalies
#   random_state=42   -> fixed seed for reproducible results
model = IsolationForest(random_state=42, contamination=0.1)

# Fit on the supplier/amount risk features and label each record in one step.
# fit_predict returns -1 for anomalies and 1 for normal points, so despite its
# name 'anomaly_score' holds a label rather than a continuous score.
risk_matrix = df[['supplier_risk', 'amount_risk']]
df['anomaly_score'] = model.fit_predict(risk_matrix)

# Binary flag derived from the label: 1 = high risk (anomaly), 0 = normal.
df['is_high_risk'] = df['anomaly_score'].eq(-1).astype(int)


In [35]:
# 7. Save results
# Write the identifying columns together with the combined risk score and the
# high-risk flag to a CSV file, without the DataFrame index.
export_columns = ['tender_no.', 'agency', 'supplier_name', 'awarded_amt', 'total_risk', 'is_high_risk']
df.loc[:, export_columns].to_csv("risk_assessment.csv", index=False)
print("Results saved to 'risk_assessment.csv'")

Results saved to 'risk_assessment.csv'
