In [None]:
import pandas as pd
from sklearn.ensemble import IsolationForest

# Load the dataset
data_path = "full_user_anomaly_detection_data.csv"  # Update with the correct path if needed
df = pd.read_csv(data_path)

# Display the first few rows to understand the data structure
print(df.head())

# Train the Isolation Forest model
model = IsolationForest(contamination=0.1, random_state=42)  # Adjust contamination based on your expectations
model.fit(df)

# Predict anomalies (-1 indicates anomaly, 1 indicates normal)
df['Anomaly'] = model.predict(df)

# Count the total number of anomalies
total_anomalies = df[df['Anomaly'] == -1].shape[0]

# Display the results
print(f"Total records: {len(df)}")
print(f"Number of anomalies detected: {total_anomalies}\n")

if total_anomalies > 0:
    print("Anomalous records detected:")
    anomalies = df[df['Anomaly'] == -1]
    print(anomalies)
else:
    print("No anomalies detected.")


      Income  Deductions  Filing_Status  Home_Loan_Interest  \
0   500000.0      6000.0              1            200000.0   
1   600000.0     15000.0              2            180000.0   
2   550000.0      7000.0              3            190000.0   
3   580000.0     10000.0              1            170000.0   
4  1500000.0    600000.0              3            400000.0   

   Education_Loan_Interest  Charitable_Donations  
0                 100000.0               14000.0  
1                  90000.0               12000.0  
2                  95000.0               13000.0  
3                  85000.0               10000.0  
4                 200000.0              400000.0  
Total records: 505
Number of anomalies detected: 51

Anomalous records detected:
           Income     Deductions  Filing_Status  Home_Loan_Interest  \
0    5.000000e+05    6000.000000              1       200000.000000   
1    6.000000e+05   15000.000000              2       180000.000000   
2    5.500000e+05    

In [4]:
import pandas as pd
from sklearn.ensemble import IsolationForest
import numpy as np
import joblib  # or use pickle-----------

# Load the dataset for training
data_path = "full_user_anomaly_detection_data.csv"  # Update with the correct path if needed
df = pd.read_csv(data_path)

# Train the Isolation Forest model
model = IsolationForest(contamination=0.1, random_state=42)
model.fit(df)

# Calculate the mean and standard deviation for each column for Z-score analysis
data_stats = df.describe().T

# Function to identify anomalies and provide suggestions
def analyze_anomaly(new_data):
    reasons = []
    suggestions = []

    for column in data_stats.index:
        mean_val = data_stats.loc[column, 'mean']
        std_dev = data_stats.loc[column, 'std']

        # Calculate Z-score
        z_score = (new_data[column] - mean_val) / std_dev

        if abs(z_score) > 3:  # If Z-score > 3, it's considered an anomaly
            anomaly_type = 'high' if z_score > 3 else 'low'
            reasons.append(f"{column} is too {anomaly_type} (Z-score: {z_score:.2f})")

            # Suggestion based on typical values
            if anomaly_type == 'high':
                suggestions.append(f"Reduce {column} closer to the average of {mean_val:.2f}.")
            else:
                suggestions.append(f"Increase {column} closer to the average of {mean_val:.2f}.")

    return reasons, suggestions

# Function to take new user inputs
def get_new_user_input():
    print("\nEnter the details for the new record:")
    income = float(input("Income: "))
    deductions = float(input("Deductions: "))
    filing_status = int(input("Filing Status (1: Single, 2: Married, 3: Head of Household): "))
    home_loan_interest = float(input("Home Loan Interest: "))
    education_loan_interest = float(input("Education Loan Interest: "))
    charitable_donations = float(input("Charitable Donations: "))

    # Create a DataFrame for the new input
    new_data = pd.DataFrame([[
        income, deductions, filing_status, home_loan_interest, education_loan_interest, charitable_donations
    ]], columns=[
        "Income", "Deductions", "Filing_Status", "Home_Loan_Interest", "Education_Loan_Interest", "Charitable_Donations"
    ])

    return new_data

# Take new user inputs
new_user_data = get_new_user_input()

# Predict whether the new data is an anomaly or not
prediction = model.predict(new_user_data)

# Display the result
if prediction[0] == -1:
    print("\nThe entered record is an ANOMALY (Outlier).")
    reasons, suggestions = analyze_anomaly(new_user_data.iloc[0])

    # Display detailed reasons and suggestions
    print("\nReasons for being classified as an anomaly:")
    for reason in reasons:
        print(f"- {reason}")

    print("\nSuggestions to resolve the anomaly:")
    for suggestion in suggestions:
        print(f"- {suggestion}")
else:
    print("\nThe entered record is NORMAL.")

# Save the model------------
joblib.dump(model, 'model.pkl')  # Use 'pickle' if you prefer




Enter the details for the new record:
Income: 200
Deductions: 10
Filing Status (1: Single, 2: Married, 3: Head of Household): 2
Home Loan Interest: 40
Education Loan Interest: 60
Charitable Donations: 90

The entered record is an ANOMALY (Outlier).

Reasons for being classified as an anomaly:
- Income is too low (Z-score: -11.21)
- Filing_Status is too high (Z-score: 3.85)
- Home_Loan_Interest is too low (Z-score: -13.13)
- Education_Loan_Interest is too low (Z-score: -6.66)

Suggestions to resolve the anomaly:
- Increase Income closer to the average of 503222.87.
- Reduce Filing_Status closer to the average of 1.05.
- Increase Home_Loan_Interest closer to the average of 151560.01.
- Increase Education_Loan_Interest closer to the average of 51000.80.


['model.pkl']

In [None]:
import pandas as pd
from sklearn.ensemble import IsolationForest
import joblib  # or use pickle

# Load your dataset
data_path = "full_user_anomaly_detection_data.csv"
df = pd.read_csv(data_path)

# Train the Isolation Forest model
model = IsolationForest(contamination=0.1, random_state=42)
model.fit(df)

# Save the model
joblib.dump(model, 'model.pkl')  # Use 'pickle' if you prefer




['model.pkl']

In [None]:
# Load your dataset
import pandas as pd

# Path to your data
file_path = 'full_user_anomaly_detection_data.csv'
df = pd.read_csv(file_path)

# Calculate mean and standard deviation for each column
data_stats = df.describe().loc[['mean', 'std']]
print(data_stats)

# Save this data to a new CSV file
data_stats.to_csv('data_stats.csv')


             Income    Deductions  Filing_Status  Home_Loan_Interest  \
mean  503222.874747   6295.043686       1.051485       151560.012226   
std    44857.577581  26476.751339       0.246650        11544.339295   

      Education_Loan_Interest  Charitable_Donations  
mean             51000.797162           5877.103781  
std               7644.871643          17585.204291  
