# **Gamma Topic Model: Baseline**

# Import required libraries

In [2]:
import pandas as pd
import re
import spacy

import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.multiclass import OneVsRestClassifier

# Data preprocessing

## Load dataset

In [3]:
import pandas as pd
data = pd.read_csv('result_tier_df.csv')

In [4]:
data.head()

Unnamed: 0,url,snippet,tier 1,tier 2,tier 3,tier 4,tier 5,tier 6,tier 7
0,https://www.lawtechnologytoday.org/2022/09/no-...,Cybercrimes have become more frequent and soph...,,,,,,,
1,https://www.sun-sentinel.com/1990/11/20/jury-c...,A Boynton Beach man who stabbed a co-worker to...,['violent crimes'],['homicide'],['manslaughter'],['manslaughter'],['manslaughter'],['manslaughter'],['manslaughter']
2,https://www.nbclosangeles.com/news/more-cellph...,Five suspected thieves working in groups were ...,"['identity crimes', 'theft crimes']","['identity theft', 'larceny', 'stolen property...",['possessing stolen property'],['possessing stolen property'],['possessing stolen property'],['possessing stolen property'],['possessing stolen property']
3,https://www.sun-sentinel.com/1997/08/20/attorn...,The bribery and campaign corruption case again...,['corruption'],"['bribery', 'public corruption']",['bribery (financial)'],['acceptance or solicitation (financial) to ob...,['acceptance or solicitation (financial) to ob...,['acceptance or solicitation (financial) to ob...,['acceptance or solicitation (financial) to ob...
4,https://www.nbcnews.com/tech/tech-news/fraudst...,Tech-savvy fraudsters stealing from the govern...,"['cybercrime', 'financial crimes']","['computer fraud', 'financial fraud']",['major fraud against the us'],['major fraud against the us'],['major fraud against the us'],['major fraud against the us'],['major fraud against the us']


## Split the dataset into train, validation, and test sets

In [5]:
from sklearn.model_selection import train_test_split

# Split the data into 80% (train + validation) and 20% (test)
data_train_val, data_test = train_test_split(data, test_size=0.2, random_state=15) 

In [6]:
# Split the 80% into 60% (train) and 20% (validation)
data_train, data_val = train_test_split(data_train_val, test_size=0.25, random_state=15)

In [None]:
import pickle

with open("train_test_val.pkl", "wb") as f:
    pickle.dump((train_data, test_data, val_data), f)

## Clean the train set

In [7]:
import pandas as pd
import spacy
import re
import json
from bs4 import BeautifulSoup

# Load SpaCy model once
nlp = spacy.load("en_core_web_sm")

# Load city and state names from CSV
file_path = "us_cities_states_counties.csv"
df_locations = pd.read_csv(file_path, delimiter="|")

# Create sets of city and state names for quick lookup
cities = set(df_locations["City"].str.lower().unique())
states = set(df_locations["State full"].str.lower().unique())

# Combine all location names into one set
location_names = cities | states

def remove_locations(text):
    """Removes city and state names from the given text."""
    words = text.split()
    cleaned_words = [word for word in words if word.lower() not in location_names]
    return " ".join(cleaned_words)

def clean_text(text):
    """Preprocess text by removing stop words, named entities (persons, locations, dates, etc.), 
    punctuation, and numbers. Keeps original word forms (no lemmatization)."""
    if pd.isna(text):  # Handle missing values
        return ""
    # print("\n=== ORIGINAL TEXT ===")
    # print(text)

    try:
        text = json.loads(text) # convert JSON string to dict
        text = text.get("rendered", text) # extract main text if key exists
    except:
        pass # if it's not JSON, continue normally
    
    text = BeautifulSoup(text, "html.parser").get_text() # Remove HTML
    text = remove_locations(text) # Remove cities and states

    text = re.sub(r"\\/", "/", text)
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"\n+", " ", text)

    doc = nlp(text)
    #doc_lower = [token.text.lower() for token in doc]
    
    entities_to_remove = {"PERSON", "GPE", "LOC", "DATE", "TIME"}
    
    filtered_tokens = [
        token.text.lower() for token in doc
        if not token.is_stop  # Remove stop words
        and not token.is_punct  # Remove punctuation
        and not token.like_num  # Remove numbers
        and token.ent_type_ not in entities_to_remove  # Remove specific named entities
    ]
    # print("\n===TOKENS AFTER FILTERING ===")
    # print(filtered_tokens)

    cleaned_text = " ".join(filtered_tokens)

    cleaned_text = re.sub(r"[^a-zA-Z\s]", "", cleaned_text).strip()

    # print("\n=== FINAL CLEANED TEXT ===")
    # print(cleaned_text)

    return cleaned_text

# Apply the function to the 'snippet' column
# X_train_cleaned = [clean_text(text) for text in X_train]
# X_train_cleaned

data_train["cleaned_text"] = data_train['snippet'].apply(clean_text)


  text = BeautifulSoup(text, "html.parser").get_text() # Remove HTML


In [8]:
data_train.columns

Index(['url', 'snippet', 'tier 1', 'tier 2', 'tier 3', 'tier 4', 'tier 5',
       'tier 6', 'tier 7', 'cleaned_text'],
      dtype='object')

In [9]:
data_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6216 entries, 10200 to 6064
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   url           6216 non-null   object
 1   snippet       6216 non-null   object
 2   tier 1        5683 non-null   object
 3   tier 2        5683 non-null   object
 4   tier 3        5683 non-null   object
 5   tier 4        5683 non-null   object
 6   tier 5        5683 non-null   object
 7   tier 6        5683 non-null   object
 8   tier 7        5683 non-null   object
 9   cleaned_text  6216 non-null   object
dtypes: object(10)
memory usage: 534.2+ KB


In [10]:
data_train.replace('nan', np.nan, inplace=True)
data_train.dropna(inplace=True)
data_train.reset_index(drop=True, inplace=True)

data_val.replace('nan', np.nan, inplace=True)
data_val.dropna(inplace=True)
data_val.reset_index(drop=True, inplace=True)


# Model training

## Create pipeline to train model on each topic tier & return evaluation metrics

In [11]:
import ast
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn import svm
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score

non_topic_cols = ['url', 'snippet', 'tier 1', 'tier 2', 'tier 3', 'tier 4', 'tier 5',
       'tier 6', 'tier 7', 'cleaned_text']

label_cols = ['tier 1', 'tier 2', 'tier 3', 'tier 4', 'tier 5', 'tier 6', 'tier 7']

# Create dummy columns for the desired topic tier
def create_dummy_columns(df, label_col):
    try:
        df[label_col] = df[label_col].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
    except Exception as e:
        print(f"Error in ast.literal_eval: {e}")
        return df
    
    dummy_df = df[label_col].apply(lambda x: pd.Series(1, index=x)).fillna(0).astype(int)
    return df.join(dummy_df)

# Baseline model pipeline
def train_baseline(train_df, val_df, label_col, model_store):
    train_dummy = create_dummy_columns(train_df, label_col)
    val_dummy = create_dummy_columns(val_df, label_col)
    
    topic_names = list(set(train_dummy.columns) & set(val_dummy.columns) - set(non_topic_cols))

    X_train = train_dummy['cleaned_text']
    y_train = train_dummy[topic_names]
    X_val = val_df['snippet']
    y_val = val_dummy[topic_names]
    
    models = {
        'Decision Tree': OneVsRestClassifier(DecisionTreeClassifier()),
        'XGBoost': OneVsRestClassifier(XGBClassifier(eval_metric="logloss"))
    }

    if label_col not in model_store:
        model_store[label_col] = {}

    y_train_pred_dict = {}
    for model_name, model in models.items():
        pipeline = Pipeline([
            ('vectorizer', CountVectorizer()),
            ('classifier', model)
        ])    
        pipeline.fit(X_train, y_train)
        model_store[label_col][model_name] = pipeline
        y_train_pred_dict[model_name] = pipeline.predict(X_train)

    return model_store, topic_names, X_train, y_train, y_train_pred_dict, X_val, y_val

test_output = train_baseline(data_train, data_val, label_cols[0], {})
print(len(test_output))  # Should print 7
print(type(test_output))  # Should be a tuple

# Evaluation function
def evaluate_accuracy_roc(y_true, y_pred):
    try:
        accuracy = [
            accuracy_score(y_true.iloc[:, i], y_pred[:, i])  
            for i in range(y_true.shape[1])
        ]
        roc_auc = [
            roc_auc_score(y_true.iloc[:, i], y_pred[:, i]) if len(set(y_true.iloc[:, i])) > 1 else None
            for i in range(y_true.shape[1])
        ]
    except ValueError:
        accuracy = [None] * y_true.shape[1]
        roc_auc = [None] * y_true.shape[1]  

    positive_counts = y_true.sum(axis=0).tolist()  
    total_samples = len(y_true)
    positive_ratios = [(count / total_samples) if total_samples > 0 else 0 for count in positive_counts]

    return {
        'Accuracy': accuracy, 
        'ROC AUC': roc_auc,
        'Positive Samples': positive_counts,
        'Positive Ratio': positive_ratios
    }

def evaluate_prf(y_true, y_pred):
    precision = precision_score(y_true, y_pred, average=None, zero_division=1)
    recall = recall_score(y_true, y_pred, average=None, zero_division=1)
    f1 = f1_score(y_true, y_pred, average=None, zero_division=1)
    return {'Precision': precision, 'Recall': recall, 'F1 Score': f1}

def evaluate_models(train_df, val_df, label_cols):
    model_store = {}
    results = []
    X_val_store = {}   # Store X_val per label_col
    y_val_store = {}

    for label_col in label_cols:
        print(f"Processing tier: {label_col}")

        # Ensure all values are returned and unpacked correctly
        output = train_baseline(train_df, val_df, label_col, model_store)

        # Check that output has the expected length (7 items)
        if len(output) != 7:
            print(f"Unexpected output length from train_baseline() for {label_col}: {len(output)}")
            continue  # Skip this tier if the function is not returning the correct values
        
        model_store, topic_names, X_train, y_train, y_train_pred_dict, X_val, y_val = output
        # model_store, topic_names, X_train, y_train, y_train_pred_dict, X_val, y_val = train_baseline(train_df, val_df, label_col, model_store)

        # 🔴 Add this debug print
        print(f"DEBUG: X_val exists for {label_col}, type: {type(X_val)}")

        # Ensure only the relevant labels are stored for each tier in y_val_store
        y_val_store[label_col] = y_val[topic_names]  # Ensure we get the correct columns for this tier
        print(f"y_val_store for tier {label_col}: {y_val_store[label_col].columns.tolist()}")

        # Store X_val for each tier
        X_val_store[label_col] = X_val
        
        for model_name, pipeline in model_store[label_col].items():
            if 'X_val' not in locals():
                print(f"Error: X_val is not defined for {label_col}")
                continue  # Skip this model if X_val is missing
    
            y_val_pred = pipeline.predict(X_val)  
            val_acc_roc = evaluate_accuracy_roc(y_val, y_val_pred)
            val_prf = evaluate_prf(y_val, y_val_pred)

            y_train_pred = y_train_pred_dict[model_name]  # Use stored predictions
            train_acc_roc = evaluate_accuracy_roc(y_train, y_train_pred)
            train_prf = evaluate_prf(y_train, y_train_pred)

            for i, topic in enumerate(topic_names):
                for dataset, acc_roc, prf in [('Train', train_acc_roc, train_prf), ('Validation', val_acc_roc, val_prf)]:
                    results.append({
                        'Tier': label_col,
                        'Topic': topic,
                        'Model': model_name,
                        'Dataset': dataset,
                        'Accuracy': acc_roc['Accuracy'][i],  
                        'ROC AUC': acc_roc['ROC AUC'][i] if i < len(acc_roc['ROC AUC']) else None,
                        'Positive Samples': acc_roc['Positive Samples'][i],
                        'Positive Ratio': acc_roc['Positive Ratio'][i],
                        'Precision': prf['Precision'][i],
                        'Recall': prf['Recall'][i],
                        'F1 Score': prf['F1 Score'][i]
                    })
    
    return pd.DataFrame(results), model_store, X_val_store, y_val_store

metrics_df, model_store, X_val_store, y_val_store = evaluate_models(data_train, data_val, label_cols)


7
<class 'tuple'>
Processing tier: tier 1
DEBUG: X_val exists for tier 1, type: <class 'pandas.core.series.Series'>
y_val_store for tier tier 1: ['perjury', 'organized crime', 'kidnapping', 'drug crimes', 'property damage', 'record falsification', 'crimes against children', 'identity crimes', 'cybercrime', 'environmental crimes', 'disorderly conduct', 'making false statements', 'weapons crimes', 'criminal harassment', 'crimes against animals', 'smuggling of goods', 'obstruction of justice', 'providing or possessing contraband in prison', 'criminal tresspassing', 'fraudulent/counterfeit goods', 'immigration offenses', 'civil rights violation', 'theft crimes', 'criminal threats', 'forgery (non-financial/identity)', 'violent crimes', 'human trafficking', 'traffic crimes', 'election crimes', 'corruption', 'sex crimes', 'financial crimes', 'contempt of court', 'crimes against the government']
Processing tier: tier 2
DEBUG: X_val exists for tier 2, type: <class 'pandas.core.series.Series'>
y

KeyboardInterrupt: 

## Display results

In [None]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    display(metrics_df)

Unnamed: 0,Tier,Topic,Model,Dataset,Accuracy,ROC AUC,Positive Samples,Positive Ratio,Precision,Recall,F1 Score
0,tier 1,financial crimes,Decision Tree,Train,0.999471,0.999127,1319,0.232423,0.999241,0.998484,0.998862
1,tier 1,financial crimes,Decision Tree,Validation,0.92891,0.918581,446,0.23486,0.816701,0.899103,0.855923
2,tier 1,contempt of court,Decision Tree,Train,1.0,1.0,81,0.014273,1.0,1.0,1.0
3,tier 1,contempt of court,Decision Tree,Validation,0.982622,0.806876,32,0.016851,0.487805,0.625,0.547945
4,tier 1,environmental crimes,Decision Tree,Train,1.0,1.0,180,0.031718,1.0,1.0,1.0
5,tier 1,environmental crimes,Decision Tree,Validation,0.991575,0.952319,45,0.023697,0.773585,0.911111,0.836735
6,tier 1,criminal threats,Decision Tree,Train,0.998767,0.980922,158,0.027841,0.993464,0.962025,0.977492
7,tier 1,criminal threats,Decision Tree,Validation,0.988415,0.875445,69,0.036335,0.912281,0.753623,0.825397
8,tier 1,traffic crimes,Decision Tree,Train,0.999824,0.997854,233,0.041057,1.0,0.995708,0.997849
9,tier 1,traffic crimes,Decision Tree,Validation,0.957873,0.907224,81,0.042654,0.50365,0.851852,0.633028


## Save results to csv

In [None]:
metrics_df.to_csv('gamma_metrics_full.csv', index=False)

# Analyze feature importance for model-able topics

## Define the topic-tier selection dictionary

In [None]:
# Dictionary specifying topics and their corresponding tiers
selected_topics = {
    "financial crimes": "tier 1",
    "contempt of court": "tier 1",
    "environmental crimes": "tier 1",
    "criminal threats": "tier 1",
    "traffic crimes": "tier 1",
    "violent crimes": "tier 1",
    "criminal tresspassing": "tier 1",
    "crimes against animals": "tier 1",
    "corruption": "tier 1",
    "property damage": "tier 1",
    "human trafficking": "tier 1",
    "crimes against children": "tier 1",
    "cybercrime": "tier 1",
    "organized crime": "tier 1",
    "obstruction of justice": "tier 1",
    "theft crimes": "tier 1",
    "weapons crimes": "tier 1",
    "sex crimes": "tier 1",
    "kidnapping": "tier 1",
    "identity crimes": "tier 1",
    "fraudulent/counterfeit goods": "tier 1",
    "criminal harassment": "tier 1",
    "drug crimes": "tier 1",
    "disorderly conduct": "tier 1",
    "bribery": "tier 2",
    "sextortion": "tier 2",
    "homicide": "tier 2",
    "public intoxication": "tier 2",
    "torture": "tier 2",
    "computer fraud": "tier 2",
    "stolen property; buying, receiving, or possessing": "tier 2",
    "drug possession": "tier 2",
    "extortion": "tier 2",
    "embezzlement": "tier 2",
    "financial fraud": "tier 2",
    "disturbing the peace": "tier 2",
    "fraud in connection with identification documents": "tier 2",
    "hostage taking": "tier 2",
    "assault": "tier 2",
    "insider trading": "tier 2",
    "obstructing federal officer or employee": "tier 2",
    "larceny": "tier 2",
    "driving with license suspended": "tier 2",
    "identity theft": "tier 2",
    "failure to appear on felony offense": "tier 2",
    "terrorism": "tier 2",
    "prostitution": "tier 2",
    "dui/dwi": "tier 2",
    "illegal dumping": "tier 2",
    "cyber stalking": "tier 2",
    "tax evasion": "tier 2",
    "drug trafficking": "tier 2",
    "money laundering": "tier 2",
    "racketeering": "tier 2",
    "public corruption": "tier 2",
    "illegal gambling": "tier 2",
    "obscenity": "tier 2",
    "wildlife crimes": "tier 2",
    "weapons trafficking": "tier 2",
    "child abuse and neglect": "tier 2",
    "international organized crime": "tier 2",
    "burglary": "tier 2",
    "stalking": "tier 2",
    "probation violation": "tier 2",
    "sexual assault": "tier 2",
    "sex trafficking": "tier 2",
    "financial abuse": "tier 2",
    "reckless driving": "tier 2",
    "vandalism": "tier 2",
    "aggravated kidnapping": "tier 2",
    "animal cruelty": "tier 2",
    "obstructing witnesses and evidence": "tier 2",
    "hate crime acts": "tier 2",
    "firearms crimes": "tier 2",
    "use of fire or explosives to destroy property": "tier 2",
    "battery": "tier 3",
    "aggravated assault/battery": "tier 3",
    "petty theft": "tier 3",
    "grand theft": "tier 3",
    "trafficking in firearms": "tier 3",
    "running an illegal gambling business": "tier 3",
    "rape": "tier 3",
    "money laundering (international)": "tier 3",
    "possession with intent to distribute controlled substances": "tier 3",
    "major fraud against the us": "tier 3",
    "possession of marijuana": "tier 3",
    "cyber intrusion": "tier 3",
    "sexual battery": "tier 3",
    "child exploitation": "tier 3",
    "domestic violence assault": "tier 3",
    "international terrorism": "tier 3",
    "forgery": "tier 3",
    "bribery (financial)": "tier 3",
    "robbery": "tier 3",
    "mail fraud": "tier 3",
    "tampering with evidence": "tier 3",
    "insurance fraud": "tier 3",
    "bribery (non-financial)": "tier 3",
    "access devices fraud": "tier 3",
    "violation of the lacey act": "tier 3",
    "motor vehicle theft": "tier 3",
    "bank fraud": "tier 3",
    "engaging in a continuing criminal enterprise": "tier 3",
    "securities fraud": "tier 3",
    "murder": "tier 3",
    "mortgage fraud": "tier 3",
    "tax fraud": "tier 3",
    "arson": "tier 3",
    "simple assault": "tier 3",
    "animal abuse": "tier 3",
    "use, carry, or possession of firearm in furtherance of a drug felony or a federal crime of violence": "tier 3",
    "manslaughter": "tier 3",
    "wire fraud": "tier 3",
    "animal neglect": "tier 3",
    "resisting arrest": "tier 3",
    "promoting animal fighting ventures": "tier 3",
    "vehicle burglary": "tier 3",
    "illegal possession of firearms": "tier 3",
    "shoplifting": "tier 3",
    "illegal fishing": "tier 3",
    "possessing stolen property": "tier 3",
    "aggravated identity theft": "tier 3",
    "healthcare fraud": "tier 3",
    "check fraud": "tier 3",
    "receiving stolen property": "tier 3",
    "blackmail": "tier 3",
    "distribution of controlled substances": "tier 3",
    "child abduction": "tier 3",
    "first degree murder": "tier 4",
    "aggravated robbery": "tier 4",
    "accepting bribes (financial) as a governmental employee of the us": "tier 4",
    "counterfeiting": "tier 4",
    "voluntary manslaughter": "tier 4",
    "mail theft": "tier 4",
    "child sexual abuse": "tier 4",
    "kickback scheme (financial)": "tier 4",
    "malware": "tier 4",
    "bribery (financial) of a public official": "tier 4",
    "involuntary manslaughter": "tier 4",
    "assault with a special victim": "tier 4",
    "violation of the false claims act": "tier 4",
    "providing material support to terrorists": "tier 4",
    "dogfighting": "tier 4",
    "armed robbery": "tier 5",
    "terrorism financing": "tier 5",
    "sex trafficking of children": "tier 5",
    "elder abuse": "tier 5",
    "vehicular manslaughter": "tier 5",
    "child pornography": "tier 5",
    "ransomware": "tier 5",
    "child enticement": "tier 5"
}

#### (failed shap attempt)

In [None]:
import shap
import numpy as np

# def run_shap_for_selected_topics(model_store, selected_topics, X_val):
    """Run SHAP analysis on a subset of topics from the trained model.

    Args:
        model_store (dict): Dictionary containing trained models per tier.
        selected_topics (dict): Dictionary mapping selected topics to their tiers (e.g., {'topic1': 'tier 2', 'topic2': 'tier 3'}).
        X_val (pd.Series): Validation text data (raw text before vectorization).

    Returns:
        dict: SHAP values per topic.
    """
    # shap_results = {}

    # for topic, tier in selected_topics.items():
        # if tier not in model_store:
            # print(f"Tier '{tier}' not found in model store. Skipping topic '{topic}'.")
            # continue

        # for model_name, pipeline in model_store[tier].items():
            # Extract vectorizer and classifier
            # vectorizer = pipeline.named_steps['vectorizer']
            # classifier = pipeline.named_steps['classifier']

            # Ensure classifier is OneVsRestClassifier and extract the specific estimator for this topic
            # if isinstance(classifier, OneVsRestClassifier):
                # try:
                    # topic_idx = classifier.classes_.tolist().index(topic)
                    # topic_model = classifier.estimators_[topic_idx]
                # except ValueError:
                    # print(f"Topic '{topic}' not found in model '{model_name}' (Tier: {tier}).")
                    # continue
            # else:
                # print(f"Model '{model_name}' is not OneVsRestClassifier. Skipping topic '{topic}'.")
                # continue

            # Transform validation data
            # X_val_transformed = vectorizer.transform(X_val)

            # Run SHAP only for the selected topic's classifier
            # explainer = shap.Explainer(topic_model, X_val_transformed)
            # shap_values = explainer(X_val_transformed)

            # shap_results[(tier, topic, model_name)] = shap_values

            # print(f"SHAP computed for topic '{topic}' (Tier: {tier}, Model: {model_name}).")

    # return shap_results


## Get top 20 most important features for the selected topics

In [None]:
for topic, tier in selected_topics.items():
    print(f"Processing topic: {topic}, tier: {tier}")
    
    print("Available tiers in y_val_store:", list(y_val_store.keys()))

    # Debugging print
    print(f"DEBUG: y_val_store[{tier}] columns: {list(y_val.columns)}")

    y_val = y_val_store[tier]
    
    # Debugging print
    print(f"DEBUG: y_val_store[{tier}] columns: {list(y_val.columns)}")

    # Check if topic exists in y_val.columns
    if topic not in y_val.columns:
        print(f"Topic '{topic}' is not in the validation labels for tier '{tier}'. Skipping.")
        continue  # Skip if the topic isn't found in the validation data

    # Get the correct validation data and model for this topic
    X_val = X_val_store[tier]
    model = model_store[tier]['XGBoost']  # Or 'Decision Tree'

    # Transform text data using the model's vectorizer
    vectorizer = model.named_steps['vectorizer']
    X_val_transformed = vectorizer.transform(X_val).toarray()  # Convert to dense array

    # Ensure the order of topics in the validation set matches the order of classifiers in OneVsRestClassifier
    topic_index = list(y_val.columns).index(topic)  # Get the index of the topic in y_val

    # Access the classifier for this topic in the OneVsRestClassifier
    classifier = model.named_steps['classifier'].estimators_[topic_index]

    print(f"y_val_store for tier {tier}: {y_val.columns.tolist()}")

    # Get the feature importances (if available in the classifier)
    if hasattr(classifier, 'feature_importances_'):
        feature_importances = classifier.feature_importances_
    else:
        print(f"Error: The classifier for topic '{topic}' does not have 'feature_importances_'")
        continue  # Skip if the classifier doesn't have feature importances

    # Get the feature names from CountVectorizer
    feature_names = vectorizer.get_feature_names_out()

    # Create a DataFrame of feature importances
    feature_importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': feature_importances
    })

    # Sort the DataFrame by importance
    feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
    
    # Display top N important features
    top_n = 20  # Adjust as needed
    print(f"Top {top_n} important features for topic {topic}")
    print(feature_importance_df.head(top_n))

    
    # Print unique feature names for debugging
    # print(f"Feature Names for {topic}: {vectorizer.get_feature_names_out()[:10]}")

    # Ensure SHAP handles multi-label classification (OneVsRest)
    # explainer = shap.Explainer(classifier, X_val_transformed)

    # Compute SHAP values
    # shap_values = explainer(X_val_transformed)

    # SHAP summary plot
    # shap.summary_plot(shap_values, X_val_transformed, feature_names=vectorizer.get_feature_names_out())


Processing topic: financial crimes, tier: tier 1
Available tiers in y_val_store: ['tier 1', 'tier 2', 'tier 3', 'tier 4', 'tier 5', 'tier 6', 'tier 7']
DEBUG: y_val_store[tier 1] columns: ['financial crimes', 'contempt of court', 'environmental crimes', 'criminal threats', 'traffic crimes', 'civil rights violation', 'violent crimes', 'criminal tresspassing', 'crimes against animals', 'making false statements', 'immigration offenses', 'corruption', 'record falsification', 'property damage', 'human trafficking', 'smuggling of goods', 'crimes against children', 'cybercrime', 'election crimes', 'perjury', 'organized crime', 'obstruction of justice', 'theft crimes', 'weapons crimes', 'sex crimes', 'providing or possessing contraband in prison', 'kidnapping', 'identity crimes', 'fraudulent/counterfeit goods', 'criminal harassment', 'forgery (non-financial/identity)', 'crimes against the government', 'drug crimes', 'disorderly conduct']
DEBUG: y_val_store[tier 1] columns: ['financial crimes',

# Streamlit App for Demo

## Store models for selected topics

In [None]:
import streamlit as st
import pickle
import numpy as np

# Load pre-trained models and vectorizers
def load_models(selected_topics, model_store):
    loaded_models = {}
    for topic, tier in selected_topics.items():
        if tier in model_store and 'XGBoost' in model_store[tier]:
            model = model_store[tier]['XGBoost']
            vectorizer = model.named_steps['vectorizer']
            classifier = model.named_steps['classifier']
            topic_index = list(y_val_store[tier].columns).index(topic)  # Get index for the topic
            loaded_models[topic] = (vectorizer, classifier.estimators_[topic_index])  # Store vectorizer & specific classifier
    return loaded_models

# Load only the models for selected topics
loaded_models = load_models(selected_topics, model_store)

import joblib
joblib.dump(loaded_models, "gamma_models_selected.pkl")


Saved files: ['all_mugshot_freq.csv', 'Discord.dmg', 'model_final.pkl', 'muc34 2', 'gamma_model3.db', 'googlechrome.dmg', 'traffic_strings_2.csv', 'merged_df.csv', '231221x.csv', 'Content Ingest Dataset Schema.docx', 'AllCodes012418.xlsx', 'beta_model8-2.ipynb', 'gamma_models_selected.pkl', 'topics_relabel_rules.ipynb', 'gamma_model7.db', 'train.jsonl', 'shapFP2.ipynb', 'labeled_data_for_tiers_0228', 'artifacts', 'RAMS-json-reader.ipynb', 'tier1_data.csv', 'HR-filtered.ipynb', 'labeled_tasks.jsonl.gz', 'gamma_model6.db', 'gamma_model.db', 'Untitled-1-2.ipynb', 'fbd_06062024_ranks.csv', '.DS_Store', 'labeled_data_109', 'day_1_input.txt', 'Fidelity Welfare SPD 2023.pdf', 'filterdicts-final.ipynb', 'beta_model_countvect.ipynb', 'result_tier_df.csv', 'Untitled-3.ipynb', 'gamma_model2.db', 'Untitled-1.ipynb', 'function_classifier_evaluation.ipynb', 'nonrelevant_freq.ipynb', 'mugshots_expanded_yp.csv', '.localized', 'Screenshot 2024-07-09 at 11.21.04â\x80¯AM.png', 'topicEDA.ipynb', 'mugshot

### Verify model structure

In [None]:
print(model_store.keys())  # Should list tiers
print(model_store['tier 1'].keys())  # Should include 'XGBoost'
print(model_store['tier 1']['XGBoost'])  # Should be a Pipeline object


dict_keys(['tier 1', 'tier 2', 'tier 3', 'tier 4', 'tier 5', 'tier 6', 'tier 7'])
dict_keys(['Decision Tree', 'XGBoost'])
Pipeline(steps=[('vectorizer', CountVectorizer()),
                ('classifier',
                 OneVsRestClassifier(estimator=XGBClassifier(base_score=None,
                                                             booster=None,
                                                             callbacks=None,
                                                             colsample_bylevel=None,
                                                             colsample_bynode=None,
                                                             colsample_bytree=None,
                                                             device=None,
                                                             early_stopping_rounds=None,
                                                             enable_categorical=False,
                                                             ev

In [None]:
loaded_models = joblib.load("gamma_models_selected.pkl")
print(type(loaded_models))  # Should be dict
print(type(next(iter(loaded_models.values()))))  # Should be tuple
print(next(iter(loaded_models.values())))  # Should be (vectorizer, classifier)

<class 'dict'>
<class 'tuple'>
(CountVectorizer(), XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='logloss',
              feature_types=None, gamma=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=None, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=None,
              n_jobs=None, num_parallel_tree=None, random_state=None, ...))


## Write Streamlit app

In [12]:
%%writefile app.py
import streamlit as st
import joblib
import shap
import matplotlib.pyplot as plt
import numpy as np

@st.cache_resource
def load_models():
    return joblib.load("gamma_models_selected.pkl")

loaded_models = load_models() # Load once and reuse

# Streamlit UI
st.title("Gamma Topic Prediction")

# User input
text_input = st.text_area("Enter text to classify:", "")

# Define a probability threshold (cutoff) for displaying topics
probability_cutoff = 0.2  # Adjust this threshold as needed

if st.button("Predict Topics"):
    if text_input.strip():
        predictions = {}
        shap_explanations = {}

        for topic, model_tuple in loaded_models.items():
            if not isinstance(model_tuple, tuple):
                print(f"Error: Model for topic {topic} is not a tuple! Got {type(model_tuple)}")
                continue

            vectorizer, classifier = model_tuple  # Unpack the tuple
            # Transform input text using the stored vectorizer
            X_input_transformed = vectorizer.transform([text_input])

            # Ensure format is consistent with model training
            if hasattr(classifier, "predict_proba"):  # Ensure the classifier has predict_proba
                # Get probability of topic presence
                prob = classifier.predict_proba(X_input_transformed)[0][1]  # Probability of topic presence
                if prob >= probability_cutoff:  # Apply threshold
                    predictions[topic] = prob

                    # SHAP explanation
                    explainer = shap.TreeExplainer(classifier)
                    shap_values = explainer.shap_values(X_input_transformed)[0]

                    # Get feature names
                    feature_names = vectorizer.get_feature_names_out()

                    # Extract only words from the input text
                    input_words = set(text_input.lower().split())  # Get unique words from input
                    word_contributions = {
                        feature_names[i]: shap_values[i]  # Shap value per word
                        for i in range(len(feature_names))
                        if feature_names[i] in input_words  # Keep only words from input text
                    }

                    shap_explanations[topic] = word_contributions

        # Display results
        st.subheader("Prediction Results:")
        if predictions:
            for topic, prob in sorted(predictions.items(), key=lambda x: x[1], reverse=True):
                st.write(f"**{topic}**: {prob:.4f}")

                # Sort and get the top 10 SHAP feature contributions
                sorted_shap = sorted(shap_explanations[topic].items(), key=lambda x: abs(x[1]), reverse=True)[:10]
                words, impacts = zip(*sorted_shap)

                # Matplotlib bar chart
                fig, ax = plt.subplots()
                ax.barh(words, impacts, color="skyblue")
                ax.set_xlabel("SHAP Value")
                ax.set_title(f"Top 10 Feature Contributions for {topic}")
                ax.invert_yaxis()  # Highest impact on top

                st.pyplot(fig)
                
        else:
            st.write("No topics met the probability cutoff.")

    else:
        st.warning("Please enter text before predicting.")

Overwriting app.py


# misc. code blocks (not being used)

#### (highlight max values code)

In [None]:
import pandas as pd

# Define function to highlight max values for each metric across models for each topic
def highlight_max(df, metric):
    """Highlight the maximum value for each topic across all models for a given metric."""
    # Select columns for the specified metric (Precision, Recall, or F1 Score)
    metric_columns = [col for col in df.columns if col[0] == metric]
    
    # Find the maximum value per topic for the given metric
    max_vals = df[metric_columns].max(axis=1)
    
    # Create an empty style dataframe with the same shape
    styles = pd.DataFrame('', index=df.index, columns=df.columns)
    
    # Loop through each topic and apply the highlighting to the max value
    for idx, row in df.iterrows():
        for col in metric_columns:
            if row[col] == max_vals[idx]:
                styles.at[idx, col] = 'background-color: yellow; color: black'  # Yellow background with black text
    
    return styles

# Apply the styling for Precision, Recall, and F1 Score
t1_df_styled = t1_df_metrics_sorted.style

# Iterate through each metric (Precision, Recall, F1 Score) and apply the highlight
for metric in ['Precision', 'Recall', 'F1 Score']:
    t1_df_styled = t1_df_styled.apply(highlight_max, metric=metric, axis=None)

# Display the styled DataFrame
t1_df_styled



Unnamed: 0_level_0,Precision,Precision,Precision,Precision,Recall,Recall,Recall,Recall,F1 Score,F1 Score,F1 Score,F1 Score
Unnamed: 0_level_1,Decision Tree,XGBoost,SVM,Random Forest,Decision Tree,XGBoost,SVM,Random Forest,Decision Tree,XGBoost,SVM,Random Forest
civil rights violation,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
contempt of court,0.677419,1.0,1.0,1.0,0.65625,0.8125,0.09375,0.53125,0.666667,0.896552,0.171429,0.693878
corruption,0.825581,0.971429,1.0,1.0,0.8875,0.85,0.0375,0.575,0.855422,0.906667,0.072289,0.730159
crimes against animals,0.781955,0.990566,1.0,0.962264,0.920354,0.929204,0.300885,0.902655,0.845528,0.958904,0.462585,0.931507
crimes against children,0.734463,0.911765,0.966667,0.989474,0.872483,0.832215,0.194631,0.630872,0.797546,0.870175,0.324022,0.770492
crimes against the government,0.4,1.0,1.0,1.0,0.222222,0.111111,0.0,0.222222,0.285714,0.2,0.0,0.363636
criminal harassment,0.909091,0.946667,1.0,0.981818,0.972222,0.986111,0.236111,0.75,0.939597,0.965986,0.382022,0.850394
criminal threats,0.898305,0.943396,1.0,1.0,0.768116,0.724638,0.014493,0.463768,0.828125,0.819672,0.028571,0.633663
criminal tresspassing,0.880435,0.89011,1.0,0.887097,0.870968,0.870968,0.150538,0.591398,0.875676,0.880435,0.261682,0.709677
cybercrime,0.745763,0.86,1.0,0.913043,0.676923,0.661538,0.015385,0.323077,0.709677,0.747826,0.030303,0.477273


#### (code to get avg metrics per model)

In [None]:
df_avg_metrics = df_metrics.mean().unstack(level=0)
df_avg_metrics

Unnamed: 0,Precision,Recall,F1 Score
Decision Tree,0.707088,0.74517,0.705126
XGBoost,0.896549,0.638549,0.686793
SVM,0.988388,0.112239,0.174514
Random Forest,0.973855,0.527907,0.628703


#### (code to get overall accuracy score per model)

In [None]:
from sklearn.metrics import accuracy_score

# Assuming you already have the 'trained_models' dictionary with fitted models
# and you have your validation data 'X_val' and true labels 'y_val'.

# Initialize an empty dictionary to store accuracy scores for each model
accuracy_results = {}

# Loop through the trained models and calculate accuracy
for model_name, model in trained_models.items():
    # Make predictions with the trained model on the validation set
    y_pred = model.predict(X_val)
    
    # Calculate the accuracy score
    accuracy = accuracy_score(y_val, y_pred)
    
    # Store the accuracy result
    accuracy_results[model_name] = accuracy

# Print the accuracy scores for each model
for model_name, accuracy in accuracy_results.items():
    print(f"{model_name} Accuracy: {accuracy:.4f}")


Decision Tree Accuracy: 0.6507
XGBoost Accuracy: 0.7438
SVM Accuracy: 0.3025
Random Forest Accuracy: 0.6686


### Checking how Spacy is tagging ents

In [None]:
# Function to extract entities from text
# def extract_entities(text):
    # doc = nlp(text)
    # return [(ent.text, ent.label_) for ent in doc.ents]

# Apply function to the DataFrame
# sample_df["entities"] = sample_df["snippet"].apply(extract_entities)

# Display results
# sample_df[["snippet", "entities"]]
# pd.options.display.max_colwidth = 500
# sample_df

Unnamed: 0,snippet,entities
0,"Cybercrimes have become more frequent and sophisticated, and the tools | and variables associated with this type of evidence. They use this expertise to assist clients at all stages of litigation or investigation. Ultimately, legal teams dealing with digital evidence are best served by consulting with these kinds of experts throughout the process. About the Author: Amanda Rankhorn is a Senior Vice President with Kroll, Cyber Risk, and a retired Special Agent/Senior Forensic Examiner of the F...","[(Cybercrimes, ORG), (Rankhorn, ORG), (Kroll, PERSON), (Cyber Risk, PERSON), (the Federal Bureau of Investigation, ORG), (FBI, ORG)]"
1,"A Boynton Beach man who stabbed a co-worker to death during an argument over a tire valve stem was convicted on Monday of manslaughter. On Feb. 14, Johnny Claude Colson got into a fight with James Lawrence Morton, 33, when each wanted the valve stem as they worked at Mackey Used Parts Tire and Auto Service in the 700 block of Northeast Third Street in Boynton Beach. Morton, of the 100 block of Northeast Fourth Avenue, hit Colson in the","[(Boynton Beach, GPE), (Monday, DATE), (Feb. 14, DATE), (Johnny Claude Colson, PERSON), (James Lawrence Morton, PERSON), (33, DATE), (Mackey Used Parts Tire and Auto Service, ORG), (700, CARDINAL), (Northeast Third Street, ORG), (Boynton Beach, GPE), (Morton, PERSON), (100, CARDINAL), (Northeast Fourth Avenue, LOC), (Colson, PERSON)]"
2,"Five suspected thieves working in groups were arrested at the | under arrest without incident,"" said Sgt. Dan Marshall of the Indio police. Police allegedly recovered more than 40 cellphones from the two groups, several forms of personal identification, credit cards and cash. The first group of suspects included Angela Trivino, 35, of New York and Viviana Hernandez, 38, of Los Angeles. The second group consisted of Brenda Cansino, 29, of Miami; Sharon Ruiz, 25, of Van Nuys; and Marco Leon, 2...","[(Five, CARDINAL), (Sgt, PERSON), (Dan Marshall, PERSON), (Indio, NORP), (more than 40, CARDINAL), (two, CARDINAL), (first, ORDINAL), (Angela Trivino, PERSON), (35, DATE), (New York, GPE), (Viviana Hernandez, GPE), (38, DATE), (Los Angeles, GPE), (second, ORDINAL), (Brenda Cansino, PERSON), (29, DATE), (Miami, GPE), (Sharon Ruiz, PERSON), (25, DATE), (Van Nuys, GPE), (Marco Leon, PERSON), (27, DATE), (Los Angeles, GPE), (the Indio Jail, ORG)]"
3,"The bribery and campaign corruption case against two West Palm | told authorities that Butler and campaign manager Brabham promised to keep him out of prison if he bankrolled Butler’s campaign for office. Butler and Brabham have been charged with bribery, conspiracy to commit bribery and violating campaign finance laws. In a hearing before Circuit Judge Edward Garrison, Butler and Brabham plan to argue that the bribery statute is unconstitutionally vague because it does not adequately define...","[(two, CARDINAL), (West Palm, GPE), (Butler, ORG), (Brabham, PERSON), (Butler, ORG), (Butler, ORG), (Brabham, ORG), (Edward Garrison, PERSON), (Butler, ORG), (Brabham, ORG), (Butler, ORG)]"
4,"Tech-savvy fraudsters stealing from the government's Covid pandemic relief programs | appropriate for us to comment further on those specific safeguards."" E-Trade did not respond to multiple emails and calls. Other fraud Some fraudsters who use online investment platforms don't even bother to steal an identity. In a recent case in Seattle, prosecutors charged tech executive Mukund Mohan with receiving a total of $5.5 million in PPP funds by submitting fraudulent loan applications. Court fili...","[(Covid, PERSON), (Seattle, GPE), (Mukund Mohan, PERSON), ($5.5 million, MONEY), (PPP, ORG), (231,471, MONEY), (Mohan, GPE), (Robinhood, PRODUCT), (Mohan, PERSON), (LinkedIn, ORG)]"
5,"1 of 47 JOHN ALLEN DEALING IN STOLEN PROPERTY 2 | 47 CRYSTAL JENNINGS GRAND THEFT 26 of 47 SYLVESTER JOHNSON GRAND THEFT- MOTOR VEH. < $100,000 27 of 47 TAMMY LAWTON BATTERY 28 of 47 TIMOTHY LUDWIG FLEE/ELUDE LEO AT HIGH SPEED 29 of 47 LEE ANN MOREY GRAND THEFT (MOTOR VEHICLE) 30 of 47 TREMALE MORGAN ASSAULT CRIMINAL MISCHIEF DOMESTIC BATTERY BY STRANGULATION TAMPERING WITH A WITNESS 31 of 47 CYNTHIA MOYNIHAN AGGRAVATED BATTERY (DEADLY WEAPON) 32 of 47 DILLON PARKER PETIT THEFT FROM MERCHANT...","[(1, CARDINAL), (47, CARDINAL), (JOHN ALLEN DEALING IN, PERSON), (2, CARDINAL), (47, CARDINAL), (26, CARDINAL), (47, CARDINAL), (100,000, MONEY), (47, CARDINAL), (TAMMY, ORG), (28, CARDINAL), (47, CARDINAL), (29, CARDINAL), (47, CARDINAL), (LEE, PERSON), (ANN MOREY GRAND THEFT, PRODUCT), (30, CARDINAL), (47, CARDINAL), (47, CARDINAL), (CYNTHIA MOYNIHAN AGGRAVATED BATTERY, PERSON), (DEADLY WEAPON, ORG), (32, CARDINAL), (47, CARDINAL), (33, CARDINAL), (47, CARDINAL), (28 GRAMS-200 GRAMS, QUANT..."
6,"A Longmont woman is facing felony forgery and theft charges after allegedly stealing more than $20,000 from the company she worked for in Louisville. Jordan Danielle Baiza, 30, was arrested Wednesday on suspicion of forgery, theft between $20,000 to $100,000 and identity theft. Baiza admitted to stealing money while working as a bookkeeper for Columbine Plastics on Boxelder Street in Louisville, according to an arrest affidavit. She used a","[(Longmont, ORG), (more than $20,000, MONEY), (Louisville, GPE), (Jordan Danielle Baiza, PERSON), (30, DATE), (Wednesday, DATE), (between $20,000 to $100,000, MONEY), (Baiza, PERSON), (Columbine Plastics, ORG), (Boxelder Street, FAC), (Louisville, GPE)]"
7,"A key player in a $13 million international fraud scam | fraud, wire fraud and conspiracy to launder money. James Leonard Smith, 54, of Midlothian, a co-defendant, is set to be sentenced on May 27. Johnson was sentenced Friday by U.S. District Judge Henry E. Hudson. In December, Hudson sentenced a third man in the scheme, Stuart Jay Anderson, 52, a lawyer from Aliso Viejo, Calif., to four years and ordered him to pay $5,715,578.21 in restitution. A fourth man indicted in the case, Brian Mich...","[($13 million, MONEY), (James Leonard Smith, PERSON), (54, DATE), (Midlothian, GPE), (May 27, DATE), (Johnson, PERSON), (Friday, DATE), (U.S., GPE), (Henry E. Hudson, PERSON), (December, DATE), (Hudson, ORG), (third, ORDINAL), (Stuart Jay Anderson, PERSON), (52, DATE), (Viejo, PERSON), (Calif., GPE), (four years, DATE), (5,715,578.21, MONEY), (fourth, ORDINAL), (Brian Michael Bridge, PERSON), (London, GPE), (Chimera Group Ltd., ORG)]"
8,"A small yet growing number of business owners are facing | was charged last month with bank fraud and making false statements to a financial institution after submitting PPP loan applications on behalf of several different companies he owned. Authorities allege Hines was awarded $3.9 million, some of which he used to buy a Lamborghini. - Carlos Belone of Coconut Creek, Florida, was charged with wire fraud and conspiracy to commit health care fraud. Belone allegedly obtained $22,000 in PPP lo...","[(last month, DATE), (PPP, ORG), (Hines, ORG), ($3.9 million, MONEY), (Lamborghini, NORP), (Coconut Creek, GPE), (Florida, GPE), (22,000, MONEY), (PPP, ORG), (Massachusetts, GPE)]"
9,"24 Boston Gang Members and Associates Charged with Cocaine Trafficking | departments, today’s arrests would not have been possible. Feel free to thank them for what they do.” “Drug trafficking, along with the violence that all too often accompanies it, is a serious threat to the safety and security of our communities,” said Acting DEA Administrator Timothy J. Shea. “Drug dealers and street gangs value their own profits over human life, and are responsible for fueling drug addiction and much ...","[(24, CARDINAL), (Cocaine, ORG), (today, DATE), (DEA, ORG), (Timothy J. Shea, PERSON), (America, GPE), (DEA, ORG), (Rachael Rollins, PERSON), (Norfolk County District, GPE), (Michael Morrissey, PERSON), (Bristol County District, GPE), (Thomas Quinn III, PERSON), (Suffolk County, GPE), (Steven W. Tompkins, PERSON), (Plymouth County, GPE), (Joseph D. McDonald Jr., PERSON), (Norfolk County, FAC), (Jerome P. McDermott, PERSON), (today, DATE), (U.S., GPE), (Kaitlin, PERSON), (Timothy Moran, PERSO..."


In [None]:
# Define features (X) and multi-label targets (y)
X = data["clean_snippet"]
y=data.iloc[:,3:] # All topic columns (binary indicators)

# Display dataset structure
print(f"Feature shape: {X.shape}, Target shape: {y.shape}")
print(y.head())

Feature shape: (10362,), Target shape: (10362, 44)
   smuggling of goods  weapons crimes  \
0                 0.0             0.0   
1                 0.0             0.0   
2                 0.0             0.0   
3                 0.0             0.0   
4                 0.0             0.0   

   providing or possessing contraband in prison  corruption  \
0                                           0.0         0.0   
1                                           0.0         0.0   
2                                           0.0         0.0   
3                                           0.0         1.0   
4                                           0.0         0.0   

   food stamp trafficking  cybercrime  property damage  \
0                     0.0         0.0              0.0   
1                     0.0         0.0              0.0   
2                     0.0         0.0              0.0   
3                     0.0         0.0              0.0   
4                     0.0        