In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("uciml/sms-spam-collection-dataset")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/uciml/sms-spam-collection-dataset?dataset_version_number=1...


100%|██████████| 211k/211k [00:00<00:00, 627kB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/uciml/sms-spam-collection-dataset/versions/1





In [3]:
import kagglehub
import pandas as pd

# Download dataset
path = kagglehub.dataset_download("uciml/sms-spam-collection-dataset")

# Load dataset
df = pd.read_csv(path + "/spam.csv", encoding="ISO-8859-1")

# Keep only required columns
df = df.iloc[:, :2]
df.columns = ["label", "message"]

# Drop missing values & duplicates
df = df.dropna().drop_duplicates()

# Display first few rows
df.head()


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Calculate message lengths
df["msg_length"] = df["message"].apply(len)

# IQR Method for Outlier Removal
q1, q3 = df["msg_length"].quantile([0.25, 0.75])
iqr = q3 - q1
lower_bound, upper_bound = q1 - 1.5 * iqr, q3 + 1.5 * iqr

# Filter messages within length range
df = df[(df["msg_length"] >= lower_bound) & (df["msg_length"] <= upper_bound)]

# Drop the length column
df = df.drop(columns=["msg_length"])

# Display cleaned dataset
df.head()


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
import re

# Function to clean text
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\W+', ' ', text)  # Remove special characters
    text = text.strip()  # Remove extra spaces
    return text

# Apply preprocessing
df["cleaned_message"] = df["message"].apply(preprocess_text)

# Display cleaned messages
df.head()


Unnamed: 0,label,message,cleaned_message
0,ham,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah i don t think he goes to usf he lives arou...


In [6]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Load sentence transformer model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Generate embeddings
embeddings = model.encode(df["cleaned_message"].tolist(), convert_to_numpy=True)

# Save embeddings as a NumPy file
np.save("sms_embeddings.npy", embeddings)

# Print embedding shape
print("Embeddings shape:", embeddings.shape)  # (num_samples, embedding_dim)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embeddings shape: (5103, 384)


In [8]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m54.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [9]:
import faiss

# Define embedding dimension
embedding_dim = embeddings.shape[1]

# Create FAISS L2 index
index = faiss.IndexFlatL2(embedding_dim)

# Add embeddings to index
index.add(embeddings)

# Save FAISS index to disk
faiss.write_index(index, "sms_faiss_index.bin")

print("Stored", index.ntotal, "embeddings in FAISS.")


Stored 5103 embeddings in FAISS.


In [10]:
# Function to perform similarity search
def search_similar_messages(query_text, top_k=5):
    query_embedding = model.encode([preprocess_text(query_text)])
    distances, indices = index.search(query_embedding, k=top_k)

    print("\n🔹 Top 5 Similar Messages:")
    for idx, distance in zip(indices[0], distances[0]):
        print(f"📌 Message: {df.iloc[idx]['message']}")
        print(f"   🔹 Distance: {distance}\n")

# Example query
search_similar_messages("Congratulations! You've won a free prize.")



🔹 Top 5 Similar Messages:
📌 Message: Congratulations YOU'VE Won. You're a Winner in our August å£1000 Prize Draw. Call 09066660100 NOW. Prize Code 2309.
   🔹 Distance: 0.7145818471908569

📌 Message: You have won a guaranteed å£200 award or even å£1000 cashto claim UR award call free on 08000407165 (18+) 2 stop getstop on 88222 PHP
   🔹 Distance: 0.7666818499565125

📌 Message: You have won a guaranteed å£200 award or even å£1000 cashto claim UR award call free on 08000407165 (18+) 2 stop getstop on 88222 PHP. RG21 4JX
   🔹 Distance: 0.7864023447036743

📌 Message: Congratulations ur awarded either å£500 of CD gift vouchers & Free entry 2 our å£100 weekly draw txt MUSIC to 87066 TnCs www.Ldew.com 1 win150ppmx3age16
   🔹 Distance: 0.8074848651885986

📌 Message: IMPORTANT INFORMATION 4 ORANGE USER 0796XXXXXX. TODAY IS UR LUCKY DAY!2 FIND OUT WHY LOG ONTO http://www.urawinner.com THERE'S A FANTASTIC PRIZEAWAITING YOU!
   🔹 Distance: 0.8156170845031738



In [13]:
# Install the libraries
!pip install crewai
!pip install langchain
!pip install groq


Collecting crewai
  Downloading crewai-0.105.0-py3-none-any.whl.metadata (28 kB)
Collecting appdirs>=1.4.4 (from crewai)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting auth0-python>=4.7.1 (from crewai)
  Downloading auth0_python-4.8.1-py3-none-any.whl.metadata (9.0 kB)
Collecting chromadb>=0.5.23 (from crewai)
  Downloading chromadb-0.6.3-py3-none-any.whl.metadata (6.8 kB)
Collecting instructor>=1.3.3 (from crewai)
  Downloading instructor-1.7.4-py3-none-any.whl.metadata (19 kB)
Collecting json-repair>=0.25.2 (from crewai)
  Downloading json_repair-0.39.1-py3-none-any.whl.metadata (11 kB)
Collecting json5>=0.10.0 (from crewai)
  Downloading json5-0.10.0-py3-none-any.whl.metadata (34 kB)
Collecting jsonref>=1.1.0 (from crewai)
  Downloading jsonref-1.1.0-py3-none-any.whl.metadata (2.7 kB)
Collecting litellm==1.60.2 (from crewai)
  Downloading litellm-1.60.2-py3-none-any.whl.metadata (36 kB)
Collecting opentelemetry-api>=1.22.0 (from crewai)
  Downloading o

In [15]:
!pip install crewai_tools --quiet


[31mERROR: Operation cancelled by user[0m[31m
[0m

In [19]:
import os
import time
import litellm
import faiss
import pandas as pd
from sentence_transformers import SentenceTransformer
from crewai import Agent, Task, Crew, Process, LLM

# Bypass OpenAI API Key Check (CrewAI sometimes requires it)
os.environ["OPENAI_API_KEY"] = "dummy_key"

# Initialize LLM
api_key = "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"  # Replace with your actual key
llm = LLM(model="groq/mixtral-8x7b-32768", api_key=api_key)

# Load model and FAISS index - you already have these
model = SentenceTransformer("all-MiniLM-L6-v2")
index = faiss.read_index("sms_faiss_index.bin")


# Use your existing preprocess_text function
def preprocess_text(text):
    text = text.lower()
    text = text.strip()
    return text

# Use your existing search function
def search_similar_messages(query_text, top_k=5):
    query_embedding = model.encode([preprocess_text(query_text)])
    distances, indices = index.search(query_embedding, k=top_k)

    similar_messages = []
    for idx, distance in zip(indices[0], distances[0]):
        similar_messages.append({
            "message": df.iloc[idx]['message'],
            "label": df.iloc[idx]['label'],
            "distance": float(distance)
        })

    return similar_messages

# Create CrewAI Agents
fraud_analyst = Agent(
    role="SMS Fraud Analyst",
    goal="Analyze messages to determine if they're fraudulent or spam",
    backstory="""You are an expert in detecting fraudulent SMS messages.
    You've analyzed thousands of spam and legitimate messages and can
    identify subtle patterns that indicate fraud attempts.""",
    llm=llm,
    verbose=True
)

pattern_expert = Agent(
    role="Linguistic Pattern Expert",
    goal="Identify linguistic red flags in potentially fraudulent messages",
    backstory="""You specialize in recognizing deceptive language patterns
    in text messages. You can detect emotional manipulation, urgency tactics,
    too-good-to-be-true offers, and inconsistencies that indicate fraud.""",
    llm=llm,
    verbose=True
)

# Function to create tasks for the agents
def create_fraud_analysis_task(query, similar_messages):
    formatted_messages = "\n\n".join([
        f"Message: {msg['message']}\nLabel: {msg['label']}\nSimilarity: {msg['distance']:.4f}"
        for msg in similar_messages
    ])

    return Task(
        description=f"""
        Analyze if this message is fraudulent:

        USER MESSAGE: {query}

        SIMILAR MESSAGES FROM DATABASE:
        {formatted_messages}

        Provide a fraud analysis with:
        1. Content analysis for suspicious elements
        2. Comparison to known spam/ham patterns
        3. Clear verdict (FRAUD or LEGITIMATE) with explanation
        """,
        expected_output="A detailed fraud analysis with verdict",
        agent=fraud_analyst
    )

def create_pattern_analysis_task(query, similar_messages):
    formatted_messages = "\n\n".join([
        f"Message: {msg['message']}\nLabel: {msg['label']}\nSimilarity: {msg['distance']:.4f}"
        for msg in similar_messages
    ])

    return Task(
        description=f"""
        Identify linguistic patterns that may indicate fraud:

        USER MESSAGE: {query}

        SIMILAR MESSAGES FROM DATABASE:
        {formatted_messages}

        Analyze for:
        1. Urgency tactics and pressure
        2. Emotional manipulation
        3. Grammatical red flags
        4. Information gaps or vagueness
        """,
        expected_output="A linguistic pattern analysis",
        agent=pattern_expert
    )

# Function to handle rate limits
def execute_with_retry(crew, retries=3):
    for attempt in range(retries):
        try:
            return crew.kickoff()
        except litellm.RateLimitError:
            wait_time = 10
            print(f"Rate limit exceeded. Waiting {wait_time} seconds...")
            time.sleep(wait_time)
    print("Too many retries. Please try again later.")
    return None

# Main fraud detection function
def detect_sms_fraud(user_query):
    # Get similar messages using your existing FAISS index
    similar_messages = search_similar_messages(user_query)

    # Display similar messages
    print("\nTop 5 Similar Messages:")
    for i, msg in enumerate(similar_messages):
        print(f"Message {i+1}: {msg['message']}")
        print(f"Label: {msg['label']}")
        print(f"Similarity: {msg['distance']:.4f}\n")

    # Create tasks
    analysis_task = create_fraud_analysis_task(user_query, similar_messages)
    pattern_task = create_pattern_analysis_task(user_query, similar_messages)

    # Create and run the crew
    crew = Crew(
        agents=[fraud_analyst, pattern_expert],
        tasks=[analysis_task, pattern_task],
        process=Process.sequential,
        verbose=True
    )

    # Execute with retry logic
    result = execute_with_retry(crew)

    return result

# Main execution block
if __name__ == "__main__":
    print("SMS Fraud Detection System")
    print("==========================")

    while True:
        user_input = input("\nEnter an SMS message to analyze (or 'quit' to exit): ")
        if user_input.lower() == 'quit':
            break

        print("\nAnalyzing message...")
        result = detect_sms_fraud(user_input)

        if result:
            print("\nFRAUD ANALYSIS RESULTS:")
            print("=======================")
            print(result)

SMS Fraud Detection System

Enter an SMS message to analyze (or 'quit' to exit): hi,you won 50,000 dollars!

Analyzing message...

Top 5 Similar Messages:
Message 1: You have won ?1,000 cash or a ?2,000 prize! To claim, call09050000327
Label: spam
Similarity: 0.9290

Message 2: Win a å£1000 cash prize or a prize worth å£5000
Label: spam
Similarity: 0.9470

Message 3: You have won ?1,000 cash or a ?2,000 prize! To claim, call09050000327. T&C: RSTM, SW7 3SS. 150ppm
Label: spam
Similarity: 1.0055

Message 4: Call from 08702490080 - tells u 2 call 09066358152 to claim å£5000 prize. U have 2 enter all ur mobile & personal details @ the prompts. Careful!
Label: spam
Similarity: 1.0118

Message 5: Hi..i got the money da:)
Label: ham
Similarity: 1.0171

[1m[94m 
[2025-03-14 16:15:33][🚀 CREW 'CREW' STARTED, B848EB1A-B905-43D4-9DCC-7972F63F7CAB]: 2025-03-14 16:15:33.692469[00m
[1m[94m 
[2025-03-14 16:15:33][📋 TASK STARTED: 
        ANALYZE IF THIS MESSAGE IS FRAUDULENT:
        
        USE