In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from bs4 import BeautifulSoup
def extract_domains(file_name: str) -> pd.DataFrame:
    """
    Extract domain entries from a saved HTML page and return them as a DataFrame.

    The file is parsed as HTML and the text of every element whose class
    attribute is "blob-code blob-code-inner js-file-line" is collected.
    NOTE(review): that class string looks like GitHub's file-view markup,
    i.e. one element per line of the listed file -- confirm against the input.

    Parameters:
        file_name (str): Path to the HTML file to parse.

    Returns:
        DataFrame: A single-column DataFrame ('domains') holding the stripped
        text of each matching element, in document order, without empty entries.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(file_name, "r", encoding="utf-8") as f:
        html = f.read()

    bs = BeautifulSoup(html, "html.parser")
    elements = bs.find_all(class_="blob-code blob-code-inner js-file-line")

    # Elements without any text would otherwise become empty "domains".
    texts = (element.get_text(strip=True) for element in elements)
    data = [text for text in texts if text]

    return pd.DataFrame(data, columns=['domains'])
# Dataset reference: https://data.mendeley.com/datasets/gwnwdgzts3/1
# NOTE(review): presumably the origin of the clean list loaded below -- confirm.
# extract_domains parses this ".txt" file as HTML, so it is expected to be a saved web page.
safe_domains = extract_domains("/kaggle/input/domains-list/clean-alexa-32k.txt")

In [None]:
# The file has no header row; its single column is named 'domains'.
malicious_domains = pd.read_csv(
    '/kaggle/input/domains-list/malicious-domains-20240718.txt',
    header=None,
    names=['domains'],
)

In [None]:
# Draw a reproducible subsample from each list. A fixed seed makes
# Restart & Run All select the same domains every time, and the size is capped
# at the list length so a short list does not raise.
SAMPLE_SIZE = 150
RANDOM_SEED = 42

sub_safe_domains = (
    safe_domains
    .sample(n=min(SAMPLE_SIZE, len(safe_domains)), random_state=RANDOM_SEED)
    .reset_index(drop=True)
)
sub_malicious_domains = (
    malicious_domains
    .sample(n=min(SAMPLE_SIZE, len(malicious_domains)), random_state=RANDOM_SEED)
    .reset_index(drop=True)
)

**AlienVault**

In [None]:
# whois only
# https://otx.alienvault.com/assets/static/external_api.html#
import requests
def get_domain_data(api_key, domain, section, timeout=30):
    """
    Fetch one section of data for a given domain using the AlienVault API.

    Parameters:
        api_key (str): Your AlienVault API key.
        domain (str): The domain to fetch data for.
        section (str): The indicator section to request (e.g. 'whois'); it is
            appended to the request URL after the domain.
        timeout (float | tuple): Seconds to wait for the server before giving
            up; passed straight to requests.get. Defaults to 30.

    Returns:
        dict: The decoded JSON response if successful, otherwise None.
    """
    url = f"https://otx.alienvault.com/api/v1/indicators/domain/{domain}/{section}"
    headers = {
        'X-OTX-API-KEY': api_key  # The API key travels in a request header
    }

    try:
        # Without a timeout a single unresponsive request would block the whole run.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Turn HTTP error statuses into exceptions
        return response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON on requests versions
        # whose JSON decode error is not a RequestException.
        print(f"Error fetching data for domain {domain}: {e}")
        return None

def fetch_data_for_domains(domains, api_key, section):
    """
    Query the API for every domain and collect the successful responses in a DataFrame.

    Parameters:
        domains (list): Domain names to look up, processed in order.
        api_key (str): Your AlienVault API key.
        section (str): Section name forwarded to get_domain_data for each domain.

    Returns:
        DataFrame: Built from a {domain: response} mapping and transposed, so
        each domain that produced a truthy response becomes one row.
    """
    # Lazily pair each domain with its response; requests are issued in input order.
    responses = (
        (domain, get_domain_data(api_key, domain, section))
        for domain in domains
    )

    # Keep only the domains for which something usable came back.
    data_dict = {domain: payload for domain, payload in responses if payload}

    return pd.DataFrame.from_dict(data_dict).T

Indicator page API for domain names

Example domains: 'rghost.net', 'spywaresite.info'

Sections:

* general: General information about the domain, including any pulses, and a list of the other sections currently available for this domain.
* geo: A more verbose listing of geographic data (Country code, coordinates, etc.)
* malware: Malware samples analyzed by LevelBlue Labs which have been observed connecting to this domain.
* url_list: URLs analyzed by LevelBlue Labs on this domain.
* passive_dns: Passive DNS records observed by LevelBlue Labs pointing to this domain.
* whois: Whois records for the domain.
* http_scans: Metadata for HTTP(S) connections to the domain.

In [None]:
import json

def json_to_string(json_obj):
    """
    Render a JSON object, or a list of them, as a string of "key: value" pairs.

    :param json_obj: A dict, a list, or a string holding the JSON encoding of either
    :return: For a dict, the result of process_dict; for a list, the per-item
             renderings joined with '; ' (non-dict items are passed through str());
             otherwise a fixed error message
    """
    parsed = json_obj

    # Strings are decoded first; a string that is not valid JSON ends processing here.
    if isinstance(parsed, str):
        try:
            parsed = json.loads(parsed)
        except json.JSONDecodeError as e:
            print(f"Failed to parse JSON string: {e}")
            return "Invalid JSON string"
        print("Successfully parsed JSON string")

    if isinstance(parsed, dict):
        return process_dict(parsed)

    if isinstance(parsed, list):
        rendered_items = []
        for item in parsed:
            if isinstance(item, dict):
                rendered_items.append(process_dict(item))
            else:
                rendered_items.append(str(item))
        return '; '.join(rendered_items)

    # Anything else (numbers, None, ...) is reported and replaced by an error message.
    print("Input is not a list or dictionary")
    return "Input is not a valid JSON object or list of objects"

def process_dict(d):
    """Format a dictionary as "<key>: <value>" using its 'key' and 'value' entries.

    Returns the fixed message "Invalid dictionary format" when either entry is missing.
    """
    has_both_entries = 'key' in d and 'value' in d
    return f"{d['key']}: {d['value']}" if has_both_entries else "Invalid dictionary format"


In [None]:
# Read the AlienVault API key from the OTX_API_KEY environment variable so a
# real key never has to be written into the notebook. The placeholder below is
# only used when the variable is not set.
api_key = os.environ.get('OTX_API_KEY', 'AlienVaultKey')
section = 'whois'

def process_domains(domains_df, label, api_key, section):
    """
    Process a DataFrame of domains, fetch data for them, and prepare the result DataFrame.

    :param domains_df: DataFrame containing domains in a 'domains' column
    :param label: Label for the domains ('safe' or 'malicious')
    :param api_key: API key for fetching data
    :param section: Section to fetch data from
    :return: The fetched data with the domain moved from the index into a
             'domain' column, plus a 'text' column (string rendering of the
             'data' column) and a constant 'label' column
    """
    domains_list = domains_df['domains'].tolist()

    # Fetch data for the specified domains
    fetched = fetch_data_for_domains(domains_list, api_key, section)

    # The fetched frame is indexed by domain; expose that as a regular column
    df = fetched.reset_index(names='domain')

    if 'data' in df.columns:
        df['text'] = df['data'].apply(json_to_string)
    else:
        # Nothing was fetched, or the responses carry no 'data' field: use an
        # empty text instead of failing with a KeyError.
        df['text'] = ''
    df['label'] = label

    return df

# Process safe domains
safe_df = process_domains(sub_safe_domains, 'safe', api_key, section)

# Process malicious domains
malicious_df = process_domains(sub_malicious_domains, 'malicious', api_key, section)

# Response columns that are not carried into the combined frame. They are
# dropped only when present, so a response without 'count' or 'related'
# does not abort the run.
DROPPED_COLUMNS = ['data', 'count', 'related']

# Stack the safe and malicious frames into one frame with a fresh index
combined_df = pd.concat(
    [
        safe_df.drop(columns=DROPPED_COLUMNS, errors='ignore'),
        malicious_df.drop(columns=DROPPED_COLUMNS, errors='ignore'),
    ],
    ignore_index=True,
)




In [None]:
# Keep only the rows whose rendered text is not the empty string.
has_text = combined_df['text'].ne('')
combined_df1 = combined_df.loc[has_text]

In [None]:
# Show how many rows remain for each label after the empty-text filter.
combined_df1['label'].value_counts()

In [None]:
# Display the filtered frame as the cell output for a visual check.
combined_df1

In [None]:
# Write the filtered frame to the working directory as CSV, without the index column.
output_csv_path = '/kaggle/working/combined_data.csv'
combined_df1.to_csv(output_csv_path, index=False)