In [11]:
from scipy.io import arff
import numpy as np
import json
from sklearn.model_selection import train_test_split, KFold

In [13]:
# Parse the ARFF file: loadarff returns a pair of (records, metadata).
with open('/content/sample_data/dataset.arff', 'r') as f:
    loaded = arff.loadarff(f)
phish_df, meta = loaded

# Keep an independent NumPy copy of the parsed records for the later cells.
data = np.array(phish_df)

# Show the records (NumPy abbreviates long arrays with "...").
print(data)

[(b'-1', b'1', b'1', b'1', b'-1', b'-1', b'-1', b'-1', b'-1', b'1', b'1', b'-1', b'1', b'-1', b'1', b'-1', b'-1', b'-1', b'0', b'1', b'1', b'1', b'1', b'-1', b'-1', b'-1', b'-1', b'1', b'1', b'-1', b'-1')
 (b'1', b'1', b'1', b'1', b'1', b'-1', b'0', b'1', b'-1', b'1', b'1', b'-1', b'1', b'0', b'-1', b'-1', b'1', b'1', b'0', b'1', b'1', b'1', b'1', b'-1', b'-1', b'0', b'-1', b'1', b'1', b'1', b'-1')
 (b'1', b'0', b'1', b'1', b'1', b'-1', b'-1', b'-1', b'-1', b'1', b'1', b'-1', b'1', b'0', b'-1', b'-1', b'-1', b'-1', b'0', b'1', b'1', b'1', b'1', b'1', b'-1', b'1', b'-1', b'1', b'0', b'-1', b'-1')
 ...
 (b'1', b'-1', b'1', b'1', b'1', b'-1', b'1', b'-1', b'-1', b'1', b'1', b'1', b'1', b'0', b'-1', b'-1', b'1', b'1', b'0', b'1', b'1', b'1', b'1', b'1', b'1', b'1', b'-1', b'1', b'0', b'1', b'-1')
 (b'-1', b'-1', b'1', b'1', b'1', b'-1', b'-1', b'-1', b'1', b'-1', b'1', b'1', b'-1', b'-1', b'1', b'-1', b'1', b'1', b'0', b'-1', b'1', b'-1', b'1', b'1', b'1', b'1', b'-1', b'1', b'1', b'1', b'

In [15]:
# Dataset size: one record per datapoint; every field except the last
# (which the later cells use as the target) counts as a feature.
n_datapoints = data.shape[0]
n_features = len(data.dtype) - 1
print('The dataset has {0} datapoints with {1} features'.format(n_datapoints, n_features))

# Feature names come from the ARFF metadata, again dropping the last column.
feature_names = meta.names()[:-1]
print('Features: {0}'.format(feature_names))


The dataset has 11055 datapoints with 30 features
Features: ['having_IP_Address', 'URL_Length', 'Shortining_Service', 'having_At_Symbol', 'double_slash_redirecting', 'Prefix_Suffix', 'having_Sub_Domain', 'SSLfinal_State', 'Domain_registeration_length', 'Favicon', 'port', 'HTTPS_token', 'Request_URL', 'URL_of_Anchor', 'Links_in_tags', 'SFH', 'Submitting_to_email', 'Abnormal_URL', 'Redirect', 'on_mouseover', 'RightClick', 'popUpWidnow', 'Iframe', 'age_of_domain', 'DNSRecord', 'web_traffic', 'Page_Rank', 'Google_Index', 'Links_pointing_to_page', 'Statistical_report']


In [19]:
# Column names from the ARFF metadata: every name but the last is a feature,
# the last one is the target.
column_names = meta.names()

# Selecting several fields of a structured array yields another 1-D
# structured array (one record per datapoint), which is why X.shape is (n,)
# rather than (n, n_features).
X = data[list(column_names[:-1])]

# Target column, flattened to a 1-D array.
y = data[column_names[-1]].reshape(-1)

# Shapes before the split
print('Before splitting')
print('X: {0}, y: {1}'.format(X.shape, y.shape))

# Hold out 30% of the datapoints for testing; the fixed random_state makes
# the split reproducible. (train_test_split is also imported in the
# notebook's first cell; re-importing it here is harmless.)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Shapes after the split
print('After splitting')
print('X_train: {0}, y_train: {1}, X_test: {2}, y_test: {3}'.format(X_train.shape, y_train.shape, X_test.shape, y_test.shape))


Before splitting
X: (11055,), y: (11055,)
After splitting
X_train: (7738,), y_train: (7738,), X_test: (3317,), y_test: (3317,)


In [20]:
# Write each split to its own .npy file (relative paths, so they land in the
# kernel's current working directory), in the same order as before.
for filename, split in (
    ('X_train.npy', X_train),
    ('X_test.npy', X_test),
    ('y_train.npy', y_train),
    ('y_test.npy', y_test),
):
    np.save(filename, split)
print('Saved!')

Saved!


In [26]:
def convert_to_serializable(data):
    """Convert an iterable of rows into nested lists that ``json`` can encode.

    Each row is iterated item by item and every item is normalised:

    * ``bytes`` (including NumPy byte strings) are decoded as UTF-8 text;
    * other NumPy scalars (e.g. ``np.int64``, ``np.float64``) are unwrapped
      to the equivalent built-in Python value, because ``json`` refuses to
      encode NumPy scalar types;
    * anything else is passed through unchanged.

    Parameters
    ----------
    data : iterable of iterables
        Rows to convert, e.g. a structured NumPy array, a 2-D array, or a
        list of lists.

    Returns
    -------
    list of list
        One inner list per row, in the original order.

    Raises
    ------
    UnicodeDecodeError
        If a ``bytes`` item is not valid UTF-8.
    """
    def _to_native(item):
        # bytes must be checked first: NumPy byte strings are both `bytes`
        # and `np.generic`, and they have to end up as text, not bytes.
        if isinstance(item, bytes):
            return item.decode('utf-8')
        if isinstance(item, np.generic):
            return item.item()
        return item

    return [[_to_native(item) for item in row] for row in data]

# Bundle the held-out split into a dictionary of plain Python values:
# feature rows go through convert_to_serializable, and the labels are
# turned into a list with any bytes label decoded to text.
test_data = {
    'X_test': convert_to_serializable(X_test),
    'y_test': [
        str(label, 'utf-8') if isinstance(label, bytes) else label
        for label in y_test.tolist()
    ],
}

# Persist the bundle as JSON and confirm once the dump has succeeded.
with open('/content/sample_data/testdata.json', 'w') as tdfile:
    json.dump(test_data, tdfile)
    print('Test Data written to testdata.json')


Test Data written to testdata.json
