conda install -c conda-forge liac-arff
pip install scipy

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy.io import arff

# --- Configuration: file locations and split parameters ---
# Forward slashes are used for every path. The previous literal
# 'Data\Training_Dataset.arff' contained the invalid escape sequence '\T'
# (a warning today, an error in future Python versions) and a backslash
# separator that only resolves on Windows.
ARFF_PATH = 'Data/Training_Dataset.arff'
FULL_CSV_PATH = 'Data/new_file.csv'
TRAIN_CSV_PATH = 'Data/train.csv'
TEST_CSV_PATH = 'Data/test.csv'
TARGET_COLUMN = 'Result'
TEST_SIZE = 0.25
RANDOM_STATE = 2023


def _decode_if_bytes(value):
    """Return ``value`` decoded to ``str`` if it is ``bytes``; otherwise return it unchanged."""
    return value.decode() if isinstance(value, bytes) else value


# load the ARFF file: `data` holds the records, `meta` the attribute metadata
with open(ARFF_PATH, 'r') as f:
    data, meta = arff.loadarff(f)

# convert the records to a plain list of rows
data_list = data.tolist()

# build the DataFrame with the ARFF attribute names as column labels
# (passing the names at construction also works when the file has no rows)
df = pd.DataFrame(data_list, columns=meta.names())

# decode any bytes values to str so the CSV files contain plain text;
# non-bytes values pass through untouched
for col in df.columns:
    df[col] = df[col].apply(_decode_if_bytes)

# save the full DataFrame to a new csv file (the row index is written as well)
df.to_csv(FULL_CSV_PATH, index=True)

# split the dataframe into train and test sets; the fixed random_state makes
# the split reproducible across runs
train_df, test_df = train_test_split(
    df, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# drop the label column from test_df so the saved test set carries features only
test_df = test_df.drop(columns=TARGET_COLUMN)

# save the train and test sets to new csv files
train_df.to_csv(TRAIN_CSV_PATH, index=True)
test_df.to_csv(TEST_CSV_PATH, index=True)

# count how many records are there in the whole DataFrame
num_rows = df.shape[0]
# count how many records are there in the train DataFrame
num_rows_train = train_df.shape[0]
# count how many records are there in the test DataFrame
num_rows_test = test_df.shape[0]

# sanity check: the split must neither lose nor duplicate records
assert num_rows_train + num_rows_test == num_rows, (
    'train/test split does not add up to the full data set'
)

print('Train df size is :', num_rows_train)
print('Test df size is :', num_rows_test)
print('Whole df size is :', num_rows)
print('Test df + Train df size is :', num_rows_train + num_rows_test)



Train df size is : 8291
Test df size is : 2764
Whole df size is : 11055
Test df + Train df size is : 11055


In [3]:
# Materialise the test split's column labels as a plain Python list
attributes_test = list(test_df.columns)

# Display the collected labels
print(attributes_test)

['having_IP_Address', 'URL_Length', 'Shortining_Service', 'having_At_Symbol', 'double_slash_redirecting', 'Prefix_Suffix', 'having_Sub_Domain', 'SSLfinal_State', 'Domain_registeration_length', 'Favicon', 'port', 'HTTPS_token', 'Request_URL', 'URL_of_Anchor', 'Links_in_tags', 'SFH', 'Submitting_to_email', 'Abnormal_URL', 'Redirect', 'on_mouseover', 'RightClick', 'popUpWidnow', 'Iframe', 'age_of_domain', 'DNSRecord', 'web_traffic', 'Page_Rank', 'Google_Index', 'Links_pointing_to_page', 'Statistical_report']


In [4]:
# Materialise the training split's column labels as a plain Python list
attributes_train = list(train_df.columns)

# Display the collected labels
print(attributes_train)

['having_IP_Address', 'URL_Length', 'Shortining_Service', 'having_At_Symbol', 'double_slash_redirecting', 'Prefix_Suffix', 'having_Sub_Domain', 'SSLfinal_State', 'Domain_registeration_length', 'Favicon', 'port', 'HTTPS_token', 'Request_URL', 'URL_of_Anchor', 'Links_in_tags', 'SFH', 'Submitting_to_email', 'Abnormal_URL', 'Redirect', 'on_mouseover', 'RightClick', 'popUpWidnow', 'Iframe', 'age_of_domain', 'DNSRecord', 'web_traffic', 'Page_Rank', 'Google_Index', 'Links_pointing_to_page', 'Statistical_report', 'Result']


In [5]:
# Summarise the test split: index range, column names, non-null counts and dtypes
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2764 entries, 8469 to 10315
Data columns (total 30 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   having_IP_Address            2764 non-null   object
 1   URL_Length                   2764 non-null   object
 2   Shortining_Service           2764 non-null   object
 3   having_At_Symbol             2764 non-null   object
 4   double_slash_redirecting     2764 non-null   object
 5   Prefix_Suffix                2764 non-null   object
 6   having_Sub_Domain            2764 non-null   object
 7   SSLfinal_State               2764 non-null   object
 8   Domain_registeration_length  2764 non-null   object
 9   Favicon                      2764 non-null   object
 10  port                         2764 non-null   object
 11  HTTPS_token                  2764 non-null   object
 12  Request_URL                  2764 non-null   object
 13  URL_of_Anchor                

In [6]:
# Summarise the training split: index range, column names, non-null counts and dtypes
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8291 entries, 9534 to 4951
Data columns (total 31 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   having_IP_Address            8291 non-null   object
 1   URL_Length                   8291 non-null   object
 2   Shortining_Service           8291 non-null   object
 3   having_At_Symbol             8291 non-null   object
 4   double_slash_redirecting     8291 non-null   object
 5   Prefix_Suffix                8291 non-null   object
 6   having_Sub_Domain            8291 non-null   object
 7   SSLfinal_State               8291 non-null   object
 8   Domain_registeration_length  8291 non-null   object
 9   Favicon                      8291 non-null   object
 10  port                         8291 non-null   object
 11  HTTPS_token                  8291 non-null   object
 12  Request_URL                  8291 non-null   object
 13  URL_of_Anchor                8

In [7]:
# Preview the first 10 rows of the training split (still includes the 'Result' column)
train_df.head(10)

Unnamed: 0,having_IP_Address,URL_Length,Shortining_Service,having_At_Symbol,double_slash_redirecting,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,Favicon,...,popUpWidnow,Iframe,age_of_domain,DNSRecord,web_traffic,Page_Rank,Google_Index,Links_pointing_to_page,Statistical_report,Result
9534,1,-1,1,-1,1,-1,1,1,-1,-1,...,-1,1,1,1,1,-1,1,0,1,1
7624,1,1,1,1,1,1,-1,1,-1,1,...,1,1,-1,1,1,1,1,0,1,1
10634,-1,0,1,-1,1,-1,1,0,1,1,...,1,1,1,1,0,-1,1,1,1,-1
969,-1,1,-1,1,-1,1,1,1,-1,1,...,1,1,1,-1,1,-1,1,1,1,1
10857,1,-1,1,1,1,-1,0,1,-1,1,...,1,1,1,1,1,-1,1,0,1,1
6966,-1,1,-1,1,-1,-1,0,-1,-1,1,...,1,1,1,-1,1,1,-1,1,1,-1
4865,1,-1,1,1,1,-1,1,-1,1,1,...,1,1,1,-1,-1,-1,1,1,1,-1
6252,-1,-1,1,1,1,-1,1,0,1,1,...,1,1,-1,1,0,-1,1,1,1,-1
536,1,-1,1,1,1,-1,0,-1,-1,1,...,1,1,1,1,1,1,1,0,1,1
7312,1,0,-1,-1,1,-1,-1,1,-1,1,...,-1,1,-1,1,1,-1,1,-1,1,1


In [8]:
# Preview the first 10 rows of the test split ('Result' was dropped from it earlier)
test_df.head(10)

Unnamed: 0,having_IP_Address,URL_Length,Shortining_Service,having_At_Symbol,double_slash_redirecting,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,Favicon,...,RightClick,popUpWidnow,Iframe,age_of_domain,DNSRecord,web_traffic,Page_Rank,Google_Index,Links_pointing_to_page,Statistical_report
8469,-1,1,-1,1,-1,-1,1,1,-1,1,...,1,1,1,1,-1,-1,1,1,1,1
8650,1,-1,-1,1,-1,-1,1,-1,-1,1,...,1,1,1,1,-1,1,-1,1,1,1
8026,-1,-1,-1,-1,-1,-1,-1,-1,-1,1,...,1,-1,1,-1,-1,-1,-1,-1,1,1
3843,1,1,1,1,1,-1,1,1,1,-1,...,1,-1,1,1,-1,1,-1,-1,0,1
7242,-1,-1,1,1,1,-1,0,-1,-1,-1,...,1,-1,-1,-1,1,1,1,1,-1,1
640,-1,-1,-1,1,-1,-1,0,-1,1,1,...,1,1,1,-1,1,-1,-1,1,0,1
5492,-1,-1,1,1,1,1,-1,-1,-1,1,...,1,1,1,-1,1,1,-1,1,1,1
10667,1,-1,1,1,1,-1,0,1,-1,1,...,1,1,1,1,1,1,-1,1,0,1
8921,1,-1,1,1,1,-1,1,1,1,1,...,1,1,1,-1,1,0,-1,1,0,1
9905,-1,-1,-1,-1,-1,-1,0,-1,-1,-1,...,1,-1,1,1,-1,0,-1,1,1,1
