In [1]:
import numpy as np
import pandas as pd
from urllib.parse import urlparse

In [2]:
clean_136k = pd.read_csv('/content/clean_136k.csv')
shuffled_136k= pd.read_csv('/content/shuffled_136k.csv')
df= pd.concat([shuffled_136k['URL'],clean_136k], axis=1)

In [3]:
length_stats = df.groupby('label')['URL_Length'].describe()
print(length_stats)

         count       mean         std   min   25%   50%   75%      max
label                                                                 
0      68000.0  13.795368    4.402715   4.0  11.0  13.0  16.0     64.0
1      68000.0  55.083118  152.455061  12.0  26.0  35.0  51.0  25523.0


In [4]:
print(df.columns)

Index(['URL', 'URL_Length', 'Url_Shortening', 'Num_Languages',
       'User_Info_Length', 'Num_Subdomains', 'Num_Digits_Subdomain',
       'Num_Hyphens_Subdomain', 'Length_Subdomain', 'Num_Digits_Domain',
       'Num_Hyphens_Domain', 'Length_Domain', 'Num_Dots_Domain', 'TLD',
       'Num_Digits_Path', 'Num_Hyphens_Path', 'Length_Path', 'Num_Dash_Path',
       'Num_Underscore_Path', 'Num_Percent_Path', 'Num_DoubleSlash_Path',
       'Hex_Encoded_Characters_Path', 'Num_Digits_Query', 'Num_Hyphens_Query',
       'Length_Query', 'Num_Query_Params', 'Num_Dots_Query', 'Num_Dash_Query',
       'Num_Underscore_Query', 'Num_Percent_Query', 'Num_Ampersand_Query',
       'Hex_Encoded_Characters_Query', 'label'],
      dtype='object')


In [5]:
convert_to_per={'Length_Subdomain':['Num_Digits_Subdomain','Num_Hyphens_Subdomain'],
 'Length_Domain':['Num_Digits_Domain','Num_Hyphens_Domain','Num_Dots_Domain'],
 'Length_Path':['Num_Digits_Path','Num_Hyphens_Path','Num_Dash_Path','Num_Underscore_Path','Num_Percent_Path','Num_DoubleSlash_Path','Hex_Encoded_Characters_Path'],
 'Length_Query':['Num_Digits_Query','Num_Hyphens_Query','Num_Dots_Query','Num_Dash_Query','Num_Underscore_Query','Num_Percent_Query','Num_Ampersand_Query','Hex_Encoded_Characters_Query']}

In [6]:
for key_col, part_cols in convert_to_per.items():
    for col in part_cols:
        new_col = 'percent_' + col.split('_', 1)[-1]
        df[new_col] = np.where(df[key_col] == 0, 0, df[col] / df[key_col])

        df.drop(columns=col, inplace=True)

    df.drop(columns=key_col, inplace=True)

In [7]:
def shannon_entropy(s):
    if not s:
        return 0.0
    probs = [s.count(c) / len(s) for c in set(s)]
    return -sum(p * np.log2(p) for p in probs)

def extract_parts(url):
    try:
        parsed = urlparse(url)
        hostname = parsed.hostname or ''
        parts = hostname.split('.')
        subdomain = '.'.join(parts[:-2]) if len(parts) > 2 else ''
        domain = parts[-2] if len(parts) >= 2 else hostname
        return subdomain, domain
    except:
        return '', ''

df['subdomain'], df['domain'] = zip(*df['URL'].map(extract_parts))
df['entropy_subdomain'] = df['subdomain'].apply(shannon_entropy)
df['entropy_domain'] = df['domain'].apply(shannon_entropy)

# (Optional) Drop the intermediate subdomain/domain columns if not needed
df.drop(columns=['subdomain', 'domain'], inplace=True)

In [8]:
df.drop(columns=['URL','URL_Length'], inplace=True)
df.drop(columns='entropy_domain',inplace=True)

In [9]:
df.head()

Unnamed: 0,Url_Shortening,Num_Languages,User_Info_Length,Num_Subdomains,TLD,Num_Query_Params,label,percent_Digits_Subdomain,percent_Hyphens_Subdomain,percent_Digits_Domain,...,percent_Encoded_Characters_Path,percent_Digits_Query,percent_Hyphens_Query,percent_Dots_Query,percent_Dash_Query,percent_Underscore_Query,percent_Percent_Query,percent_Ampersand_Query,percent_Encoded_Characters_Query,entropy_subdomain
0,1,1,0,0,285,0,1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,1,0,0,252,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,1,0,0,163,0,1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,1,0,0,741,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,1,0,0,1089,0,1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.947703


In [11]:
df.corr()

Unnamed: 0,Url_Shortening,Num_Languages,User_Info_Length,Num_Subdomains,TLD,Num_Query_Params,label,percent_Digits_Subdomain,percent_Hyphens_Subdomain,percent_Digits_Domain,...,percent_Encoded_Characters_Path,percent_Digits_Query,percent_Hyphens_Query,percent_Dots_Query,percent_Dash_Query,percent_Underscore_Query,percent_Percent_Query,percent_Ampersand_Query,percent_Encoded_Characters_Query,entropy_subdomain
Url_Shortening,1.0,-0.072892,-0.003973,-0.03362,-0.043266,-0.038523,0.259363,-0.014161,-0.010165,-0.06543,...,-0.007559,-0.017914,-0.003154,0.026521,0.004975,0.004975,-0.00707,-0.053524,-0.007067,-0.173341
Num_Languages,-0.072892,1.0,0.00969,-0.01617,-0.00203,-0.055741,-0.07732,0.001405,-0.007652,0.76958,...,0.033402,-0.038592,-0.008928,-0.028985,-0.00898,-0.00898,-0.013103,-0.064491,-0.013106,0.053325
User_Info_Length,-0.003973,0.00969,1.0,0.027522,-0.010626,-0.002568,0.015318,0.022098,0.109959,-0.004113,...,0.016669,-0.002077,-0.000858,0.001816,-0.000855,-0.000855,0.005116,-0.003395,0.005122,0.022727
Num_Subdomains,-0.03362,-0.01617,0.027522,1.0,-0.029116,-0.003305,0.129625,0.413148,0.316785,-0.017347,...,0.009421,0.041951,0.011167,0.009226,0.025829,0.025829,0.007098,-0.019083,0.006928,0.225801
TLD,-0.043266,-0.00203,-0.010626,-0.029116,1.0,-0.093676,0.095166,0.007456,-0.010631,0.059196,...,-0.013748,-0.085161,-0.016579,-0.022202,-0.013699,-0.013699,-0.000694,-0.148409,-0.000665,-0.059256
Num_Query_Params,-0.038523,-0.055741,-0.002568,-0.003305,-0.093676,1.0,0.206712,0.010684,0.003117,0.055212,...,-0.000628,0.563332,0.165827,0.346188,0.165464,0.165464,0.221775,0.791957,0.221971,0.128479
label,0.259363,-0.07732,0.015318,0.129625,0.095166,0.206712,1.0,0.054599,0.03919,0.024271,...,0.030593,0.198006,0.056021,0.113324,0.055803,0.055803,0.053621,0.221623,0.05364,0.668332
percent_Digits_Subdomain,-0.014161,0.001405,0.022098,0.413148,0.007456,0.010684,0.054599,1.0,0.215034,1.7e-05,...,0.016391,0.044915,0.042239,0.027297,0.025419,0.025419,0.000456,-0.004961,0.000459,0.11461
percent_Hyphens_Subdomain,-0.010165,-0.007652,0.109959,0.316785,-0.010631,0.003117,0.03919,0.215034,1.0,-0.006632,...,0.001766,0.02018,0.003153,0.008024,0.020443,0.020443,0.010674,-0.004176,0.010687,0.085114
percent_Digits_Domain,-0.06543,0.76958,-0.004113,-0.017347,0.059196,0.055212,0.024271,1.7e-05,-0.006632,1.0,...,0.038721,0.025995,0.015557,0.066341,0.009862,0.009862,0.047864,-0.028945,0.047926,0.142509


In [None]:
df.to_csv('/content/clean_numTOper_entropy_136k.csv',index=False)