# Feature Testing

In [71]:
import pandas as pd
import seaborn as sns
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, f1_score, precision_score, recall_score
from scipy import stats

In [72]:
numerical_df = pd.read_csv('clean_data.csv')
# drop the columns that do not start with 'n_', but keep 'status' column
numerical_df = numerical_df.loc[:, numerical_df.columns.str.startswith('n_') | numerical_df.columns.str.startswith('status')]
numerical_df.dropna(inplace=True)
numerical_df.head(10)

Unnamed: 0,status,n_period,n_hyphens,n_underscore,n_slash,n_questionmrk,n_equals,n_at,n_and,n_exclamation
0,0.0,2,0,0,3,0,0,0,0,0
1,0.0,2,1,0,3,0,0,0,0,0
2,0.0,2,1,0,0,0,0,0,0,0
3,0.0,1,0,0,0,0,0,0,0,0
4,0.0,3,0,0,10,1,0,0,0,0
5,0.0,3,2,0,0,0,0,0,0,0
6,0.0,3,0,0,7,0,0,0,0,0
7,0.0,2,1,1,7,0,0,0,0,0
8,0.0,5,2,0,3,0,0,0,0,0
9,0.0,4,0,0,0,0,0,0,0,0


In [73]:
# conduct t-tests for each feature
t_statistics = []
p_values = []
for col in numerical_df.columns[1:]:
    t_statistic, p_value = stats.ttest_rel(numerical_df[col], numerical_df['status'])
    t_statistics.append(t_statistic)
    p_values.append(p_value)
    
t_test_results = pd.DataFrame({ 'Feature': numerical_df.columns[1:], 'T-Statistic': t_statistics, 'P-Value': p_values })
t_test_results.head(15)

Unnamed: 0,Feature,T-Statistic,P-Value
0,n_period,299.328181,0.0
1,n_hyphens,68.579023,0.0
2,n_underscore,-55.995829,0.0
3,n_slash,324.403275,0.0
4,n_questionmrk,-136.647134,0.0
5,n_equals,-94.057516,0.0
6,n_at,-311.535731,0.0
7,n_and,-139.853987,0.0
8,n_exclamation,-320.477032,0.0


In [55]:
categorical_df = pd.read_csv('parsed_url_features.csv')
# drop non numeric columns
for col in categorical_df.columns:
    if categorical_df[col].dtype != 'object' and col != 'status' and col != 'domain_name_correct':
        categorical_df = categorical_df.drop(col, axis=1)
categorical_df.head(20)

Unnamed: 0,url,protocol,www_present,sub_domain,domain,top_domain,dir,file,path,fragment,query,status,domain_name_correct
0,https://ghfdc.knuodwq.cn/,https,,ghfdc,knuodwq,cn,,,/,,,0.0,1
1,https://rakutenluyaw.ouxawer-p.net/,https,,rakutenluyaw,ouxawer-p,net,,,/,,,0.0,1
2,pulsagratiss-1010.000webhostapp.com,,,pulsagratiss-1010,000webhostapp,com,,,,,,0.0,1
3,idjvn.com,,,,idjvn,com,,,,,,0.0,0
4,http://www.vmveg.com//vendor/phpunit/phpunit/s...,http,www,www.,vmveg,com,,/vendor/phpunit/phpunit/src/Util/PHP/redirect.php,//vendor/phpunit/phpunit/src/Util/PHP/redirect...,,{'emx': None},0.0,0
5,manage.xoom.unusual-error-supprt.com,,,manage.xoom,unusual-error-supprt,com,,,,,,0.0,0
6,coldeng.com.br/verb/rip/rip/rip/Match/match2/i...,,,,coldeng,com.br,/verb/rip/rip/rip/Match/match2/,index.htm,/verb/rip/rip/rip/Match/match2/index.htm,,,0.0,0
7,mytincorp.com/templates/beez/html/com_user/res...,,,,mytincorp,com,/templates/beez/html/com_user/reset/Suporte.on...,,/templates/beez/html/com_user/reset/Suporte.on...,,,0.0,1
8,https://www.amazon.co.jp.a1d4w64c4-f4-nr4wq.pa...,https,www,www.amazon.co.jp,a1d4w64c4-f4-nr4wq,party,,,/,,,0.0,1
9,paypal.co.uk.q47c.top,,,paypal.co.uk,q47c,top,,,,,,0.0,1


In [64]:
# manually count how many 'www' there are in the column 'www_present'
www_present = 0
www_not_present = 0
for i in range(len(categorical_df)):
    if categorical_df['www_present'][i] == 'www':
        www_present += 1
    else:
        www_not_present += 1
        
print('www present:', www_present)
print('www not present:', www_not_present)

www present: 0
www not present: 100000


In [57]:
# conduct chi-squared tests for each feature
chi2_statistics = []
p_values = []
for col in categorical_df.columns[1:]:
    observed = pd.crosstab(categorical_df[col], categorical_df['status'])
    chi2, p, dof, expected = stats.chi2_contingency(observed)
    chi2_statistics.append(chi2)
    p_values.append(p)
    
chi2_test_results = pd.DataFrame({ 'Feature': categorical_df.columns[1:], 'Chi2-Statistic': chi2_statistics, 'P-Value': p_values })
chi2_test_results.head(15)


Unnamed: 0,Feature,Chi2-Statistic,P-Value
0,protocol,940.810726,5.076292e-205
1,www_present,0.0,1.0
2,sub_domain,29852.254983,0.0
3,domain,97443.401718,0.0
4,top_domain,33554.287128,0.0
5,dir,48986.611294,0.0
6,file,47740.122712,0.0
7,path,65312.794174,2.122732e-130
8,fragment,71.0,0.1565715
9,query,10310.919812,6.545826e-30


In [74]:
cat_df = pd.read_csv('clean_data.csv')
# drop columns that start with 'n_' 
for col in cat_df.columns:
    if col.startswith('n_'):
        cat_df = cat_df.drop(col, axis=1)
cat_df.dropna(inplace=True)
cat_df.head(10)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,url,protocol,www_present,sub_domain,domain,top_domain,dir,file,path,fragment,query,status,url_length,domain_name_correct
0,0,11,https://ghfdc.knuodwq.cn/,1,0,1,1,1,0,0,1,0,0,0.0,25,1
1,1,12,https://rakutenluyaw.ouxawer-p.net/,1,0,1,1,1,0,0,1,0,0,0.0,35,1
2,2,13,pulsagratiss-1010.000webhostapp.com,0,0,1,1,1,0,0,0,0,0,0.0,35,1
3,3,14,idjvn.com,0,0,0,1,1,0,0,0,0,0,0.0,9,0
4,4,15,http://www.vmveg.com//vendor/phpunit/phpunit/s...,1,1,1,1,1,0,1,1,0,1,0.0,74,0
5,5,16,manage.xoom.unusual-error-supprt.com,0,0,1,1,1,0,0,0,0,0,0.0,36,0
6,6,17,coldeng.com.br/verb/rip/rip/rip/Match/match2/i...,0,0,0,1,1,1,1,1,0,0,0.0,54,0
7,7,18,mytincorp.com/templates/beez/html/com_user/res...,0,0,0,1,1,1,0,1,0,0,0.0,74,1
8,8,19,https://www.amazon.co.jp.a1d4w64c4-f4-nr4wq.pa...,1,1,1,1,1,0,0,1,0,0,0.0,50,1
9,9,20,paypal.co.uk.q47c.top,0,0,1,1,1,0,0,0,0,0,0.0,21,1


In [75]:
# conduct chi-squared tests for each feature
chi2_statistics = []
p_values = []
for col in cat_df.columns[1:]:
    observed = pd.crosstab(cat_df[col], cat_df['status'])
    chi2, p, dof, expected = stats.chi2_contingency(observed)
    chi2_statistics.append(chi2)
    p_values.append(p)
    
chi2_test_results = pd.DataFrame({ 'Feature': cat_df.columns[1:], 'Chi2-Statistic': chi2_statistics, 'P-Value': p_values })
chi2_test_results.head(15)


Unnamed: 0,Feature,Chi2-Statistic,P-Value
0,Unnamed: 0,98572.0,0.4985025
1,url,98572.0,0.2914978
2,protocol,18346.402832,0.0
3,www_present,303.543643,5.568595e-68
4,sub_domain,1422.604309,2.573313e-311
5,domain,0.0,1.0
6,top_domain,208.581101,2.8017810000000003e-47
7,dir,11712.235592,0.0
8,file,14120.322537,0.0
9,path,20686.045724,0.0
