# Feature Testing

<p> The goal of this notebook is to check out our features and do some t-tests to see if they reject the null hypothesis </p> 

**Import Statements**

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, f1_score, precision_score, recall_score
from scipy import stats

<h3>Load in the numerical data</h3>

In [2]:
numerical_df = pd.read_csv('clean_data.csv')
# drop the columns that do not start with 'n_', but keep 'status' column
numerical_df = numerical_df.loc[:, numerical_df.columns.str.startswith('n_') | numerical_df.columns.str.startswith('status')]
numerical_df.dropna(inplace=True)
numerical_df.head(10)

Unnamed: 0,status,n_period,n_hyphens,n_underscore,n_slash,n_questionmrk,n_equals,n_at,n_and,n_exclamation
0,0.0,2,0,0,3,0,0,0,0,0
1,0.0,2,1,0,3,0,0,0,0,0
2,0.0,2,1,0,0,0,0,0,0,0
3,0.0,1,0,0,0,0,0,0,0,0
4,0.0,3,0,0,10,1,0,0,0,0
5,0.0,3,2,0,0,0,0,0,0,0
6,0.0,3,0,0,7,0,0,0,0,0
7,0.0,2,1,1,7,0,0,0,0,0
8,0.0,5,2,0,3,0,0,0,0,0
9,0.0,4,0,0,0,0,0,0,0,0


<h3> Conduct t-test on numerical features</h3>

In [3]:
# conduct t-tests for each feature
t_statistics = []
p_values = []
for col in numerical_df.columns[1:]:
    t_statistic, p_value = stats.ttest_rel(numerical_df[col], numerical_df['status'])
    t_statistics.append(t_statistic)
    p_values.append(p_value)
    
t_test_results = pd.DataFrame({ 'Feature': numerical_df.columns[1:], 'T-Statistic': t_statistics, 'P-Value': p_values })
t_test_results.head(15)

Unnamed: 0,Feature,T-Statistic,P-Value
0,n_period,299.328181,0.0
1,n_hyphens,68.579023,0.0
2,n_underscore,-55.995829,0.0
3,n_slash,324.403275,0.0
4,n_questionmrk,-136.647134,0.0
5,n_equals,-94.057516,0.0
6,n_at,-311.535731,0.0
7,n_and,-139.853987,0.0
8,n_exclamation,-320.477032,0.0


<h3>Load in categorical data</h3>

In [4]:
cat_df = pd.read_csv('clean_data.csv')
# drop columns that start with 'n_' 
for col in cat_df.columns:
    if col.startswith('n_'):
        cat_df = cat_df.drop(col, axis=1)
cat_df.drop('Unnamed: 0', axis=1, inplace=True)
cat_df.drop('url_length', axis=1, inplace=True)
cat_df.dropna(inplace=True)
cat_df.head(10)

Unnamed: 0,Unnamed: 0.1,url,protocol,www_present,sub_domain,domain,top_domain,dir,file,path,fragment,query,status,domain_name_correct
0,0,https://ghfdc.knuodwq.cn/,1,0,1,1,1,0,0,1,0,0,0.0,1
1,1,https://rakutenluyaw.ouxawer-p.net/,1,0,1,1,1,0,0,1,0,0,0.0,1
2,2,pulsagratiss-1010.000webhostapp.com,0,0,1,1,1,0,0,0,0,0,0.0,1
3,3,idjvn.com,0,0,0,1,1,0,0,0,0,0,0.0,0
4,4,http://www.vmveg.com//vendor/phpunit/phpunit/s...,1,1,1,1,1,0,1,1,0,1,0.0,0
5,5,manage.xoom.unusual-error-supprt.com,0,0,1,1,1,0,0,0,0,0,0.0,0
6,6,coldeng.com.br/verb/rip/rip/rip/Match/match2/i...,0,0,0,1,1,1,1,1,0,0,0.0,0
7,7,mytincorp.com/templates/beez/html/com_user/res...,0,0,0,1,1,1,0,1,0,0,0.0,1
8,8,https://www.amazon.co.jp.a1d4w64c4-f4-nr4wq.pa...,1,1,1,1,1,0,0,1,0,0,0.0,1
9,9,paypal.co.uk.q47c.top,0,0,1,1,1,0,0,0,0,0,0.0,1


<h3>Conduct chi-squared tests for each categorical feature</h3>

In [5]:
# conduct chi-squared tests for each feature
chi2_statistics = []
p_values = []
for col in cat_df.columns[1:]:
    observed = pd.crosstab(cat_df[col], cat_df['status'])
    chi2, p, dof, expected = stats.chi2_contingency(observed)
    chi2_statistics.append(chi2)
    p_values.append(p)
    
chi2_test_results = pd.DataFrame({ 'Feature': cat_df.columns[1:], 'Chi2-Statistic': chi2_statistics, 'P-Value': p_values })
chi2_test_results.head(15)


Unnamed: 0,Feature,Chi2-Statistic,P-Value
0,url,98572.0,0.2914978
1,protocol,18346.402832,0.0
2,www_present,303.543643,5.568595e-68
3,sub_domain,1422.604309,2.573313e-311
4,domain,0.0,1.0
5,top_domain,208.581101,2.8017810000000003e-47
6,dir,11712.235592,0.0
7,file,14120.322537,0.0
8,path,20686.045724,0.0
9,fragment,3.505695,0.06115817


<p>As a result of this notebook, my conclusion is that most of the features that have been constructed are useful in some sort of way except 'domain' and 'fragment' features which have do not reject the null hypothesis</p>