# **Phish in the Web**
### **Data Preprocessing**

In [1]:
#import libraries only ONCE
import pandas as pd
from sklearn.preprocessing import MinMaxScaler #no one hot // all columns numeric
from sklearn.model_selection import train_test_split #data splitting


#read in data & check for shape
phish_df = pd.read_csv("Phishing_Legitimate_full.csv")
phish_df.shape

(10000, 50)

**selecting columns to work with**

In [2]:
#specifying the columns we want to keep
selected_columns = ['NumDots', 'SubdomainLevel', 'PathLevel', 'UrlLength', 'NumDash',
    'NumDashInHostname', 'AtSymbol','TildeSymbol', 'NumUnderscore','NumPercent',
    'NumQueryComponents','NumAmpersand','NumHash','NumNumericChars','NoHttps',
    'RandomString','IpAddress','DomainInSubdomains','DomainInPaths','HttpsInHostname','HostnameLength',
    'PathLength','QueryLength','DoubleSlashInPath','NumSensitiveWords','EmbeddedBrandName',
    'PctExtHyperlinks','PctExtResourceUrls','ExtFavicon','InsecureForms','RelativeFormAction',
    'ExtFormAction','AbnormalFormAction','PctNullSelfRedirectHyperlinks','FrequentDomainNameMismatch',
    'FakeLinkInStatusBar','RightClickDisabled','PopUpWindow','SubmitInfoToEmail','IframeOrFrame',
    'MissingTitle','ImagesOnlyInForm','SubdomainLevelRT','UrlLengthRT','PctExtResourceUrlsRT',
    'AbnormalExtFormActionR','ExtMetaScriptLinkRT','PctExtNullSelfRedirectHyperlinksRT', 'CLASS_LABEL']

#features
feature_columns = ['NumDots', 'SubdomainLevel', 'PathLevel', 'UrlLength', 'NumDash', 'NumDashInHostname', 'AtSymbol','TildeSymbol', 'NumUnderscore','NumPercent','NumQueryComponents','NumAmpersand','NumHash','NumNumericChars','NoHttps','RandomString','IpAddress','DomainInSubdomains','DomainInPaths','HttpsInHostname','HostnameLength','PathLength','QueryLength','DoubleSlashInPath','NumSensitiveWords','EmbeddedBrandName','PctExtHyperlinks','PctExtResourceUrls','ExtFavicon','InsecureForms','RelativeFormAction', 'ExtFormAction','AbnormalFormAction','PctNullSelfRedirectHyperlinks','FrequentDomainNameMismatch','FakeLinkInStatusBar','RightClickDisabled','PopUpWindow','SubmitInfoToEmail','IframeOrFrame','MissingTitle','ImagesOnlyInForm','SubdomainLevelRT','UrlLengthRT','PctExtResourceUrlsRT','AbnormalExtFormActionR','ExtMetaScriptLinkRT','PctExtNullSelfRedirectHyperlinksRT']
    #without class label

#label
label_columns = ['CLASS_LABEL']

**creating the feature & label data frames**

In [3]:
phishing_features_df = phish_df[feature_columns]
phishing_label_df = phish_df[label_columns]

phishing_features_df.head()

Unnamed: 0,NumDots,SubdomainLevel,PathLevel,UrlLength,NumDash,NumDashInHostname,AtSymbol,TildeSymbol,NumUnderscore,NumPercent,...,SubmitInfoToEmail,IframeOrFrame,MissingTitle,ImagesOnlyInForm,SubdomainLevelRT,UrlLengthRT,PctExtResourceUrlsRT,AbnormalExtFormActionR,ExtMetaScriptLinkRT,PctExtNullSelfRedirectHyperlinksRT
0,3,1,5,72,0,0,0,0,0,0,...,0,0,0,1,1,0,1,1,-1,1
1,3,1,3,144,0,0,0,0,2,0,...,0,0,0,0,1,-1,1,1,1,1
2,3,1,2,58,0,0,0,0,0,0,...,0,0,0,0,1,0,-1,1,-1,0
3,3,1,6,79,1,0,0,0,0,0,...,1,0,0,0,1,-1,1,1,1,-1
4,3,0,4,46,0,0,0,0,0,0,...,0,1,0,0,1,1,-1,0,-1,-1


### **Feature Engineering**

In [4]:
#initiate scaler
min_scaler = MinMaxScaler()

#scale features
phishing_features_df[feature_columns] = min_scaler.fit_transform(phishing_features_df[feature_columns])

phishing_features_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phishing_features_df[feature_columns] = min_scaler.fit_transform(phishing_features_df[feature_columns])


Unnamed: 0,NumDots,SubdomainLevel,PathLevel,UrlLength,NumDash,NumDashInHostname,AtSymbol,TildeSymbol,NumUnderscore,NumPercent,...,SubmitInfoToEmail,IframeOrFrame,MissingTitle,ImagesOnlyInForm,SubdomainLevelRT,UrlLengthRT,PctExtResourceUrlsRT,AbnormalExtFormActionR,ExtMetaScriptLinkRT,PctExtNullSelfRedirectHyperlinksRT
0,0.1,0.071429,0.277778,0.248963,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.5,1.0,1.0,0.0,1.0
1,0.1,0.071429,0.166667,0.547718,0.0,0.0,0.0,0.0,0.111111,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0
2,0.1,0.071429,0.111111,0.190871,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.5,0.0,1.0,0.0,0.5
3,0.1,0.071429,0.333333,0.278008,0.018182,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0
4,0.1,0.0,0.222222,0.141079,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.5,0.0,0.0


### **Data Splitting**

splitting data into 70% train and 20% to test and 10% validation

In [5]:
#data splitting

#split the data into 70% train, 20% test, and 10% validation
x_train, x_temp, y_train, y_temp = train_test_split(phishing_features_df, phishing_label_df, test_size=0.3, random_state=42)
x_val, x_test, y_val, y_test = train_test_split(x_temp, y_temp, test_size=0.7, random_state=42)

#split data!
x_train, x_test, y_train, y_test = train_test_split(phishing_features_df, phishing_label_df, test_size=0.2, random_state=42)

#print training data shape and label's shape
print (f"Training: Features' shape [no. of examples * feature vector size] =  {x_train.shape}")
print (f"Training: Label's shape [no. of examples * 1] = {y_train.shape}\n")

#print test data shape and label's shape
print (f"Test: Features' shape [no. of examples * feature vector size] =  {x_test.shape}")
print (f"Test: Label's shape [no. of examples * 1] = {y_test.shape}\n")

#print validation data shape and label's shape
print(f"Validation: Features' shape [no. of examples * feature vector size] = {x_val.shape}")
print(f"Validation: Label's shape [no. of examples * 1] = {y_val.shape}\n")

Training: Features' shape [no. of examples * feature vector size] =  (8000, 48)
Training: Label's shape [no. of examples * 1] = (8000, 1)

Test: Features' shape [no. of examples * feature vector size] =  (2000, 48)
Test: Label's shape [no. of examples * 1] = (2000, 1)

Validation: Features' shape [no. of examples * feature vector size] = (900, 48)
Validation: Label's shape [no. of examples * 1] = (900, 1)



# **COPY BELOW FOR OTHER FILES**



In [6]:
#import libraries only ONCE
import pandas as pd
from sklearn.preprocessing import MinMaxScaler #no one hot // all columns numeric
from sklearn.model_selection import train_test_split #data splitting


#read in data & check for shape
phish_df = pd.read_csv("Phishing_Legitimate_full.csv")
phish_df.shape

#specifying the columns we want to keep
selected_columns = ['NumDots', 'SubdomainLevel', 'PathLevel', 'UrlLength', 'NumDash',
    'NumDashInHostname', 'AtSymbol','TildeSymbol', 'NumUnderscore','NumPercent',
    'NumQueryComponents','NumAmpersand','NumHash','NumNumericChars','NoHttps',
    'RandomString','IpAddress','DomainInSubdomains','DomainInPaths','HttpsInHostname','HostnameLength',
    'PathLength','QueryLength','DoubleSlashInPath','NumSensitiveWords','EmbeddedBrandName',
    'PctExtHyperlinks','PctExtResourceUrls','ExtFavicon','InsecureForms','RelativeFormAction',
    'ExtFormAction','AbnormalFormAction','PctNullSelfRedirectHyperlinks','FrequentDomainNameMismatch',
    'FakeLinkInStatusBar','RightClickDisabled','PopUpWindow','SubmitInfoToEmail','IframeOrFrame',
    'MissingTitle','ImagesOnlyInForm','SubdomainLevelRT','UrlLengthRT','PctExtResourceUrlsRT',
    'AbnormalExtFormActionR','ExtMetaScriptLinkRT','PctExtNullSelfRedirectHyperlinksRT', 'CLASS_LABEL']

#features
feature_columns = ['NumDots', 'SubdomainLevel', 'PathLevel', 'UrlLength', 'NumDash', 'NumDashInHostname', 'AtSymbol','TildeSymbol', 'NumUnderscore','NumPercent','NumQueryComponents','NumAmpersand','NumHash','NumNumericChars','NoHttps','RandomString','IpAddress','DomainInSubdomains','DomainInPaths','HttpsInHostname','HostnameLength','PathLength','QueryLength','DoubleSlashInPath','NumSensitiveWords','EmbeddedBrandName','PctExtHyperlinks','PctExtResourceUrls','ExtFavicon','InsecureForms','RelativeFormAction', 'ExtFormAction','AbnormalFormAction','PctNullSelfRedirectHyperlinks','FrequentDomainNameMismatch','FakeLinkInStatusBar','RightClickDisabled','PopUpWindow','SubmitInfoToEmail','IframeOrFrame','MissingTitle','ImagesOnlyInForm','SubdomainLevelRT','UrlLengthRT','PctExtResourceUrlsRT','AbnormalExtFormActionR','ExtMetaScriptLinkRT','PctExtNullSelfRedirectHyperlinksRT']
    #without class label

#label
label_columns = ['CLASS_LABEL']

phishing_features_df = phish_df[feature_columns]
phishing_label_df = phish_df[label_columns]

phishing_features_df.head()

#initiate scaler
min_scaler = MinMaxScaler()

#scale features // this includes id (maybe drop later)
phishing_features_df[feature_columns] = min_scaler.fit_transform(phishing_features_df[feature_columns])

phishing_features_df.head()

#data splitting

#split the data into 70% train, 20% test, and 10% validation
x_train, x_temp, y_train, y_temp = train_test_split(phishing_features_df, phishing_label_df, test_size=0.3, random_state=42)
x_val, x_test, y_val, y_test = train_test_split(x_temp, y_temp, test_size=0.7, random_state=42)

#print training data shape and label's shape
print (f"Training: Features' shape [no. of examples * feature vector size] =  {x_train.shape}")
print (f"Training: Label's shape [no. of examples * 1] = {y_train.shape}\n")

#print test data shape and label's shape
print (f"Test: Features' shape [no. of examples * feature vector size] =  {x_test.shape}")
print (f"Test: Label's shape [no. of examples * 1] = {y_test.shape}\n")

#print validation data shape and label's shape
print(f"Validation: Features' shape [no. of examples * feature vector size] = {x_val.shape}")
print(f"Validation: Label's shape [no. of examples * 1] = {y_val.shape}\n")

Training: Features' shape [no. of examples * feature vector size] =  (7000, 48)
Training: Label's shape [no. of examples * 1] = (7000, 1)

Test: Features' shape [no. of examples * feature vector size] =  (2100, 48)
Test: Label's shape [no. of examples * 1] = (2100, 1)

Validation: Features' shape [no. of examples * feature vector size] = (900, 48)
Validation: Label's shape [no. of examples * 1] = (900, 1)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phishing_features_df[feature_columns] = min_scaler.fit_transform(phishing_features_df[feature_columns])
