In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests, json

from sklearn.metrics import *
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import *
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# 크롤링
from bs4 import BeautifulSoup
import openpyxl

# Linear, Logistic
from sklearn.linear_model import LinearRegression, LogisticRegression

# knn
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

# decision
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

# svm
from sklearn.svm import SVC, SVR

# bagging
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

# boosting
from xgboost import XGBClassifier, XGBRegressor

## [미니프로젝트] 악성사이트 탐지 머신러닝 모델 개발

### 여러분은 기업 보안팀에서 근무 중인 엔지니어로서, 웹페이지에서 추출한 Feature(특징)를 기반으로 악성사이트를 탐지하는 머신러닝 모델 개발 미션을 부여받았습니다.

### ▣ 우리가 풀어야 하는 문제는 무엇인가요?
 - 웹 페이지에서 Feature를 추출하세요.
 - 악성사이트 여부를 판별하는 성능 좋은 AI모델을 생성하세요.

<br>

---

## ▣ 데이터 소개
* 웹 크롤링 데이터셋 : Feature_Website.xlsx

## ▣ 웹 크롤링 데이터셋의 변수 소개
* html_code : 크롤링을 활용해 수집한 HTML Code 원본
* repu : 악성사이트 여부 (malicious : 악성사이트, benign : 정상사이트)
<br>

---

## <b>[1단계] 데이터 수집</b>

* 1단계에서는 크롤링으로 수집한 HTML Code를 활용해 Feature를 만드는 과정을 체험합니다.

# <b>Step 0. 본격적인 실습 전 packages 설치
* Beautifulsoup 라이브러리 설치
* openpyxl 라이브러리 설치

* 데이터 프레임 관련 라이브러리 Import

---
## <b>Q1. 데이터 불러오기
### 정상/악성 HTML Code가 저장된 엑셀파일 불러오기
- 파일명 : Feature_Website.xlsx


### <span style="color:darkred">[문제1] Pandas 라이브러리를 활용해서 'Feature_Website.xlsx' 파일을 'df' 변수에 저장하고 info() 및 head()를 통해 데이터를 확인하세요.</span>

In [2]:
# Load the crawled pages (raw HTML plus the malicious/benign label) from the
# Excel workbook, then show the frame's dimensions as the cell output.
df_fw = pd.read_excel(io='Feature_Website.xlsx')
df_fw.shape

(40, 2)

In [3]:
# Inspect the DataFrame's columns, non-null counts and dtypes.
df_fw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   html_code  40 non-null     object
 1   repu       40 non-null     object
dtypes: object(2)
memory usage: 768.0+ bytes


---
# <b>Step 1. 데이터 수집

### 주어진 데이터로만 모델링 하는 경우는 드뭅니다.
### 주어진 데이터 외 추가로 데이터를 수집 또는 생성해야 하는 경우가 많습니다.
### 이번 과정에서는 웹 크롤러를 통해 수집된 정상/악성 사이트 HTML 데이터에서
### BeautifulSoup 라이브러리를 활용해 필요한 Feature(특징)를 추출해 보도록 하겠습니다.
### 정상/악성 사이트 HTML Code는 사전에 수집하여 'Feature_Website.xlsx' 파일에 저장해 두었습니다.


### <span style="color:blue">[예시] BeautifulSoup 라이브러리를 활용해 HTML code를 출력하고 \<title> 태그 길이를 계산합니다.</span>

In [4]:
from bs4 import BeautifulSoup

# Parse the HTML stored in the first row with Python's built-in 'html.parser'
# backend so the document can be navigated as an object tree.
first_page_html = df_fw['html_code'].iloc[0]
soup = BeautifulSoup(first_page_html, 'html.parser')

* <span style="color:blue">html code 출력</span>

In [5]:
# Print the parsed document of the first row (the `soup` built in the previous cell).
print(soup)

<!DOCTYPE html>

<!--[if lt IE 7]> <html lang="en-us" class="a-no-js a-lt-ie9 a-lt-ie8 a-lt-ie7"> <![endif]-->
<!--[if IE 7]>    <html lang="en-us" class="a-no-js a-lt-ie9 a-lt-ie8"> <![endif]-->
<!--[if IE 8]>    <html lang="en-us" class="a-no-js a-lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!-->
<html class="a-no-js" lang="en-us"><!--<![endif]--><head>
<meta content="text/html; charset=utf-8" http-equiv="content-type"/>
<meta charset="utf-8"/>
<meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
<title dir="ltr">Amazon.com</title>
<meta content="width=device-width" name="viewport"/>
<link href="https://images-na.ssl-images-amazon.com/images/G/01/AUIClients/AmazonUI-3c913031596ca78a3768f4e934b1cc02ce238101.secure.min._V1_.css" rel="stylesheet"/>
<script>

if (true === true) {
    var ue_t0 = (+ new Date()),
        ue_csm = window,
        ue = { t0: ue_t0, d: function() { return (+new Date() - ue_t0); } },
        ue_furl = "fls-na.amazon.com",
        ue_mid = "ATVPDKIKX0DER"

* <span style="color:blue">\<title> 태그 출력 및 길이 계산</span>

In [6]:
# Fetch the <title> element once, then show both the tag and its text.
title_tag = soup.head.title
title_text = title_tag.getText()
print("* title :", title_tag, title_text)

# Show how many characters the title text contains.
print("* title 길이 :", len(str(title_text)))

* title : <title dir="ltr">Amazon.com</title> Amazon.com
* title 길이 : 10


In [7]:
def title_length(soup):
    """Return the number of characters in the page's <title> text.

    Parameters
    ----------
    soup : object
        Parsed document exposing ``soup.head.title.getText()`` (a
        BeautifulSoup object in this notebook).

    Returns
    -------
    int
        Length of the title text, or 0 when ``soup.head`` or
        ``soup.head.title`` is missing.
    """
    try:
        return len(str(soup.head.title.getText()))
    except AttributeError:
        # BeautifulSoup returns None for a tag that is not present, so the
        # next attribute access raises AttributeError; that means "no title".
        # Any other exception (e.g. KeyboardInterrupt) is left to propagate
        # instead of being silently turned into a 0.
        return 0

In [8]:
# Parse every stored page and measure its <title> text with title_length().
title_len = [
    title_length(BeautifulSoup(page_html, 'html.parser'))
    for page_html in df_fw['html_code']
]

# Attach the measurements as a new feature column and preview the frame.
df_fw['title_length'] = title_len
df_fw.head()

Unnamed: 0,html_code,repu,title_length
0,<!DOCTYPE html>\n<!--[if lt IE 7]> <html lang=...,malicious,10
1,\n\t\n\n\n\t\n\n\t\n\n\n\t\n\n\n\t\n\n\t\n\t\t...,malicious,5
2,"<!DOCTYPE html>\n<html lang=""en"">\n <head>\...",malicious,71
3,"<!DOCTYPE html><html lang=""en""><head><style da...",malicious,0
4,<!DOCTYPE html>\n\n\n \n \n \n \n ...,malicious,27


---

## <b>Q2. html 에서 \<script>...\</script> 태그 길이 계산
- BeautifulSoup으로 html소스를 python 객체로 변환
- 함수로 구현하기
- float으로 return 받기

### <span style="color:darkred">[문제2] BeautifulSoup 라이브러리를 활용해 HTML code에서 \<script> 태그 길이를 계산하는 함수를 완성하고 결과를 확인하세요.</span>

In [9]:
# Feature-extraction helper: length of the first <script> inside <head>.
def script_length(soup):
    """Return the text length of the first <script> found under <head>.

    Only ``soup.head.script`` is inspected, i.e. a single script element;
    other scripts in the document are not included in the result.

    Parameters
    ----------
    soup : object
        Parsed document exposing ``soup.head.script.getText()`` (a
        BeautifulSoup object in this notebook).

    Returns
    -------
    int
        Number of characters in that script's text, or 0 when ``soup.head``
        or ``soup.head.script`` is missing.
    """
    try:
        return len(str(soup.head.script.getText()))
    except AttributeError:
        # A missing tag is returned as None, so the following attribute
        # access raises AttributeError. Only that case maps to 0; unrelated
        # errors are no longer swallowed.
        return 0

In [10]:
# Extract the feature from the html_code column: apply script_length() to the
# parsed form of every page and keep the results as a plain list.
script_len = df_fw['html_code'].map(
    lambda page_html: script_length(BeautifulSoup(page_html, 'html.parser'))
).tolist()

df_fw['script_length'] = script_len
df_fw.head()


Unnamed: 0,html_code,repu,title_length,script_length
0,<!DOCTYPE html>\n<!--[if lt IE 7]> <html lang=...,malicious,10,388
1,\n\t\n\n\n\t\n\n\t\n\n\n\t\n\n\n\t\n\n\t\n\t\t...,malicious,5,562
2,"<!DOCTYPE html>\n<html lang=""en"">\n <head>\...",malicious,71,800
3,"<!DOCTYPE html><html lang=""en""><head><style da...",malicious,0,0
4,<!DOCTYPE html>\n\n\n \n \n \n \n ...,malicious,27,408


In [11]:
# Extract the feature from the html_code column: total size of ALL <script>
# elements on each page. Each element is measured through str(tag), so the
# surrounding markup is included, not only the script text.
script_len = []

for page_html in df_fw['html_code']:
    script_tags = BeautifulSoup(page_html, 'html.parser').find_all('script')
    script_len.append(sum(len(str(tag)) for tag in script_tags))

df_fw['script_length_total'] = script_len
df_fw.head()


Unnamed: 0,html_code,repu,title_length,script_length,script_length_total
0,<!DOCTYPE html>\n<!--[if lt IE 7]> <html lang=...,malicious,10,388,1127
1,\n\t\n\n\n\t\n\n\t\n\n\n\t\n\n\n\t\n\n\t\n\t\t...,malicious,5,562,993
2,"<!DOCTYPE html>\n<html lang=""en"">\n <head>\...",malicious,71,800,10358
3,"<!DOCTYPE html><html lang=""en""><head><style da...",malicious,0,0,0
4,<!DOCTYPE html>\n\n\n \n \n \n \n ...,malicious,27,408,425


---

## <b>Q3. html에서 공백 수 계산

- BeautifulSoup으로 html소스를 python 객체로 변환
- 함수로 구현하기
- float으로 return 받기

### <span style="color:darkred">[문제3] BeautifulSoup 라이브러리를 활용해 HTML Code에서 \<html> 태그 공백 수를 계산하는 함수를 완성하고 결과를 확인하세요.</span>

In [12]:
# Feature-extraction helper: text content of <body>.
def get_body(soup):
    """Return the text of the document's <body> as a string.

    Parameters
    ----------
    soup : object
        Parsed document exposing ``soup.body.getText()`` (a BeautifulSoup
        object in this notebook).

    Returns
    -------
    str
        The body text, or an empty string when ``soup.body`` is missing.
    """
    try:
        return str(soup.body.getText())
    except AttributeError:
        # A document without <body> yields None for soup.body, so calling
        # getText raises AttributeError. Only that case maps to ''; other
        # exceptions propagate instead of being hidden.
        return ''

In [13]:
# Extract whitespace features from the <body> text of every stored page:
#   blank_n    - number of newline characters
#   blank_t    - number of tab characters
#   blank_nbsp - number of plain spaces plus non-breaking spaces
blank_n = []
blank_t = []
blank_nbsp = []

for page_html in df_fw['html_code']:
    # get_body() already returns a str ('' when the page has no <body>).
    body_text = get_body(BeautifulSoup(page_html, 'html.parser'))

    blank_n.append(body_text.count('\n'))
    blank_t.append(body_text.count('\t'))

    # BeautifulSoup decodes the &nbsp; entity to the character U+00A0 while
    # parsing, so extracted text contains '\xa0' rather than the entity
    # source; counting only the literal text '&nbsp' would miss them. The
    # literal match is kept for text that really spells out "&nbsp".
    nbsp = body_text.count('\xa0') + body_text.count('&nbsp')
    blank_nbsp.append(nbsp + body_text.count(' '))

df_fw['blank_n'] = blank_n
df_fw['blank_t'] = blank_t
df_fw['blank_nbsp'] = blank_nbsp
df_fw

Unnamed: 0,html_code,repu,title_length,script_length,script_length_total,blank_n,blank_t,blank_nbsp
0,<!DOCTYPE html>\n<!--[if lt IE 7]> <html lang=...,malicious,10,388,1127,70,0,65
1,\n\t\n\n\n\t\n\n\t\n\n\n\t\n\n\n\t\n\n\t\n\t\t...,malicious,5,562,993,262,36,87
2,"<!DOCTYPE html>\n<html lang=""en"">\n <head>\...",malicious,71,800,10358,111,0,190
3,"<!DOCTYPE html><html lang=""en""><head><style da...",malicious,0,0,0,0,0,0
4,<!DOCTYPE html>\n\n\n \n \n \n \n ...,malicious,27,408,425,415,0,1802
5,_x000D_\n_x000D_\n_x000D_\n<!DOCTYPE html>_x00...,malicious,17,3163,3197,0,0,0
6,"<!doctype html>\n\n<html data-ytrk-page=""HOME""...",malicious,36,4992,24936,80,0,153
7,"\n\t<!DOCTYPE html>\n\t<html class=""no-icon-fo...",malicious,45,128,24089,0,0,0
8,"<!DOCTYPE html>\n<html class=""no-js"">\n<head>\...",malicious,77,0,1389,352,0,86
9,"<!DOCTYPE html>\n<html class=""b-header--bl...",malicious,14,102,15086,0,0,0


---

## <b>Q4. html 에서 body 길이 계산

- BeautifulSoup으로 html소스를 python 객체로 변환
- 함수로 구현하기
- float으로 return 받기

### <span style="color:darkred">[문제4] BeautifulSoup 라이브러리를 활용해 HTML code에서 \<body> 태그 길이를 계산하는 함수를 완성하고 결과를 확인하세요.</span>

In [14]:
# Feature-extraction helper: length of the <body> text.
def body_length(soup):
    """Return the number of characters in the document's <body> text.

    Parameters
    ----------
    soup : object
        Parsed document exposing ``soup.body.getText()`` (a BeautifulSoup
        object in this notebook).

    Returns
    -------
    int
        Length of the body text, or 0 when ``soup.body`` is missing.
    """
    try:
        return len(str(soup.body.getText()))
    except AttributeError:
        # soup.body is None for a page without <body>; the getText lookup
        # then raises AttributeError. Only that case maps to 0, so genuine
        # failures are not masked.
        return 0


In [15]:
# Extract the feature from the html_code column: <body> text length per page.
body_len = [
    body_length(BeautifulSoup(page_html, 'html.parser'))
    for page_html in df_fw['html_code']
]

df_fw['body_length'] = body_len
df_fw.head()


Unnamed: 0,html_code,repu,title_length,script_length,script_length_total,blank_n,blank_t,blank_nbsp,body_length
0,<!DOCTYPE html>\n<!--[if lt IE 7]> <html lang=...,malicious,10,388,1127,70,0,65,402
1,\n\t\n\n\n\t\n\n\t\n\n\n\t\n\n\n\t\n\n\t\n\t\t...,malicious,5,562,993,262,36,87,1041
2,"<!DOCTYPE html>\n<html lang=""en"">\n <head>\...",malicious,71,800,10358,111,0,190,433
3,"<!DOCTYPE html><html lang=""en""><head><style da...",malicious,0,0,0,0,0,0,0
4,<!DOCTYPE html>\n\n\n \n \n \n \n ...,malicious,27,408,425,415,0,1802,2969


---

## <b>Q5. script 에서 src, href 속성을 가진 태그수

- BeautifulSoup으로 html소스를 python 객체로 변환
- 함수로 구현하기
- float으로 return 받기

### <span style="color:darkred">[문제5] BeautifulSoup 라이브러리를 활용해 HTML code에서 \<script> 태그 중 src, href 속성을 가진 태그 수를 계산하는 함수를 완성하고 결과를 확인하세요.</span>


In [16]:
# Extract the feature from the html_code column: number of <script> tags that
# carry a src or an href attribute.
link_count = []

for page_html in df_fw['html_code']:
    soup = BeautifulSoup(page_html, 'html.parser')
    # Test both attributes on each tag in a single pass so that a tag having
    # src AND href contributes 1 to the count; adding two separate
    # per-attribute counts would count such a tag twice.
    link_count.append(
        sum(
            1
            for tag in soup.find_all('script')
            if tag.has_attr('src') or tag.has_attr('href')
        )
    )

df_fw['script_link_count'] = link_count
df_fw.head()


Unnamed: 0,html_code,repu,title_length,script_length,script_length_total,blank_n,blank_t,blank_nbsp,body_length,script_link_count
0,<!DOCTYPE html>\n<!--[if lt IE 7]> <html lang=...,malicious,10,388,1127,70,0,65,402,0
1,\n\t\n\n\n\t\n\n\t\n\n\n\t\n\n\n\t\n\n\t\n\t\t...,malicious,5,562,993,262,36,87,1041,4
2,"<!DOCTYPE html>\n<html lang=""en"">\n <head>\...",malicious,71,800,10358,111,0,190,433,8
3,"<!DOCTYPE html><html lang=""en""><head><style da...",malicious,0,0,0,0,0,0,0,0
4,<!DOCTYPE html>\n\n\n \n \n \n \n ...,malicious,27,408,425,415,0,1802,2969,0


In [17]:
# Extract the feature from the html_code column: number of <a> tags that
# carry a src or an href attribute.
link_count = []

for page_html in df_fw['html_code']:
    soup = BeautifulSoup(page_html, 'html.parser')
    # Test both attributes on each tag in a single pass so that an anchor
    # having src AND href contributes 1 to the count; adding two separate
    # per-attribute counts would count such a tag twice.
    link_count.append(
        sum(
            1
            for tag in soup.find_all('a')
            if tag.has_attr('src') or tag.has_attr('href')
        )
    )

df_fw['a_link_count'] = link_count
df_fw.head()


Unnamed: 0,html_code,repu,title_length,script_length,script_length_total,blank_n,blank_t,blank_nbsp,body_length,script_link_count,a_link_count
0,<!DOCTYPE html>\n<!--[if lt IE 7]> <html lang=...,malicious,10,388,1127,70,0,65,402,0,2
1,\n\t\n\n\n\t\n\n\t\n\n\n\t\n\n\n\t\n\n\t\n\t\t...,malicious,5,562,993,262,36,87,1041,4,31
2,"<!DOCTYPE html>\n<html lang=""en"">\n <head>\...",malicious,71,800,10358,111,0,190,433,8,15
3,"<!DOCTYPE html><html lang=""en""><head><style da...",malicious,0,0,0,0,0,0,0,0,0
4,<!DOCTYPE html>\n\n\n \n \n \n \n ...,malicious,27,408,425,415,0,1802,2969,0,35


## <b>Q6. 추가적으로 도출 가능한 Feature

- BeautifulSoup으로 html소스를 python 객체로 변환
- 함수로 구현하기
- 적절한 자료형으로 return 받기

### <span style="color:darkred">[문제6] BeautifulSoup 라이브러리를 활용해 HTML code에서 추가로 만들 수 있는 Feature를 찾아보고 결과를 확인하세요.</span>


In [18]:
# Extract the feature from the html_code column: number of <img> tags per page.
img_count = [
    len(BeautifulSoup(page_html, 'html.parser').find_all('img'))
    for page_html in df_fw['html_code']
]

df_fw['img_count'] = img_count
df_fw.head()


Unnamed: 0,html_code,repu,title_length,script_length,script_length_total,blank_n,blank_t,blank_nbsp,body_length,script_link_count,a_link_count,img_count
0,<!DOCTYPE html>\n<!--[if lt IE 7]> <html lang=...,malicious,10,388,1127,70,0,65,402,0,2,2
1,\n\t\n\n\n\t\n\n\t\n\n\n\t\n\n\n\t\n\n\t\n\t\t...,malicious,5,562,993,262,36,87,1041,4,31,0
2,"<!DOCTYPE html>\n<html lang=""en"">\n <head>\...",malicious,71,800,10358,111,0,190,433,8,15,0
3,"<!DOCTYPE html><html lang=""en""><head><style da...",malicious,0,0,0,0,0,0,0,0,0,0
4,<!DOCTYPE html>\n\n\n \n \n \n \n ...,malicious,27,408,425,415,0,1802,2969,0,35,2


In [19]:
# Extract the feature from the html_code column: number of <iframe> tags per page.
iframe_count = df_fw['html_code'].map(
    lambda page_html: len(BeautifulSoup(page_html, 'html.parser').find_all('iframe'))
).tolist()

df_fw['iframe_count'] = iframe_count
df_fw.head()


Unnamed: 0,html_code,repu,title_length,script_length,script_length_total,blank_n,blank_t,blank_nbsp,body_length,script_link_count,a_link_count,img_count,iframe_count
0,<!DOCTYPE html>\n<!--[if lt IE 7]> <html lang=...,malicious,10,388,1127,70,0,65,402,0,2,2,0
1,\n\t\n\n\n\t\n\n\t\n\n\n\t\n\n\n\t\n\n\t\n\t\t...,malicious,5,562,993,262,36,87,1041,4,31,0,0
2,"<!DOCTYPE html>\n<html lang=""en"">\n <head>\...",malicious,71,800,10358,111,0,190,433,8,15,0,0
3,"<!DOCTYPE html><html lang=""en""><head><style da...",malicious,0,0,0,0,0,0,0,0,0,0,0
4,<!DOCTYPE html>\n\n\n \n \n \n \n ...,malicious,27,408,425,415,0,1802,2969,0,35,2,0


In [20]:
# Re-parse the first page and list every <script> element it contains; the
# list is the cell's displayed output.
soup = BeautifulSoup(df_fw['html_code'].iloc[0], 'html.parser')
script = soup.find_all('script')
script

[<script>
 
 if (true === true) {
     var ue_t0 = (+ new Date()),
         ue_csm = window,
         ue = { t0: ue_t0, d: function() { return (+new Date() - ue_t0); } },
         ue_furl = "fls-na.amazon.com",
         ue_mid = "ATVPDKIKX0DER",
         ue_sid = (document.cookie.match(/session-id=([0-9-]+)/) || [])[1],
         ue_sn = "opfcaptcha.amazon.com",
         ue_id = 'BY6PY7346W2THV0MAFYZ';
 }
 </script>,
 <script>
            if (true === true) {
              document.write('<img src="https://fls-na.amaz'+'on.com/'+'1/oc-csi/1/OP/requestId=BY6PY7346W2THV0MAFYZ&js=1" />');
            };
           </script>,
 <script>
     if (true === true) {
         var head = document.getElementsByTagName('head')[0],
             prefix = "https://images-na.ssl-images-amazon.com/images/G/01/csminstrumentation/",
             elem = document.createElement("script");
         elem.src = prefix + "csm-captcha-instrumentation.min.js";
         head.appendChild(elem);
 
         elem = docu

In [21]:
# Extract the feature from the html_code column: number of <link> tags per page.
tag_link_count = [
    len(BeautifulSoup(page_html, 'html.parser').find_all('link'))
    for page_html in df_fw['html_code']
]

df_fw['tag_link_count'] = tag_link_count
df_fw


Unnamed: 0,html_code,repu,title_length,script_length,script_length_total,blank_n,blank_t,blank_nbsp,body_length,script_link_count,a_link_count,img_count,iframe_count,tag_link_count
0,<!DOCTYPE html>\n<!--[if lt IE 7]> <html lang=...,malicious,10,388,1127,70,0,65,402,0,2,2,0,1
1,\n\t\n\n\n\t\n\n\t\n\n\n\t\n\n\n\t\n\n\t\n\t\t...,malicious,5,562,993,262,36,87,1041,4,31,0,0,138
2,"<!DOCTYPE html>\n<html lang=""en"">\n <head>\...",malicious,71,800,10358,111,0,190,433,8,15,0,0,7
3,"<!DOCTYPE html><html lang=""en""><head><style da...",malicious,0,0,0,0,0,0,0,0,0,0,0,0
4,<!DOCTYPE html>\n\n\n \n \n \n \n ...,malicious,27,408,425,415,0,1802,2969,0,35,2,0,77
5,_x000D_\n_x000D_\n_x000D_\n<!DOCTYPE html>_x00...,malicious,17,3163,3197,0,0,0,0,0,0,0,0,15
6,"<!doctype html>\n\n<html data-ytrk-page=""HOME""...",malicious,36,4992,24936,80,0,153,792,6,42,1,0,10
7,"\n\t<!DOCTYPE html>\n\t<html class=""no-icon-fo...",malicious,45,128,24089,0,0,0,0,4,0,0,0,50
8,"<!DOCTYPE html>\n<html class=""no-js"">\n<head>\...",malicious,77,0,1389,352,0,86,2032,0,198,0,0,5
9,"<!DOCTYPE html>\n<html class=""b-header--bl...",malicious,14,102,15086,0,0,0,0,1,0,0,0,2


In [22]:
# Extract the feature from the html_code column: combined number of <link>,
# <script> and <meta> tags on each page.
tag_count = []

for page_html in df_fw['html_code']:
    soup = BeautifulSoup(page_html, 'html.parser')
    per_tag_totals = (
        len(soup.find_all(tag_name)) for tag_name in ('link', 'script', 'meta')
    )
    tag_count.append(sum(per_tag_totals))

df_fw['tag_count'] = tag_count
df_fw


Unnamed: 0,html_code,repu,title_length,script_length,script_length_total,blank_n,blank_t,blank_nbsp,body_length,script_link_count,a_link_count,img_count,iframe_count,tag_link_count,tag_count
0,<!DOCTYPE html>\n<!--[if lt IE 7]> <html lang=...,malicious,10,388,1127,70,0,65,402,0,2,2,0,1,8
1,\n\t\n\n\n\t\n\n\t\n\n\n\t\n\n\n\t\n\n\t\n\t\t...,malicious,5,562,993,262,36,87,1041,4,31,0,0,138,159
2,"<!DOCTYPE html>\n<html lang=""en"">\n <head>\...",malicious,71,800,10358,111,0,190,433,8,15,0,0,7,46
3,"<!DOCTYPE html><html lang=""en""><head><style da...",malicious,0,0,0,0,0,0,0,0,0,0,0,0,0
4,<!DOCTYPE html>\n\n\n \n \n \n \n ...,malicious,27,408,425,415,0,1802,2969,0,35,2,0,77,99
5,_x000D_\n_x000D_\n_x000D_\n<!DOCTYPE html>_x00...,malicious,17,3163,3197,0,0,0,0,0,0,0,0,15,29
6,"<!doctype html>\n\n<html data-ytrk-page=""HOME""...",malicious,36,4992,24936,80,0,153,792,6,42,1,0,10,48
7,"\n\t<!DOCTYPE html>\n\t<html class=""no-icon-fo...",malicious,45,128,24089,0,0,0,0,4,0,0,0,50,87
8,"<!DOCTYPE html>\n<html class=""no-js"">\n<head>\...",malicious,77,0,1389,352,0,86,2032,0,198,0,0,5,16
9,"<!DOCTYPE html>\n<html class=""b-header--bl...",malicious,14,102,15086,0,0,0,0,1,0,0,0,2,21


In [23]:
# Share of the counted tags (link + script + meta) that are <link> tags or
# <script> tags with src/href: ratio rounded to 4 decimals, then scaled to percent.
linked_tags = df_fw['script_link_count'] + df_fw['tag_link_count']
df_fw['link_percentage'] = (linked_tags / df_fw['tag_count']).round(4) * 100

In [24]:
# Pages whose tag_count is 0 give 0 / 0 = NaN in link_percentage; report those as 0.0.
df_fw['link_percentage'] = df_fw['link_percentage'].fillna(0.0)
# Display the frame with the completed feature columns.
df_fw

Unnamed: 0,html_code,repu,title_length,script_length,script_length_total,blank_n,blank_t,blank_nbsp,body_length,script_link_count,a_link_count,img_count,iframe_count,tag_link_count,tag_count,link_percentage
0,<!DOCTYPE html>\n<!--[if lt IE 7]> <html lang=...,malicious,10,388,1127,70,0,65,402,0,2,2,0,1,8,12.5
1,\n\t\n\n\n\t\n\n\t\n\n\n\t\n\n\n\t\n\n\t\n\t\t...,malicious,5,562,993,262,36,87,1041,4,31,0,0,138,159,89.31
2,"<!DOCTYPE html>\n<html lang=""en"">\n <head>\...",malicious,71,800,10358,111,0,190,433,8,15,0,0,7,46,32.61
3,"<!DOCTYPE html><html lang=""en""><head><style da...",malicious,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0
4,<!DOCTYPE html>\n\n\n \n \n \n \n ...,malicious,27,408,425,415,0,1802,2969,0,35,2,0,77,99,77.78
5,_x000D_\n_x000D_\n_x000D_\n<!DOCTYPE html>_x00...,malicious,17,3163,3197,0,0,0,0,0,0,0,0,15,29,51.72
6,"<!doctype html>\n\n<html data-ytrk-page=""HOME""...",malicious,36,4992,24936,80,0,153,792,6,42,1,0,10,48,33.33
7,"\n\t<!DOCTYPE html>\n\t<html class=""no-icon-fo...",malicious,45,128,24089,0,0,0,0,4,0,0,0,50,87,62.07
8,"<!DOCTYPE html>\n<html class=""no-js"">\n<head>\...",malicious,77,0,1389,352,0,86,2032,0,198,0,0,5,16,31.25
9,"<!DOCTYPE html>\n<html class=""b-header--bl...",malicious,14,102,15086,0,0,0,0,1,0,0,0,2,21,14.29
