## Scraping to create feature variables

In [2]:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import chromedriver_binary 
import os
import re
import requests
import json
import csv 
import nltk
import xml.etree.ElementTree as ET


# Features - Number of Words, Num of Unique Words, etc. 
# 

## Function to extract the XML files from the SiteMap

The extract_xml_files_from_sitemap function takes a sitemap_url parameter, downloads the sitemap file using requests.get, and parses the XML content using xml.etree.ElementTree. It then finds all the <loc> elements in the sitemap using the appropriate namespace, filters the URLs to only include those ending with .xml, and returns a list of the XML file URLs.

In [4]:
def extract_xml_files_from_sitemap(sitemap_url):
    # Download the sitemap file
    response = requests.get(sitemap_url)
    sitemap_content = response.content

    # Parse the XML content
    root = ET.fromstring(sitemap_content)

    # Find all the URLs in the sitemap
    urls = root.findall(".//{http://www.sitemaps.org/schemas/sitemap/0.9}loc")

    # Extract the XML file URLs
    xml_files = [url.text for url in urls if url.text.endswith(".xml")]

    return xml_files

sitemap_url = "https://docs.netapp.com/cloud/sitemap.xml"
xml_files = extract_xml_files_from_sitemap(sitemap_url)

for file in xml_files: 
    print(file)


https://docs.netapp.com/cloud/sitemap-us-en.xml
https://docs.netapp.com/us-en/active-iq-unified-manager/sitemap.xml
https://docs.netapp.com/us-en/active-iq-unified-manager-97/sitemap.xml
https://docs.netapp.com/us-en/active-iq-unified-manager-98/sitemap.xml
https://docs.netapp.com/us-en/active-iq-unified-manager-99/sitemap.xml
https://docs.netapp.com/us-en/active-iq-unified-manager-910/sitemap.xml
https://docs.netapp.com/us-en/active-iq-unified-manager-912/sitemap.xml
https://docs.netapp.com/us-en/active-iq/sitemap.xml
https://docs.netapp.com/us-en/astra-automation-2211/sitemap.xml
https://docs.netapp.com/us-en/astra-automation-2208/sitemap.xml
https://docs.netapp.com/us-en/astra-automation-2204/sitemap.xml
https://docs.netapp.com/us-en/astra-automation-2112/sitemap.xml
https://docs.netapp.com/us-en/astra-automation-2108/sitemap.xml
https://docs.netapp.com/us-en/astra-automation/sitemap.xml
https://docs.netapp.com/us-en/astra-control-center-2211/sitemap.xml
https://docs.netapp.com/us-e

## Extract the html files from a given xml file on the sitemap

This code downloads the XML sitemap using requests.get, parses it using xml.etree.ElementTree, and then finds all the <loc> elements using the appropriate namespace. It filters the URLs to include only those ending with .html and returns a list of HTML file URLs.

In [6]:
def extract_html_files_from_sitemap(xml_url):
    # Download the sitemap file
    response = requests.get(xml_url)
    sitemap_content = response.content

    # Parse the XML content
    root = ET.fromstring(sitemap_content)

    # Find all the URLs in the sitemap
    urls = root.findall(".//{http://www.sitemaps.org/schemas/sitemap/0.9}loc")

    # Extract the HTML file URLs
    html_files = [url.text for url in urls if url.text.endswith(".html")]

    return html_files

## Creating a List of all HTML Files

In [7]:
all_html_files =[]
final_html_files =[]
for file in range(10,20):
    html_files = extract_html_files_from_sitemap(xml_files[file])
    all_html_files.append(html_files)
for l in all_html_files:
    final_html_files += l
    

print(len(all_html_files))


10


## Creating CSV of HTML Files and Their Features

In [125]:
import csv
from itertools import zip_longest
d = [final_html_files]
export_data = zip_longest(*d, fillvalue = '')
with open('new_output.csv', 'w', encoding="ISO-8859-1", newline='') as myfile:
    wr = csv.writer(myfile)
    wr.writerows(export_data)
myfile.close()
df = pd.read_csv('new_output.csv')
df.to_csv('new_output.csv')

df = pd.read_csv('new_output.csv')
first_column = df.columns[0]
df = df.drop([first_column], axis=1)
df.to_csv('new_output.csv', index=False)


## Converting documents into text files

In [126]:
import re
def convert_html_to_text(url, output_file):
    # Fetch the HTML content from the URL
    response = requests.get(url)
    html_page = response.content
    soup = BeautifulSoup(html_page, 'html.parser')
    res = soup.find("article", id="main")
    text = res.get_text()
    text = text.strip()
    text = re.sub(r'\r\n|\r', ' ', text)
    # Write the text to a text file
    with open(output_file, "w") as file:
        file.write(text)

df = pd.read_csv('new_output.csv', header = None)
matrix1 = df[df.columns[0]].to_numpy()
urls = matrix1.tolist()

file_l = []
for i, url in enumerate(urls):
    output_file = f"files/output{i+1}.txt"  # Generate output file name
    file_l.append(output_file)
    convert_html_to_text(url, output_file)
df.insert(1,"File Name", file_l)
df.to_csv("new_output.csv")

Scraping URLs to find the total number of words and total number of unique words in each document.
extract_words is a function that reads all the words in a file and stores them in a list. 

In [98]:
# Extracts a list of all the words in the html document
def extract_words(text):
    words = []
    text_nodes  = text.split()
    for text in text_nodes:
        words.append(text)
    return words

# Function to find the number of words in the document
def calc_total_words_in_doc(word_list):
    total_words = []
    total_words.append(len(word_list))
    return total_words

def calc_unique_words_in_doc(word_list):
    total_words = []
    unique_words = []
    total_words.append(len(word_list))
    wordset = set(word_list)
    unique_words.append(len(wordset))
    return unique_words

final_total_words =[]
final_unique_words = []
file_list = os.listdir("./files/")
for file in file_list:
    text = open(f"./files/{file}").read()
    word_list = extract_words(text)
    total_words = calc_total_words_in_doc(word_list)
    final_total_words.append(total_words[0])
    unique_words = calc_unique_words_in_doc(word_list)
    final_unique_words.append(unique_words[0])

df.insert(2,"Total Words", final_total_words)
df.insert(3,"Unique Words", final_unique_words)
df.to_csv("new_output.csv")

## Total Number of Sentences in the Document

In [99]:
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

def count_sentences(content):
    sentences = sent_tokenize(content)
    return len(sentences)


file_list = os.listdir("./files/")
sentence_counts = []
for file in file_list:
    text = open(f"./files/{file}").read()
    sentence_count = count_sentences(text)
    sentence_counts.append(sentence_count)
df.insert(4,"Total Sentence", sentence_counts)
df.to_csv("new_output.csv")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nikki\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Syllables in Each Document

The count_syllables() function takes a word as input and uses the CMU Pronouncing Dictionary from NLTK (cmudict) to find the number of syllables in that word. The count_syllables_in_document() function takes a URL as input, retrieves the content using requests.get(), splits the content into words, and counts the total number of syllables by summing the syllable count for each word. 

In [100]:
from nltk.corpus import cmudict
nltk.download('cmudict')
import syllables


# Function to count syllables in a word
def count_syllables(word):
    syllable_count = syllables.estimate(word.strip())
    return syllable_count

# Function to count syllables in a document
def count_syllables_in_document(content):
    words = content.split()
    ctr = 0
    for word in words:
        ctr += count_syllables(word)
    return ctr

syllable_counts = []
file_list = os.listdir("./files/")
for file in file_list:
    text = open(f"./files/{file}").read()
    syllable_count = count_syllables_in_document(text)
    syllable_counts.append(syllable_count)
df.insert(5,"Total Syllables", syllable_counts)
df.to_csv("new_output.csv")

[nltk_data] Downloading package cmudict to
[nltk_data]     C:\Users\nikki\AppData\Roaming\nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


## Number of Characters in Document

In [106]:
def count_characters(text):
    character_count = len(text)
    return character_count

char_counts = []
file_list = os.listdir("./files/")
for file in file_list:
    text = open(f"./files/{file}").read()
    char_count = count_characters(text)
    char_counts.append(char_count)
df.insert(6,"Total Chars", char_counts)
print (df)
df.to_csv("new_output.csv")

                                                    0           File Name  \
0   https://docs.netapp.com/us-en/bluexp-reports/g...   files/output1.txt   
1   https://docs.netapp.com/us-en/bluexp-reports/g...   files/output2.txt   
2   https://docs.netapp.com/us-en/bluexp-reports/l...   files/output3.txt   
3   https://docs.netapp.com/us-en/bluexp-reports/r...   files/output4.txt   
4   https://docs.netapp.com/us-en/bluexp-reports/g...   files/output5.txt   
5   https://docs.netapp.com/us-en/bluexp-reports/g...   files/output6.txt   
6   https://docs.netapp.com/us-en/bluexp-reports/g...   files/output7.txt   
7   https://docs.netapp.com/us-en/bluexp-reports/u...   files/output8.txt   
8   https://docs.netapp.com/us-en/bluexp-reports/u...   files/output9.txt   
9   https://docs.netapp.com/us-en/bluexp-reports/u...  files/output10.txt   
10  https://docs.netapp.com/us-en/bluexp-reports/u...  files/output11.txt   
11  https://docs.netapp.com/us-en/bluexp-reports/u...  files/output12.txt   

## Number of Long Words in Document
Any words >6 chars in length

In [101]:
def long_words_in_doc(text):
    long_word_count = 0
    for word in text.split():
        if len(word) >= 6:
            long_word_count += 1
    return long_word_count

long_word_counts = []
file_list = os.listdir("./files/")
for file in file_list:
    text = open(f"./files/{file}").read()
    long_word_count = long_words_in_doc(text)
    long_word_counts.append(long_word_count)
df.insert(7,"Long Words", long_word_counts)
df.to_csv("new_output.csv")

docs.netapp.com
BlueXP
BlueXP
migration
reports
Frequently
questions
BlueXP
migration
reports
05/30/2023
you’re
looking
answer
question.
Access
BlueXP
migration
reports
service
What’s
BlueXP
migration
reports
service
browser,
enter:
https://console.bluexp.netapp.com/
access
BlueXP
console.
BlueXP
migration
reports?
BlueXP
migration
reports
service
require
BlueXP
migration
reports
option
automatically
enabled
BlueXP
navigation.
broker?
BlueXP
(Cloud
Sync),
broker.
However,
BlueXP
migration
reports
service
require
broker.
Licensing
license
BlueXP
migration
reports?
NetApp
License
required.
docs.netapp.com
BlueXP
BlueXP
migration
reports
Enable
report
notifications
05/30/2023
BlueXP
migration
reports
service,
notification
appears
BlueXP
notifications
BlueXP
notifications
report
generated.
select
"Info"
notification
setting.
BlueXP
select
Storage
Canvas.
notifications,
select
notifications
right.
enable
notification,
following:
Select
Settings
option
select
Alerts
Notification
Settings.
Se

## Number of Blank Spaces

In [119]:
def count_blanks(text):
    blank_count = 0
    for char in text:
        if char.isspace():
            blank_count += 1
    return blank_count

blank_counts = []
file_list = os.listdir("./files/")
for file in file_list:
    text = open(f"./files/{file}").read()
    blank_count = count_blanks(text)
    blank_counts.append(blank_count)
df.insert(8,"Blanks", blank_counts)
df.to_csv("new_output.csv")

## Number of Words with More Than 2 Syllables

In [123]:
# Function to count syllables in a word
def count_syllables(word):
    syllable_count = syllables.estimate(word.strip())
    return syllable_count

# Function to count syllables in a document
def count_two_syllables_in_document(content):
    words = content.split()
    ctr = 0
    for word in words:
        if count_syllables(word) > 1:
            ctr += 1
    return ctr

two_syl_counts = []
file_list = os.listdir("./files/")
for file in file_list:
    text = open(f"./files/{file}").read()
    two_syl_count = count_two_syllables_in_document(text)
    two_syl_counts.append(two_syl_count)
df.insert(8,">2 Syl Words", two_syl_counts)
df.to_csv("new_output.csv")

docs.netapp.com
migration
reports
Frequently
asked
questions
migration
reports
you’re
looking
answer
question.
Access
migration
reports
service
migration
reports
service
browser,
enter:
https://console.bluexp.netapp.com/
access
console.
migration
reports?
migration
reports
service
require
any
migration
reports
option
automatically
enabled
navigation.
data
broker?
copy
data
broker.
However,
migration
reports
service
require
data
broker.
Licensing
license
use
migration
reports?
NetApp
License
required.
docs.netapp.com
migration
reports
Enable
report
notifications
migration
reports
service,
notification
appears
notifications
page.
also
notifications
alert
users
email
report
generated.
select
"Info"
notification
setting.
select
Storage
Canvas.
notifications,
select
notifications
icon
upper
enable
email
notification,
following:
Select
Settings
option
select
Alerts
Notification
Settings.
Select
Additional
Recipients
Notifications
column,
expand
selecting
arrow.
notification
level,
select
Inf

ValueError: cannot insert >2 Syl Words, already exists

## Anderson's Readability Index
The Rix score is calculated by (number of long words/ number of sentences). 
Long words have more then 6 characters. 

In [111]:
input_file = 'new_output.csv'
output_file = 'scores.csv'

data = pd.read_csv(input_file)
data['RIX'] = round((data['Long Words'] / data['Total Sentence']), 3)
data.to_csv(output_file, index=False)
new_data = data[['RIX']]
new_data.to_csv(output_file, index=False)

## Automated Readability Index
4.71 x (number of characters/number of words) + 0.5 x (number of words/number of sentences) – 21.43

In [113]:
input_file = 'new_output.csv'
output_file = 'scores.csv'
data = pd.read_csv(input_file)
new_data['ARI'] = round((4.71*(data['Total Chars'] / data['Total Words']))+ (0.5*(data['Total Words']/data['Total Sentence']))-21.43, 3)
new_data.to_csv(output_file, index=False)

## Coleman-Liau Index
0.0588L - 0.296S - 15.8
where L is the average number of letters per 100 words
and S is the average number of sentences per 100 words


In [116]:
input_file = 'new_output.csv'
output_file = 'scores.csv'

data = pd.read_csv(input_file)
new_data['CLI'] = round((0.0588*100*(data['Total Chars'] / data['Total Words']))- (0.296*100*(data['Total Sentence']/data['Total Words']))-15.8, 3)
new_data.to_csv(output_file, index=False)

## Dickes-Steiwer Handformel
235.95993 - (7.3021 x average number of characters per word) - (12.56438 x average number of words per sentence) - (50.03293 x type token ratio)

In [118]:
input_file = 'new_output.csv'
output_file = 'scores.csv'

data = pd.read_csv(input_file)
new_data['DSH'] = round(235.95993 - (7.3021*(data['Total Chars']/ data['Total Words'])) - (12.56438*(data['Total Words']/ data['Total Sentence'])) - (50.03293*(data['Unique Words']/data['Total Words'])),3)
new_data.to_csv(output_file, index=False)

## Danielson Bryan
131.059 - (10.264 x (number of characters/ number of blanks)) + (0.0194 x (number of characters/ number of sentences))

In [120]:
input_file = 'new_output.csv'
output_file = 'scores.csv'

data = pd.read_csv(input_file)
new_data['DB'] = round((131.059 - (10.264*(data['Total Chars']/data['Blanks'])) + (0.0194*(data['Total Chars']/data['Total Sentence']))),3)
new_data.to_csv(output_file, index=False)

## Fang's Easy Listening Formula

