In [2]:
import pandas as pd
from lxml import html
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.stem.snowball import SnowballStemmer
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer

In [3]:
# load the raw export
raw_df = pd.read_csv('f835051.csv')

# keep only the rows that are not golden (the golden ones are for training)
not_golden = raw_df['_golden'] == False
raw_df = raw_df[not_golden]

# one row per distinct (unit, start url, end url) triple; the other columns
# (e.g. the scores) are left behind in raw_df
pair_columns = ['_unit_id', 'starturl', 'endurl']
feature_df = raw_df[pair_columns].drop_duplicates()

In [4]:
# remove the entries for which there is no destination url
#
# A missing endurl shows up as NaN, so notnull() marks the rows worth keeping.
# NOTE: despite its name, nan_list is the *keep* mask (True = endurl present);
# the name is left unchanged in case other cells refer to it.
nan_list = feature_df['endurl'].notnull().tolist()

reduced_feature_df = feature_df[nan_list]

# sanity check: shape after the filter, then shape before it
print(reduced_feature_df.shape)
print(feature_df.shape)

# from here on feature_df only holds rows that have a destination url
feature_df = reduced_feature_df

(950, 3)
(994, 3)


In [5]:
# domain grabber
def get_domain(url):
    """Return the site name of ``url``.

    The site name is the host with a leading ``www.`` removed and cut just
    before a ``.com`` / ``.net`` / ``.org`` label, e.g.
    ``'http://www.example.com/page'`` -> ``'example'`` and
    ``'https://blog.example.org'`` -> ``'blog.example'``.

    Only the host part is inspected, so text in the path or query string can
    never be mistaken for ``www.`` or for the suffix.  When the host carries
    none of the three suffixes the whole host is returned.

    Parameters
    ----------
    url : str
        Absolute URL; a value without a ``scheme://`` prefix is treated as
        starting directly with the host.

    Returns
    -------
    str
    """
    # drop the scheme, if there is one
    scheme_end = url.find('://')
    remainder = url[scheme_end + 3:] if scheme_end != -1 else url

    # the host stops at the first path, query or fragment delimiter
    host = re.split('[/?#]', remainder, maxsplit=1)[0]

    # only a *leading* "www." is decoration
    if host.startswith('www.'):
        host = host[len('www.'):]

    # the suffix has to be a complete label: followed by the end of the host,
    # another dot (".com.au") or a port (".com:8080") -- this keeps names such
    # as "company" or "network" from matching ".com" / ".net"
    suffix = re.search(r'\.(?:com|net|org)(?=$|[.:])', host)
    if suffix is None:
        return host
    return host[:suffix.start()]

In [6]:
# title and text grabber
def get_title_and_text(url, timeout=30):
    """Download ``url`` and return its title plus its text fragments.

    Parameters
    ----------
    url : str
        Address handed to ``requests.get``.
    timeout : float, optional
        Seconds ``requests`` may wait on the server before giving up.
        Without a timeout a single unresponsive host blocks forever.

    Returns
    -------
    (title, strings)
        ``title`` is ``soup.title.string``, or ``None`` when the page has no
        ``<title>`` tag.  ``strings`` is the list of whitespace-stripped
        strings of the document, in the order BeautifulSoup yields them.

    Raises
    ------
    Whatever ``requests.get`` raises (connection errors, timeouts, ...);
    nothing is caught here.
    """
    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.content, 'html.parser')

    # soup.title is None for documents without a <title> element
    title_tag = soup.title
    title = title_tag.string if title_tag is not None else None

    strings = list(soup.stripped_strings)
    return title, strings

In [7]:
# get the actual content of the website

# characters treated as nonsense; language_score counts how often they occur
bad_characters = [
    '}', '{', '[', ']',
    '|', '/', '_', '\\',
    '\t', '\n', "'", '\r',
]

# score how much of the string is made of nonsense characters
def language_score(string):
    """Return the share of ``string`` taken up by ``bad_characters``.

    The score is (total occurrences of the characters in ``bad_characters``)
    divided by ``len(string)``, so 0.0 means no such character was found and
    larger values mean the string looks less like ordinary prose.

    An empty string scores 0.0 instead of raising ZeroDivisionError.
    """
    if len(string) == 0:
        return 0.0
    baddies = sum(string.count(bad_character) for bad_character in bad_characters)
    # float() keeps this a true division on Python 2 as well
    return float(baddies) / len(string)

# since this is bag of words, i'm just going to put them all together instead of worrying about the list output
# there are two parameters to tune: how long the strings have to be, and what ratio of nonsense characters is tolerated
def get_content(string_list, nonsense_threshold=.0005, length_threshold=40):
    """Concatenate the strings of ``string_list`` that look like real text.

    A string is kept when it is longer than ``length_threshold`` characters
    and its ``language_score`` is below ``nonsense_threshold``.  Every kept
    string is reduced to ASCII (other characters are dropped) and followed by
    a single space, so a non-empty result ends with a space.

    Parameters
    ----------
    string_list : iterable of text strings
    nonsense_threshold : float, optional
        Upper bound (exclusive) on the tolerated ``language_score``.  With the
        default of 0.0005 one nonsense character already disqualifies any
        string shorter than 2000 characters.
    length_threshold : int, optional
        Strings of this length or shorter are skipped.

    Returns
    -------
    str
        The kept strings joined together; empty when nothing qualifies.
    """
    kept = []
    for string in string_list:
        # length first: it is the cheap test, and it keeps empty strings
        # away from the division inside language_score
        if len(string) > length_threshold and language_score(string) < nonsense_threshold:
            kept.append(string.encode('ascii','ignore') + ' ')
    return str().join(kept)

In [8]:
# extract the keywords 
# one English Snowball stemmer, created once here and used by tokenize_and_stem
stemmer = SnowballStemmer("english")


def tokenize_and_stem(text):
    """Split ``text`` into word tokens and return the stem of each one.

    The text is tokenized by sentence first and by word second, so that
    punctuation ends up as its own token.  Tokens without any ASCII letter
    (numbers, bare punctuation) are discarded before stemming with the
    module-level ``stemmer``.
    """
    has_letter = re.compile('[a-zA-Z]')

    words = []
    for sentence in nltk.sent_tokenize(text):
        words.extend(nltk.word_tokenize(sentence))

    return [stemmer.stem(word) for word in words if has_letter.search(word)]

def tokenize_only(text):
    """Split ``text`` into lower-cased word tokens, without stemming.

    The text is tokenized by sentence first and by word second, so that
    punctuation ends up as its own token.  Tokens without any ASCII letter
    (numbers, bare punctuation) are left out of the result.
    """
    lowered = []
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            lowered.append(word.lower())

    return [word for word in lowered if re.search('[a-zA-Z]', word)]

In [9]:
## go through each of the urls and get the titles and the content of the page
def process_urls(list_of_urls):
    """Fetch every url and collect what was extracted from its page.

    Parameters
    ----------
    list_of_urls : sequence
        Urls as strings.  Missing entries (NaN / None) are allowed.

    Returns
    -------
    list
        ``[url_list, domains, titles, extracted_texts, stemmed_word_lists]``:
        five lists with one entry per input url, in input order.  A missing
        url contributes the placeholder ``'na'`` to the first four lists and
        ``['na']`` to the last, so the lists always stay aligned with the
        input.

    Notes
    -----
    Prints ``oops`` for each missing url and the percentage of urls handled
    after every entry.  Errors raised while fetching or parsing a page are
    not caught.
    """
    url_list = []
    domains = []
    titles = []
    extracted_texts = []
    stemmed_word_lists = []

    end_count = len(list_of_urls)

    for counter, url in enumerate(list_of_urls, 1):
        if pd.isnull(url):
            # no url for this row: keep a placeholder in every list
            print("oops")
            url_list.append('na')
            domains.append('na')
            titles.append('na')
            extracted_texts.append('na')
            stemmed_word_lists.append(['na'])
        else:
            url_list.append(url)
            domains.append(get_domain(url))
            (title, text) = get_title_and_text(url)
            titles.append(title)
            extracted_text = get_content(text)
            extracted_texts.append(extracted_text)
            stemmed_word_lists.append(tokenize_and_stem(extracted_text))

        # the progress counter bit for my own sanity check
        # (placeholder rows count too, so the last value printed is 100.0)
        print(round(((counter + 0.0)/end_count)*100,2))

    return [url_list, domains, titles, extracted_texts, stemmed_word_lists]

In [10]:
# process every start page (slow: one HTTP request per url); the result is the
# list of five parallel lists returned by process_urls
start_content = process_urls(feature_df['starturl'])

0.11
0.21
0.32
0.42
0.53
0.63
0.74
0.84
0.95
1.05
1.16
1.26
1.37
1.47
1.58
1.68
1.79
1.89
2.0
2.11
2.21
2.32
2.42
2.53
2.63
2.74
2.84
2.95
3.05
3.16
3.26
3.37
3.47
3.58
3.68
3.79
3.89
4.0
4.11
4.21
4.32
4.42
4.53
4.63
4.74
4.84
4.95
5.05
5.16
5.26
5.37
5.47
5.58
5.68
5.79
5.89
6.0
6.11
6.21
6.32
6.42
6.53
6.63
6.74
6.84
6.95
7.05
7.16
7.26
7.37
7.47
7.58
7.68
7.79
7.89
8.0
8.11
8.21
8.32
8.42
8.53
8.63
8.74
8.84
8.95
9.05
9.16
9.26
9.37
9.47
9.58
9.68
9.79
9.89
10.0
10.11
10.21
10.32
10.42
10.53
10.63
10.74
10.84
10.95
11.05
11.16
11.26
11.37
11.47
11.58
11.68
11.79
11.89
12.0
12.11
12.21
12.32
12.42
12.53
12.63
12.74
12.84
12.95
13.05
13.16
13.26
13.37
13.47
13.58
13.68
13.79
13.89
14.0
14.11
14.21
14.32
14.42
14.53
14.63
14.74
14.84
14.95
15.05
15.16
15.26
15.37
15.47
15.58
15.68
15.79
15.89
16.0
16.11
16.21
16.32
16.42
16.53
16.63
16.74
16.84
16.95
17.05
17.16
17.26
17.37
17.47
17.58
17.68
17.79
17.89
18.0
18.11
18.21
18.32
18.42
18.53
18.63
18.74
18.84
18.95
19.05
19.16
19.26
19.37

In [14]:
# process every destination page (slow: one HTTP request per url); the result
# is the list of five parallel lists returned by process_urls
end_content = process_urls(feature_df['endurl'])

0.11
0.21
0.32
0.42
0.53
0.63
0.74
0.84
0.95
1.05
1.16
1.26
1.37
1.47
1.58
1.68
1.79
1.89
2.0
2.11
2.21
2.32
2.42
2.53
2.63
2.74
2.84
2.95
3.05
3.16
3.26
3.37
3.47
3.58
3.68
3.79
3.89
4.0
4.11
4.21
4.32
4.42
4.53
4.63
4.74
4.84
4.95
5.05
5.16
5.26
5.37
5.47
5.58
5.68
5.79
5.89
6.0
6.11
6.21
6.32
6.42
6.53
6.63
6.74
6.84
6.95
7.05
7.16
7.26
7.37
7.47
7.58
7.68
7.79
7.89
8.0
8.11
8.21
8.32
8.42
8.53
8.63
8.74
8.84
8.95
9.05
9.16
9.26
9.37
9.47
9.58
9.68
9.79
9.89
10.0
10.11
10.21
10.32
10.42
10.53
10.63
10.74
10.84
10.95
11.05
11.16
11.26
11.37
11.47
11.58
11.68
11.79
11.89
12.0
12.11
12.21
12.32
12.42
12.53
12.63
12.74
12.84
12.95
13.05
13.16
13.26
13.37
13.47
13.58
13.68
13.79
13.89
14.0
14.11
14.21
14.32
14.42
14.53
14.63
14.74
14.84
14.95
15.05
15.16
15.26
15.37
15.47
15.58
15.68
15.79
15.89
16.0
16.11
16.21
16.32
16.42
16.53
16.63
16.74
16.84
16.95
17.05
17.16
17.26
17.37
17.47
17.58
17.68
17.79
17.89
18.0
18.11
18.21
18.32
18.42
18.53
18.63
18.74
18.84
18.95
19.05
19.16
19.26
19.37

In [15]:
# put this into the database, so that i don't need to process it each time,
# because it is very slow
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
# psycopg2 is not referenced by name in this cell
# NOTE(review): presumably imported so that a missing PostgreSQL driver fails here -- confirm
import psycopg2

dbname = 'urx'
username = 'noahburbank'

# "postgresql" is the dialect name SQLAlchemy expects; the old "postgres"
# spelling is rejected by SQLAlchemy 1.4 and later
engine = create_engine('postgresql://%s@localhost/%s'%(username,dbname))

In [None]:
# turn the two process_urls results into data frames, one row per url
# (each result is a list of five columns, hence the transpose)
unit_ids = list(feature_df['_unit_id'])

start_columns = ['starturl', 'start_domain', 'start_titles', 'start_extracted_content', 'start_stemmed_word_list']
start_df = pd.DataFrame(data=start_content).transpose()
start_df.columns = start_columns
start_df['_unit_id'] = unit_ids

end_columns = ['endurl', 'end_domain', 'end_titles', 'end_extracted_content', 'end_stemmed_word_list']
end_df = pd.DataFrame(data=end_content).transpose()
end_df.columns = end_columns
end_df['_unit_id'] = unit_ids

In [None]:
# join the start-page and end-page frames on their shared unit id
all_web_data = start_df.merge(end_df, on='_unit_id')

In [None]:
# write the merged frame to the database, replacing the table if it already exists
all_web_data.to_sql(name='webpage_data_table', con=engine, if_exists='replace')