# Fetch the content of all of the publisher and ad URLs
This includes the title, the meta description, and an extraction of the body text of each page.
The words of the body are also stemmed here so that they are directly accessible later.
All of this is stored in the Postgres database.

In [1]:
import pandas as pd
from lxml import html
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.stem.snowball import SnowballStemmer
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer

In [2]:
# Load the cleaned csv for the experiment being processed.
# To switch experiments, read the other file instead:
#raw_df = pd.read_csv('experiment2_data.csv')
raw_df = pd.read_csv('experiment1_data.csv')

# Keep only the unit id and the two urls (dropping all of the score columns)
# and collapse repeated rows so each unit/start/end combination appears once.
feature_df = raw_df.loc[:, ['_unit_id', 'starturl', 'endurl']].drop_duplicates()

### define methods to extract and process url content

In [3]:
# domain grabber
def get_domain(url):
    """Return the site name of a url without scheme, ``www.`` prefix or TLD.

    Only the host part of the url is inspected, so a ``www.`` or ``.com``
    that happens to appear in the path or query string is ignored.

    Parameters
    ----------
    url : str
        The url to inspect, with or without a ``scheme://`` prefix.

    Returns
    -------
    str
        The host with a leading ``www.`` removed and everything from the
        first ``com`` / ``net`` / ``org`` label onwards dropped (``.com`` is
        looked for first, then ``.net``, then ``.org``).  Hosts that use none
        of those labels are returned whole, e.g. ``'example.io'``.

    Examples
    --------
    >>> get_domain('http://www.example.com/some/page')
    'example'
    >>> get_domain('https://blog.example.org')
    'blog.example'
    """
    # drop the scheme when there is one
    remainder = url.split('://', 1)[-1]

    # the host ends at the first path, query or fragment delimiter
    for delimiter in ('/', '?', '#'):
        remainder = remainder.split(delimiter, 1)[0]

    # strip optional "user:password@" credentials and ":port"
    host = remainder.rsplit('@', 1)[-1].split(':', 1)[0]

    if host.startswith('www.'):
        host = host[len('www.'):]

    # compare whole dot-separated labels so that a name such as
    # "computer.com" is not cut at the ".com" inside ".computer"
    labels = host.split('.')
    for tld in ('com', 'net', 'org'):
        if tld in labels[1:]:
            return '.'.join(labels[:labels.index(tld, 1)])
    return host

In [15]:
# title, text and description grabber
def get_title_and_text_and_description(url):
    """Download a page and pull out its title, text fragments and description.

    Parameters
    ----------
    url : str
        Address of the page, fetched with ``requests.get``.

    Returns
    -------
    tuple
        ``(title, strings, description)`` where

        * ``title`` is the string of the ``<title>`` tag, or ``'none'`` when
          the page has no ``<title>`` tag,
        * ``strings`` is the list of whitespace-stripped text fragments of
          the page,
        * ``description`` is the utf-8 encoded ``content`` attribute of the
          description meta tag, or ``"no description"`` when there is no such
          tag or it has no ``content`` attribute.
    """
    # load the page from the url
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')

    # pull the title
    if soup.title is None:
        title = 'none'
    else:
        title = soup.title.string

    # get the body as a list of text fragments
    strings = list(soup.stripped_strings)

    # get the description from the meta tags; sites capitalise the attribute
    # value differently ("description", "Description", ...), so the name is
    # matched case-insensitively in a single lookup
    meta = soup.find('meta', attrs={'name': re.compile(r'^description$', re.IGNORECASE)})
    content = None if meta is None else meta.get('content')
    if content is not None:
        description = content.encode('utf-8')
    else:
        description = "no description"

    # echo the description as a visual sanity check while crawling
    print(description)
    return title, strings, description

In [16]:
# list of characters indicative of javascript
bad_characters = ['}','{', '[' , ']' , '|', '/', '_', '\\', '\t', '\n', '\'','\r']

# score the degree to which the string actually resembles language as spoken by a human
def language_score(string):
    """Return the fraction of characters in ``string`` that are in ``bad_characters``.

    A score of 0.0 means none of the listed characters occur; higher scores
    mean the string looks less like human language.

    Parameters
    ----------
    string : str
        The text fragment to score.

    Returns
    -------
    float
        Number of occurrences of the listed characters divided by the length
        of the string.  An empty string scores 0.0 rather than raising
        ``ZeroDivisionError``.
    """
    if not string:
        return 0.0
    baddies = sum(string.count(bad_character) for bad_character in bad_characters)
    # float() keeps this a true division under python 2 as well
    return baddies / float(len(string))

# since this is bag of words, all accepted fragments are put together into one
# string instead of being kept as a list
def get_content(string_list, nonsense_threshold=.0005, length_threshold=40):
    """Concatenate the text fragments that look like human language.

    Parameters
    ----------
    string_list : iterable of str
        Text fragments extracted from a page.
    nonsense_threshold : float, optional
        A fragment is kept only when ``language_score(fragment)`` is strictly
        below this value.  With the default, a single listed character
        (apostrophes included) rejects any fragment of 2000 characters or
        fewer, because 1/2000 == .0005.
    length_threshold : int, optional
        A fragment is kept only when it is strictly longer than this many
        characters.

    Returns
    -------
    str
        The kept fragments, each reduced to its ascii characters and followed
        by a single space, in their original order.
    """
    kept = []
    for string in string_list:
        # test the length first: it is cheaper, and it keeps empty fragments
        # away from language_score
        if len(string) > length_threshold and language_score(string) < nonsense_threshold:
            kept.append(string.encode('ascii', 'ignore') + ' ')
    return ''.join(kept)

In [17]:
# extract the keywords

stemmer = SnowballStemmer("english")

def tokenize_and_stem(text):
    """Return the stems of every word token in ``text`` that contains a letter.

    The text is split into sentences first and then into words, so that
    punctuation is caught as its own token.  Tokens without any letter
    (numbers, raw punctuation) are dropped before stemming.
    """
    stems = []
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            # skip tokens that contain no letters at all
            if re.search('[a-zA-Z]', word):
                stems.append(stemmer.stem(word))
    return stems

def tokenize_only(text):
    """Return the lower-cased word tokens of ``text`` that contain a letter.

    The text is split into sentences first and then into words, so that
    punctuation is caught as its own token.  Tokens without any letter
    (numbers, raw punctuation) are dropped.  No stemming is applied.
    """
    lowered_tokens = (
        word.lower()
        for sentence in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sentence)
    )
    return [token for token in lowered_tokens if re.search('[a-zA-Z]', token)]

In [18]:
## go through each of the urls and get the titles and the content of the page
def process_urls(list_of_urls):
    """Fetch every url and collect its domain, title, description and text.

    Parameters
    ----------
    list_of_urls : sequence
        The urls to process.  Missing entries (``NaN`` or ``None``) are
        allowed and produce placeholder values instead of a fetch.

    Returns
    -------
    list
        Six parallel lists, one entry per input url and in input order:
        ``[url_list, domains, titles, descriptions, extracted_texts,
        stemmed_word_lists]``.  For a missing url every list receives
        ``'na'``, except ``stemmed_word_lists`` which receives ``['na']``.
    """
    url_list = []
    domains = []
    titles = []
    descriptions = []
    extracted_texts = []
    stemmed_word_lists = []

    end_count = len(list_of_urls)

    for counter, url in enumerate(list_of_urls, 1):
        if pd.isnull(url):
            # missing url: add a placeholder to every list so that the six
            # lists stay the same length and rows stay aligned
            print("oops")
            url_list.append('na')
            domains.append('na')
            titles.append('na')
            descriptions.append('na')
            extracted_texts.append('na')
            stemmed_word_lists.append(['na'])
        else:
            url_list.append(url)
            domains.append(get_domain(url))
            (title, text, description) = get_title_and_text_and_description(url)
            titles.append(title)
            descriptions.append(description)
            extracted_text = get_content(text)
            extracted_texts.append(extracted_text)
            stemmed_word_lists.append(tokenize_and_stem(extracted_text))

        # the progress counter bit for my own sanity check (percent done)
        print(round(((counter + 0.0) / end_count) * 100, 2))

    return [url_list, domains, titles, descriptions, extracted_texts, stemmed_word_lists]

### process the urls
Work through all of the start urls, then the end urls.
This could be sped up by doing this AFTER processing the judgments, so that the ~5% of publisher/advertiser url pairs that get eliminated are not processed.


In [None]:
# fetch and process every start url; the result is the list of six parallel
# lists returned by process_urls
start_content = process_urls(list_of_urls=feature_df['starturl'])

Listen and sing "Tweedle Dee" by LaVern Baker from "Departed" soundtrack. Song lyrics: Tweedlee tweedlee tweedlee dee I'm as happy as can be Jimminy cricket jimminy jack You make my heart go clickety-clack Tweedlee tweed
0.11
Listen and sing "Tweedle Dee" by LaVern Baker from "Departed" soundtrack. Song lyrics: Tweedlee tweedlee tweedlee dee I'm as happy as can be Jimminy cricket jimminy jack You make my heart go clickety-clack Tweedlee tweed
0.21
Listen and sing "Honey Bun" by  from "South Pacific" soundtrack. Song lyrics: Nellie: My doll is as dainty as a sparrow, Her figure is somethin' to applaud. Where she's narrow she's as narrow an arrow, And she's broad wh
0.32
Listen and sing "Honey Bun" by  from "South Pacific" soundtrack. Song lyrics: Nellie: My doll is as dainty as a sparrow, Her figure is somethin' to applaud. Where she's narrow she's as narrow an arrow, And she's broad wh
0.42
Listen and sing "He Is Not Dead Yet" by Spamalot Cast from "Spamalot" soundtrack. Song lyrics: R

In [None]:
# fetch and process every end url; the result is the list of six parallel
# lists returned by process_urls
end_content = process_urls(list_of_urls=feature_df['endurl'])

### output to the database 

In [15]:
# output to the webpage table in the db

# need libraries for connecting with the database
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2

# create sql engine
# the dialect name has to be spelled "postgresql": the old "postgres" alias was
# removed in SQLAlchemy 1.4, while "postgresql" is accepted by every version
dbname = 'urx'
username = 'noahburbank'
engine = create_engine('postgresql://%s@localhost/%s'%(username,dbname))

In [16]:
# turn the process_urls results into data frames
# each result is a list of six parallel lists, so the raw frame has one row
# per field; transposing gives one row per url with the fields as columns
end_df = pd.DataFrame(data=end_content).T
end_df['_unit_id'] = list(feature_df['_unit_id'])
start_df = pd.DataFrame(data=start_content).T
start_df['_unit_id'] = list(feature_df['_unit_id'])

# join the start and end halves of each unit into a single row
all_web_data = start_df.merge(end_df, on = '_unit_id')

# give the columns readable names: six start fields, the merge key, six end fields
# NOTE(review): '_unit_it' and "end_decription" look like typos for '_unit_id'
# and 'end_description'; they are left untouched because they become column
# names of the uploaded table -- confirm with downstream readers before renaming
all_web_data.columns = [
    'starturl',
    'start_domain',
    'start_titles',
    "start_description",
    'start_extracted_content',
    'start_stemmed_word_list',
    '_unit_it',
    'endurl',
    'end_domain',
    'end_titles',
    "end_decription",
    'end_extracted_content',
    'end_stemmed_word_list',
]

In [1]:
# upload to the database
# the table name identifies the experiment: change it when uploading another
# experiment's data, because if_exists='replace' overwrites a table of the same name
# NOTE(review): the table name says exp2 while the csv loaded at the top of this
# notebook is experiment1_data.csv -- confirm the intended pairing
all_web_data.to_sql(name='webpage_data_table_exp2', con=engine, if_exists='replace')

NameError: name 'all_web_data' is not defined