In [48]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import train_test_split

In [37]:
HEADING_LIST = ['h1','h2','h3','h4','h5','h6']
EMPHASISED_LIST = ['strong','b','a', 'blockquote', 'cite', 'em','i','mark','q','pre','u']

webpage_data = pd.read_csv("webpages.csv", low_memory=False, encoding = "ISO-8859-1")
tags =  list(webpage_data.Tag)
urls =  list(webpage_data.URL)
sentences =  list(webpage_data.Sentence)
y_true = list(webpage_data.TrueValue)
webpage_list = list(zip(urls, sentences, tags, y_true))
webpage_list

[('https://medium.com/@ev/welcome-to-medium-9e53ca408c48',
  'Welcome toMedium',
  'h1',
  0.12244898),
 ('https://medium.com/@ev/welcome-to-medium-9e53ca408c48',
  'Welcome toMedium',
  nan,
  0.12244898),
 ('https://medium.com/@ev/welcome-to-medium-9e53ca408c48',
  'Bring Your Stories andIdeas',
  'h2',
  0.22564102600000002),
 ('https://medium.com/@ev/welcome-to-medium-9e53ca408c48',
  'Bring Your Stories andIdeas',
  nan,
  0.22564102600000002),
 ('https://medium.com/@ev/welcome-to-medium-9e53ca408c48',
  'Medium is a new place on the Internet where people share ideas and stories that are longer than 140 characters and not just for friends',
  'p',
  0.33757961799999997),
 ('https://medium.com/@ev/welcome-to-medium-9e53ca408c48',
  ' Its designed for little stories that make your day better and manifestos that change the world',
  'p',
  0.33802816899999999),
 ('https://medium.com/@ev/welcome-to-medium-9e53ca408c48',
  ' Its used by everyone from professional journalists to amateur

In [38]:
train, test = train_test_split(webpage_list, test_size = 0.33, random_state = 49)


In [39]:
train

[('https://medium.com/swlh/how-technology-hijacks-peoples-minds-from-a-magician-and-google-s-design-ethicist-56d62ef5edf3',
  'conscious choice',
  nan,
  0.091836735000000003),
 ('https://medium.com/p/8dc34fbe592b',
  'GE CEO Jeff Immelt wakes up at 5:30am and gets in a cardio workout while reading the paper and watching CNBC',
  nan,
  0.27397260299999998),
 ('https://medium.com/the-mission/50-ways-happier-healthier-and-more-successful-people-live-on-their-own-terms-31ba8f482448',
  'Heres a tip if youre just starting out: start your shower warm, as usual',
  nan,
  0.23134328399999998),
 ('https://medium.com/p/a988c17383a6',
  ' The more perfectionistic the professor, the less productive they are, Dr',
  nan,
  0.33175355499999998),
 ('https://medium.com/the-mission/50-ways-happier-healthier-and-more-successful-people-live-on-their-own-terms-31ba8f482448',
  ' Stop pretending youll live forever',
  nan,
  0.19512195100000002),
 ('https://medium.com/@thatdavidhopkins/how-a-tv-sitcom-

In [40]:
def featurizer(train):
    # create your list that will soon be a list of tuples - there is a tuple for each movie which contains 
    # a dictionary of the movie's features and its genre tag
    total_feature_set = []
    true_labels = []

    # loop through the movies in the training set
    for url, sentence, tag, y_label in train:
        # for each movie, create a dictionary that will hold its features
        feature_dict = {}
        # loop through vocabulary and mark if each word is present or not (Boolean)
        
        feature_dict['is_heading'] = 1 if tag in HEADING_LIST else 0
        feature_dict['is_emphasised'] = 1 if tag in EMPHASISED_LIST else 0
        feature_dict['is_title'] = 1 if tag == 'title' else 0

        total_feature_set.append([feature_dict['is_heading'], feature_dict['is_emphasised'], feature_dict['is_title']])
        #append it all to the main feature set
        true_labels.append(y_label)
        
    return (total_feature_set, true_labels)

In [46]:

training_features, true_labels = featurizer(train)
len(training_features)
true_labels[0:10]

[0.091836735000000003,
 0.27397260299999998,
 0.23134328399999998,
 0.33175355499999998,
 0.19512195100000002,
 0.32900432899999998,
 0.33333333300000001,
 0.24175824199999998,
 0.321428571,
 0.26213592199999997]

ValueError: Unknown label type: (array([ 0.09183674,  0.2739726 ,  0.23134328, ...,  0.18257261,
        0.2885906 ,  0.24074074]),)

In [102]:



is_emphasised = np.array([1 if tag in EMPHASISED_LIST else 0 for url, sentence, tag, y_label in train])
is_heading = np.array([1 if tag in HEADING_LIST else 0 for url, sentence, tag, y_label in train])
is_title = np.array([1 if tag == 'title' else 0 for url, sentence, tag, y_label in train])
features = np.column_stack((is_heading, is_emphasised, is_title))
y_true = np.array([y_label for url, sentence, tag, y_label in train], dtype='float')

In [104]:
y_true

array([ 0.09183674,  0.2739726 ,  0.23134328, ...,  0.18257261,
        0.2885906 ,  0.24074074])

In [110]:
is_emphasised_test = np.array([1 if tag in EMPHASISED_LIST else 0 for url, sentence, tag, y_label in test])
is_heading_test = np.array([1 if tag in HEADING_LIST else 0 for url, sentence, tag, y_label in test])
is_title_test = np.array([1 if tag == 'title' else 0 for url, sentence, tag, y_label in test])
features_test = np.column_stack((is_heading_test, is_emphasised_test, is_title_test))
y_test = np.array([y_label for url, sentence, tag, y_label in test], dtype='float')


In [111]:
from sklearn import linear_model                                                                                                                                              

lin = linear_model.LinearRegression() #initialize regressor                                                                                                                   

lin.fit(features, y_true) #fit training data                                                                                                                                  
preds = lin.predict(features_test) #make prediction on X test set   

In [108]:
preds

array([ 0.22263501,  0.27305838,  0.21309868, ...,  0.27305838,
        0.27305838,  0.27305838])

In [114]:
from sklearn import metrics    
metrics.mean_absolute_error(y_test, preds) #evaluate performance 

0.090437289801165457

In [119]:
cur_url = test[0][0]
cur_max = 0.0
best_sentence = ''
dict_final = {}
print(cur_url)
for i in range(0, len(test)):
    if cur_url == test[i][0]:
        #print(cur_url, preds[i], test[i][1])
        if cur_max < preds[i]:
            cur_max = preds[i]
            best_sentence = test[i][1]
    else:
        cur_max = preds[i]
        dict_final[cur_url] = best_sentence
        best_sentence = ''

dict_final

https://medium.com/p/83486f42118c


{'https://medium.com/p/83486f42118c': ''}

In [81]:
y_true.shape

(13532,)

In [82]:
y_true

array([ 0.09183674,  0.2739726 ,  0.23134328, ...,  0.18257261,
        0.2885906 ,  0.24074074])

In [83]:
y

array([1, 2, 3, 4, 5, 6])

In [84]:
features

array([[0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       ..., 
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0]])

In [85]:
X

array([[2, 0, 4, 0, 3, 3, 2, 2, 0, 1, 4, 2, 1, 3, 3, 4, 0, 0, 3, 0, 1, 4,
        0, 0, 0, 4, 2, 0, 3, 3, 2, 1, 1, 3, 3, 2, 4, 2, 2, 2, 0, 3, 4, 4,
        0, 4, 3, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 4, 0, 1, 0, 4, 3, 3, 0, 0,
        1, 0, 4, 0, 1, 2, 3, 1, 3, 0, 4, 4, 2, 0, 1, 2, 2, 4, 0, 3, 3, 3,
        2, 0, 1, 0, 4, 4, 2, 2, 2, 0, 0, 0],
       [0, 0, 3, 2, 4, 4, 0, 1, 0, 0, 2, 4, 0, 4, 4, 1, 0, 1, 3, 2, 0, 2,
        0, 2, 2, 0, 3, 3, 3, 0, 1, 1, 2, 0, 4, 0, 3, 4, 4, 4, 1, 0, 2, 2,
        1, 4, 1, 2, 4, 4, 2, 4, 4, 2, 1, 3, 1, 2, 2, 0, 1, 1, 0, 1, 1, 0,
        3, 1, 1, 2, 1, 2, 3, 3, 4, 0, 4, 3, 2, 2, 3, 3, 1, 4, 2, 4, 0, 0,
        1, 4, 0, 4, 3, 2, 3, 4, 0, 4, 1, 3],
       [1, 2, 0, 1, 0, 1, 0, 2, 3, 3, 2, 4, 4, 3, 4, 0, 4, 0, 4, 0, 2, 3,
        0, 4, 0, 2, 4, 0, 3, 3, 0, 0, 4, 2, 2, 3, 0, 2, 4, 1, 2, 2, 2, 3,
        4, 2, 2, 4, 3, 4, 0, 0, 0, 3, 4, 2, 1, 4, 0, 4, 1, 4, 1, 4, 0, 2,
        1, 2, 2, 2, 2, 3, 2, 0, 2, 3, 3, 2, 4, 3, 0, 0, 0, 4, 3, 1, 2, 3,
        1, 2, 0, 3, 3,