In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer


In [2]:
HEADING_LIST = ['h1','h2','h3','h4','h5','h6']
EMPHASISED_LIST = ['strong','b','a', 'blockquote', 'cite', 'em','i','mark','q','pre','u']

webpage_data = pd.read_csv("webpages.csv", low_memory=False, encoding = "ISO-8859-1")
tags =  list(webpage_data.Tag)
urls =  list(webpage_data.URL)
sentences =  list(webpage_data.Sentence)
y_true = list(webpage_data.TrueValue)
webpage_list = list(zip(urls, sentences, tags, y_true))

In [3]:
train, test = train_test_split(webpage_list, test_size = 0.33, random_state = 49)


In [None]:
train

In [4]:
X_train = [sentence for url, sentence, tag, ytrue in train]
Y_train = np.array([ytrue for url, sentence, tag, ytrue in train], dtype = 'float')
X_test = [sentence for url, sentence, tag, ytrue in test]
Y_test = np.array([ytrue for url, sentence, tag, ytrue in test], dtype = 'float')
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
X_train_counts.shape

(13532, 10443)

In [20]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(13532, 10443)

In [25]:

is_emphasised = np.array([1 if tag in EMPHASISED_LIST else 0 for url, sentence, tag, y_label in train])
is_heading = np.array([1 if tag in HEADING_LIST else 0 for url, sentence, tag, y_label in train])
is_title = np.array([1 if tag == 'title' else 0 for url, sentence, tag, y_label in train])
features = np.column_stack((is_heading, is_emphasised, is_title))
features.shape

(13532, 3)

In [13]:
from scipy.sparse import coo_matrix, hstack
A = coo_matrix(X_train_tfidf)
B = coo_matrix(features)
C =hstack([A,B])
C.shape
Y_train.shape

C.shape

(13532, 10446)

In [10]:
X_tests_counts = count_vect.transform(X_test)
X_tests_tfidf = tfidf_transformer.transform(X_tests_counts)
X_tests_counts.shape

(6666, 10443)

In [11]:
X_tests_tfidf = tfidf_transformer.transform(X_tests_counts)
X_tests_tfidf.shape

(6666, 10443)

In [28]:
is_emphasised_test = np.array([1 if tag in EMPHASISED_LIST else 0 for url, sentence, tag, y_label in test])
is_heading_test = np.array([1 if tag in HEADING_LIST else 0 for url, sentence, tag, y_label in test])
is_title_test = np.array([1 if tag == 'title' else 0 for url, sentence, tag, y_label in test])
features_test = np.column_stack((is_heading_test, is_emphasised_test, is_title_test))
features_test.shape

D = coo_matrix(X_tests_tfidf)
E = coo_matrix(features_test)
F =hstack([D,E])
F.shape

(6666, 10446)

In [29]:
from sklearn import linear_model                                                                                                                                              

lin = linear_model.LinearRegression() #initialize regressor                                                                                                                   

lin.fit(C, Y_train) #fit training data                                                                                                                                  
preds = lin.predict(F) #make prediction on X test set   

In [30]:
len(preds)

6666

In [35]:
from sklearn import metrics    
from sklearn.metrics import mean_squared_error, r2_score
print('Mean Absolute Error: \n',metrics.mean_absolute_error(Y_test, preds, multioutput='uniform_average')) #evaluate performance 

# The coefficients
print('Coefficients: \n', lin.coef_)
# The mean squared error
print("Mean squared error: %.2f"
      % mean_squared_error(Y_test, preds))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(Y_test, preds))

Mean Absolute Error: 
 0.0963000117927
Coefficients: 
 [ 0.41695143  0.07709095 -0.27087183 ..., -0.00284868 -0.0095109   0.        ]
Mean squared error: 0.04
Variance score: -0.77


In [None]:
def featurizer(train):
    # create your list that will soon be a list of tuples - there is a tuple for each movie which contains 
    # a dictionary of the movie's features and its genre tag
    total_feature_set = []
    true_labels = []

    # loop through the movies in the training set
    for url, sentence, tag, y_label in train:
        # for each movie, create a dictionary that will hold its features
        feature_dict = {}
        # loop through vocabulary and mark if each word is present or not (Boolean)
        
        feature_dict['is_heading'] = 1 if tag in HEADING_LIST else 0
        feature_dict['is_emphasised'] = 1 if tag in EMPHASISED_LIST else 0
        feature_dict['is_title'] = 1 if tag == 'title' else 0

        total_feature_set.append([feature_dict['is_heading'], feature_dict['is_emphasised'], feature_dict['is_title']])
        #append it all to the main feature set
        true_labels.append(y_label)
        
    return (total_feature_set, true_labels)

In [None]:
title_list = []
for url, sentence, tag, y_label in train:
    if tag == 'h1':
        title_list.append(sentence)
title_list

In [None]:

training_features, true_labels = featurizer(train)
len(training_features)
true_labels[0:10]

In [37]:
cur_url = test[0][0]
cur_max = 0.0
best_sentence = ''
dict_final = {}
print(cur_url)
for i in range(0, len(test)):
    if cur_url == test[i][0]:
        if cur_max < preds[i]:
            cur_max = preds[i]
            best_sentence = test[i][1]
    else:
        cur_max = max(preds[i], 0.0)
        dict_final[cur_url] = (best_sentence, cur_max)
        best_sentence = test[i][1]
        cur_url = test[i][0]

dict_final

https://medium.com/p/83486f42118c


{'https://blog.heartsupport.com/a-letter-to-my-daughter-about-young-men-2bab2fca4971': ('Get updates',
  0.30278869062572361),
 'https://blog.prototypr.io/the-ideal-design-workflow-2c200b8e337d': ('and start creating the UI for your app',
  0.26321332207678094),
 'https://hackernoon.com/how-it-feels-to-learn-javascript-in-2016-d3a717dd577f': (' But they are not cool anymore',
  0.0),
 'https://m.signalvnoise.com/being-tired-isn-t-a-badge-of-honor-fa6d4c8cff4e': (' And thats OK, because the exhaustion is not sustained; its temporary',
  0.20286659507372623),
 'https://medium.com/@ev/welcome-to-medium-9e53ca408c48': ('Ev Williams',
  0.86570956823959899),
 'https://medium.com/@maxbraun/my-bathroom-mirror-is-smarter-than-yours-94b21c6671ba': (', and most recently',
  0.23943631204430646),
 'https://medium.com/@shitHRCcantsay/let-me-remind-you-fuckers-who-i-am-e6e8b297fe47': (' So stop making me dab on Ellen and just give me a fucking chance already',
  0.17842984405861706),
 'https://medi