In [1]:
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

import pandas as pd
import re 

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.model_selection import train_test_split

from bs4 import BeautifulSoup
from datetime import datetime

In [2]:
df = pd.read_csv('../datasets/train.csv')
print(df.head(5))

   Id  Popularity                                       Page content
0   0          -1  <html><head><div class="article-info"> <span c...
1   1           1  <html><head><div class="article-info"><span cl...
2   2           1  <html><head><div class="article-info"><span cl...
3   3          -1  <html><head><div class="article-info"><span cl...
4   4          -1  <html><head><div class="article-info"><span cl...


In [3]:
# extract df[0] for checking the content
tmp = df['Page content'][1]
tmp = BeautifulSoup(tmp, 'html.parser')
with open("content2.html", "w") as f:
    f.write(str(tmp.prettify()))


In [7]:
def attribute_attract(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # extract images
    images = soup.find_all('img')
    # extract links
    links = soup.find_all('a')
    
    # extract title
    h1_tag = soup.find('h1', {'class': 'title'})
    title = ""
    if h1_tag is not None:
        title = h1_tag.text
    
    # extract categories 
    categories = []

        # Find the <a> tags
    for a_tag in soup.find_all('a', href=True):
        # Extract the href attribute
        href_value = a_tag['href']
        
        # Use regex to extract the category
        match = re.search(r'category/(.*)/', href_value)
        
        if match:
            categories.append(match.group(1))
    
    # extract channel
    channel_tag = soup.find('meta', {'property': 'article:section'})
    article_tag = soup.find('article', {'data-channel': True})
    channel = article_tag['data-channel']

    return len(images), len(links), title, categories, len(categories), channel

df[['img_count', 'link_count', 'title', 'categories', 'categories_count', 'channel']] = df['Page content'].apply(attribute_attract).apply(pd.Series)
print(df.head(5))

   Id  Popularity                                       Page content  \
0   0          -1  <html><head><div class="article-info"> <span c...   
1   1           1  <html><head><div class="article-info"><span cl...   
2   2           1  <html><head><div class="article-info"><span cl...   
3   3          -1  <html><head><div class="article-info"><span cl...   
4   4          -1  <html><head><div class="article-info"><span cl...   

   img_count  link_count                                              title  \
0          1          22  NASA's Grand Challenge: Stop Asteroids From De...   
1          2          18  Google's New Open Source Patent Pledge: We Won...   
2          2          11  Ballin': 2014 NFL Draft Picks Get to Choose Th...   
3          1          13        Cameraperson Fails Deliver Slapstick Laughs   
4         52          16  NFL Star Helps Young Fan Prove Friendship With...   

                                          categories  categories_count  \
0  [asteroid, aste

In [8]:
def extract_timedata(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    time_tag = soup.find('time', datetime=True)
    if time_tag is None:
        return -1
    datetime_str = time_tag['datetime']
    datetime_obj = datetime.strptime(datetime_str, "%a, %d %b %Y %H:%M:%S %z")

    day_of_week = datetime_obj.weekday()
    year = datetime_obj.year
    month = datetime_obj.month
    day = datetime_obj.day
    hour = datetime_obj.hour
    minutes = datetime_obj.minute
    sec = datetime_obj.second
    timezone = datetime_obj.tzinfo.tzname(datetime_obj)

    return day_of_week, year, month, day, hour, minutes, sec, timezone

df[['Weekday', 'Year', 'Month', 'Day', 'Hour', 'Minutes', 'Sec', 'Timezone']] = df['Page content'].apply(extract_timedata).apply(pd.Series)

In [9]:
print(df.head(5))

   Id  Popularity                                       Page content  \
0   0          -1  <html><head><div class="article-info"> <span c...   
1   1           1  <html><head><div class="article-info"><span cl...   
2   2           1  <html><head><div class="article-info"><span cl...   
3   3          -1  <html><head><div class="article-info"><span cl...   
4   4          -1  <html><head><div class="article-info"><span cl...   

   img_count  link_count                                              title  \
0          1          22  NASA's Grand Challenge: Stop Asteroids From De...   
1          2          18  Google's New Open Source Patent Pledge: We Won...   
2          2          11  Ballin': 2014 NFL Draft Picks Get to Choose Th...   
3          1          13        Cameraperson Fails Deliver Slapstick Laughs   
4         52          16  NFL Star Helps Young Fan Prove Friendship With...   

                                          categories  categories_count  \
0  [asteroid, aste

In [10]:
def extract_author_name(html):
    pattern = r'<a href="/author/([^/]+)/">'
    match = re.search(pattern, html)
    if match:
        return match.group(1)
    else:
        text = BeautifulSoup(html, 'html.parser')
        # find the tag with "byline basic" class
        tag = text.find(class_='byline basic')
        # extract the text from the tag
        if tag:
            # use strip() to remove leading and trailing spaces
            tag = tag.get_text().strip()
            # remove the "By " or "by " substring
            tag = re.sub(r'^By ', '', tag)
            tag = re.sub(r'^by ', '', tag)
            # if len(tag)>=2, use only the first two words
            if len(tag.split()) >= 2:
                tag = ' '.join(tag.split()[:2])
                # for the second word, remove the trailing comma and number
                tag = re.sub(r',\s*\d+$', '', tag)
                # remove content that has number
                tag = re.sub(r'\d+', '', tag)
                # remove the trailing --
                tag = re.sub(r'--$', '', tag)
            return tag
        else:
            return 'Unknown'
df['author_name'] = df['Page content'].apply(extract_author_name)

In [11]:
print(df.head(5))

   Id  Popularity                                       Page content  \
0   0          -1  <html><head><div class="article-info"> <span c...   
1   1           1  <html><head><div class="article-info"><span cl...   
2   2           1  <html><head><div class="article-info"><span cl...   
3   3          -1  <html><head><div class="article-info"><span cl...   
4   4          -1  <html><head><div class="article-info"><span cl...   

   img_count  link_count                                              title  \
0          1          22  NASA's Grand Challenge: Stop Asteroids From De...   
1          2          18  Google's New Open Source Patent Pledge: We Won...   
2          2          11  Ballin': 2014 NFL Draft Picks Get to Choose Th...   
3          1          13        Cameraperson Fails Deliver Slapstick Laughs   
4         52          16  NFL Star Helps Young Fan Prove Friendship With...   

                                          categories  categories_count  \
0  [asteroid, aste

In [None]:
# maybe need it, not sure
def preprocessor(text):
    # remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # regex for matching emoticons, keep emoticons, ex: :), :-P, :-D
    r = '(?::|;|=|X)(?:-)?(?:\)|\(|D|P)'
    emoticons = re.findall(r, text)
    text = re.sub(r, '', text)

    # convert to lowercase and append all emoticons behind (with space in between)
    # replace('-','') removes nose of emoticons
    text = re.sub('[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-','')
    return text

In [12]:
nltk.download('stopwords')
stop = stopwords.words('english')

def tokenizer_stem_nostop(text):
    porter = PorterStemmer()
    return [porter.stem(w) for w in re.split('\s+', text.strip()) \
            if w not in stop and re.match('[a-zA-Z]+', w)]

df['title'] = df['title'].apply(tokenizer_stem_nostop)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mygodimatomato/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
import string

def remove_punctuations(lst):
    punctuations = set(string.punctuation)
    new_list = []

    for item in lst:
        new_item = ''.join([char for char in item if char not in punctuations])
        new_list.append(new_item)

    return new_list

df['title'] = df['title'].apply(remove_punctuations)

In [14]:
df['title_len'] = df['title'].apply(len)

In [15]:
print(df.head(5))

   Id  Popularity                                       Page content  \
0   0          -1  <html><head><div class="article-info"> <span c...   
1   1           1  <html><head><div class="article-info"><span cl...   
2   2           1  <html><head><div class="article-info"><span cl...   
3   3          -1  <html><head><div class="article-info"><span cl...   
4   4          -1  <html><head><div class="article-info"><span cl...   

   img_count  link_count                                              title  \
0          1          22  [nasa, grand, challenge, stop, asteroid, from,...   
1          2          18  [google, new, open, sourc, patent, pledge, we,...   
2          2          11  [ballin, nfl, draft, pick, get, choos, their, ...   
3          1          13      [cameraperson, fail, deliv, slapstick, laugh]   
4         52          16  [nfl, star, help, young, fan, prove, friendshi...   

                                          categories  categories_count  \
0  [asteroid, aste

In [16]:
df.to_csv('../datasets_processed/train_processed_2.csv', index=False)

In [18]:
df = pd.read_csv('../datasets/test.csv')
df[['img_count', 'link_count', 'title', 'categories', 'categories_count', 'channel']] = df['Page content'].apply(attribute_attract).apply(pd.Series)
df[['Weekday', 'Year', 'Month', 'Day', 'Hour', 'Minutes', 'Sec', 'Timezone']] = df['Page content'].apply(extract_timedata).apply(pd.Series)
df['author_name'] = df['Page content'].apply(extract_author_name)
df['title'] = df['title'].apply(tokenizer_stem_nostop)
df['title'] = df['title'].apply(remove_punctuations)
df['title_len'] = df['title'].apply(len)
df.to_csv('../datasets_processed/test_processed_2.csv', index=False)

print(df.head(5))

      Id                                       Page content  img_count  \
0  27643  <html><head><div class="article-info"><span cl...          1   
1  27644  <html><head><div class="article-info"><span cl...          3   
2  27645  <html><head><div class="article-info"><span cl...          2   
3  27646  <html><head><div class="article-info"><span cl...          1   
4  27647  <html><head><div class="article-info"><span cl...          1   

   link_count                                              title  \
0          30  [soccer, star, get, twitter, death, threat, af...   
1          13              [googl, glass, get, accessori, store]   
2          13   [ouya, game, consol, alreadi, sold, out, amazon]   
3          15                  [two, ferns, mock, oscar, nomine]   
4          10  [sniper, trailer, look, like, eastwood, may, b...   

                                          categories  categories_count  \
0  [soccer, twitter, one-direction, entertainment...                 8  