In [5]:
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

import pandas as pd
import re 

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.model_selection import train_test_split

from bs4 import BeautifulSoup
from datetime import datetime

In [6]:
df = pd.read_csv('../datasets/train.csv')
print(df.head(5))

   Id  Popularity                                       Page content
0   0          -1  <html><head><div class="article-info"> <span c...
1   1           1  <html><head><div class="article-info"><span cl...
2   2           1  <html><head><div class="article-info"><span cl...
3   3          -1  <html><head><div class="article-info"><span cl...
4   4          -1  <html><head><div class="article-info"><span cl...


In [7]:
def author_preprocess(text):
    text = re.sub('By', '', text)
    text = re.sub('by', '', text)
    text = re.sub(',', ' ,', text)
    text = re.sub(' and ', ' , ', text)
    text = re.sub('&', ',', text)
    return text

In [8]:
def attribute_attract(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # extract images
    images = soup.find_all('img')
    # extract links
    links = soup.find_all('a')
    
    # extract title
    h1_tag = soup.find('h1', {'class': 'title'})
    title = ""
    if h1_tag is not None:
        title = h1_tag.text
    title_bit = len(title)
    words = title.split()
    title_word_count = len(words)
    title_bit_count = title_bit - title_word_count + 1

    # extract content
    content = soup.find('section', {'class':'article-content'}).text
    content_len = len(content)
    content_words = len(content.split())
    content = re.sub('topics: ', '', content.lower())
    content = re.sub(',', ' ,', content)

    # extract categories 
    categories = []

        # Find the <a> tags
    for a_tag in soup.find_all('a', href=True):
        # Extract the href attribute
        href_value = a_tag['href']
        
        # Use regex to extract the category
        match = re.search(r'category/(.*)/', href_value)
        
        if match:
            categories.append(match.group(1))
    
    # extract channel
    channel_tag = soup.find('meta', {'property': 'article:section'})
    article_tag = soup.find('article', {'data-channel': True})
    channel = article_tag['data-channel']

    article_info = soup.head.find('div', {'class': 'article-info'})
    author = article_info.find('span', {'class': 'author_name'})
    if author != None:
        author = author.get_text()
    elif article_info.span != None:
        author = article_info.span.string
    else:
        author = article_info.a.string
    author = author_preprocess(author)

    # time data
    time_tag = soup.find('time', datetime=True)
    if time_tag is None:
        return -1
    datetime_str = time_tag['datetime']
    datetime_obj = datetime.strptime(datetime_str, "%a, %d %b %Y %H:%M:%S %z")

    day_of_week = datetime_obj.weekday()
    year = datetime_obj.year
    month = datetime_obj.month
    day = datetime_obj.day
    hour = datetime_obj.hour
    minutes = datetime_obj.minute
    sec = datetime_obj.second
    timezone = datetime_obj.tzinfo.tzname(datetime_obj)

    return len(images), len(links), title, title_word_count, title_bit_count, content, content_len, content_words, categories, len(categories), channel, author, day_of_week, year, month, day, hour, minutes, sec, timezone

df[['img_count', 'link_count', 'title', 'title_word_count', 'title_bit_count', 'content', 'content_len', 'content_word_count', 'categories', 'categories_count', 'channel', 'author', 'Weekday', 'Year', 'Month', 'Day', 'Hour', 'Minutes', 'Sec', 'Timezone']] = df['Page content'].apply(attribute_attract).apply(pd.Series)
print(df.head(5))

   Id  Popularity                                       Page content  \
0   0          -1  <html><head><div class="article-info"> <span c...   
1   1           1  <html><head><div class="article-info"><span cl...   
2   2           1  <html><head><div class="article-info"><span cl...   
3   3          -1  <html><head><div class="article-info"><span cl...   
4   4          -1  <html><head><div class="article-info"><span cl...   

   img_count  link_count                                              title  \
0          1          22  NASA's Grand Challenge: Stop Asteroids From De...   
1          2          18  Google's New Open Source Patent Pledge: We Won...   
2          2          11  Ballin': 2014 NFL Draft Picks Get to Choose Th...   
3          1          13        Cameraperson Fails Deliver Slapstick Laughs   
4         52          16  NFL Star Helps Young Fan Prove Friendship With...   

   title_word_count  title_bit_count  \
0                 8               53   
1           

In [9]:
# maybe need it, not sure
def preprocessor(text):
    # remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # regex for matching emoticons, keep emoticons, ex: :), :-P, :-D
    r = '(?::|;|=|X)(?:-)?(?:\)|\(|D|P)'
    emoticons = re.findall(r, text)
    text = re.sub(r, '', text)

    # convert to lowercase and append all emoticons behind (with space in between)
    # replace('-','') removes nose of emoticons
    text = re.sub('[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-','')
    return text

In [10]:
nltk.download('stopwords')
stop = stopwords.words('english')

def tokenizer_stem_nostop(text):
    porter = PorterStemmer()
    return [porter.stem(w) for w in re.split('\s+', text.strip()) \
            if w not in stop and re.match('[a-zA-Z]+', w)]

df['title'] = df['title'].apply(tokenizer_stem_nostop)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mygodimatomato/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
import string

def remove_punctuations(lst):
    punctuations = set(string.punctuation)
    new_list = []

    for item in lst:
        new_item = ''.join([char for char in item if char not in punctuations])
        new_list.append(new_item)

    return new_list

df['title'] = df['title'].apply(remove_punctuations)

In [12]:
print(df.head(5))

   Id  Popularity                                       Page content  \
0   0          -1  <html><head><div class="article-info"> <span c...   
1   1           1  <html><head><div class="article-info"><span cl...   
2   2           1  <html><head><div class="article-info"><span cl...   
3   3          -1  <html><head><div class="article-info"><span cl...   
4   4          -1  <html><head><div class="article-info"><span cl...   

   img_count  link_count                                              title  \
0          1          22  [nasa, grand, challenge, stop, asteroid, from,...   
1          2          18  [google, new, open, sourc, patent, pledge, we,...   
2          2          11  [ballin, nfl, draft, pick, get, choos, their, ...   
3          1          13      [cameraperson, fail, deliv, slapstick, laugh]   
4         52          16  [nfl, star, help, young, fan, prove, friendshi...   

   title_word_count  title_bit_count  \
0                 8               53   
1           

In [13]:
df.to_csv('../datasets_processed/train_processed_2.csv', index=False)

In [14]:
df = pd.read_csv('../datasets/test.csv')
df[['img_count', 'link_count', 'title', 'title_word_count', 'title_bit_count', 'content', 'content_len', 'content_word_count', 'categories', 'categories_count', 'channel', 'author', 'Weekday', 'Year', 'Month', 'Day', 'Hour', 'Minutes', 'Sec', 'Timezone']] = df['Page content'].apply(attribute_attract).apply(pd.Series)
# df['title'] = df['title'].apply(tokenizer_stem_nostop)
# df['title'] = df['title'].apply(remove_punctuations)
df.to_csv('../datasets_processed/test_processed_2.csv', index=False)

print(df.head(5))

      Id                                       Page content  img_count  \
0  27643  <html><head><div class="article-info"><span cl...        1.0   
1  27644  <html><head><div class="article-info"><span cl...        3.0   
2  27645  <html><head><div class="article-info"><span cl...        2.0   
3  27646  <html><head><div class="article-info"><span cl...        1.0   
4  27647  <html><head><div class="article-info"><span cl...        1.0   

   link_count                                              title  \
0        30.0  Soccer Star Gets Twitter Death Threats After T...   
1        13.0               Google Glass Gets an Accessory Store   
2        13.0     OUYA Gaming Console Already Sold Out on Amazon   
3        15.0           'Between Two Ferns' Mocks Oscar Nominees   
4        10.0  'American Sniper' Trailer: Looks Like Eastwood...   

   title_word_count  title_bit_count  \
0              11.0             64.0   
1               6.0             31.0   
2               8.0       