In [3]:
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

import pandas as pd
import re 

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.model_selection import train_test_split

from bs4 import BeautifulSoup

In [4]:
# Load the raw training set and preview its first rows
# (columns shown in the output: Id, Popularity, Page content).
df = pd.read_csv('../datasets/train.csv')
print(df.head())

   Id  Popularity                                       Page content
0   0          -1  <html><head><div class="article-info"> <span c...
1   1           1  <html><head><div class="article-info"><span cl...
2   2           1  <html><head><div class="article-info"><span cl...
3   3          -1  <html><head><div class="article-info"><span cl...
4   4          -1  <html><head><div class="article-info"><span cl...


In [5]:
def preprocessor(text):
    """Turn one raw HTML page into lowercase, space-separated text plus emoticons.

    Steps:
      1. Strip the HTML markup with BeautifulSoup and keep only the text.
      2. Collect every emoticon (e.g. ``:)``, ``:-P``, ``=D``) and remove it
         from the text.
      3. Lowercase the remaining text and collapse every run of non-word
         characters into a single space.
      4. Append a space followed by the collected emoticons, space-separated
         and with the ``-`` "nose" removed.

    Parameters
    ----------
    text : str
        HTML source of a page.

    Returns
    -------
    str
        The cleaned text. A separating space is always appended, so the
        result ends with a space when no emoticon was found.
    """
    # remove HTML tags
    # NOTE(review): get_text() is called without a separator, which appears to
    # be why the processed output contains glued tokens such as "warren2013";
    # get_text(separator=' ') would avoid that but changes the produced
    # features, so it is left as-is here.
    text = BeautifulSoup(text, 'html.parser').get_text()

    # regex for matching emoticons, keep emoticons, ex: :), :-P, :-D
    # Raw string: '\)' and '\(' are not valid Python string escapes and trigger
    # a SyntaxWarning on Python 3.12+ when written in a normal string literal.
    # NOTE(review): the pattern is unanchored, so letter pairs like "XD"/"XP"
    # inside ordinary words are also treated as emoticons.
    emoticon_pattern = r'(?::|;|=|X)(?:-)?(?:\)|\(|D|P)'
    emoticons = re.findall(emoticon_pattern, text)
    text = re.sub(emoticon_pattern, '', text)

    # convert to lowercase and append all emoticons behind (with space in between)
    # replace('-','') removes nose of emoticons
    words = re.sub(r'[\W]+', ' ', text.lower())
    return words + ' ' + ' '.join(emoticons).replace('-', '')

In [6]:
# Replace each page's raw HTML with its cleaned text, then preview the result.
df['Page content'] = df['Page content'].apply(preprocessor)
print(df.head())

   Id  Popularity                                       Page content
0   0          -1   clara moskowitz for space com 2013 06 19 15 0...
1   1           1  by christina warren2013 03 28 17 40 55 utcgoog...
2   2           1  by sam laird2014 05 07 19 15 20 utcballin 2014...
3   3          -1  by sam laird2013 10 11 02 26 50 utccameraperso...
4   4          -1  by connor finnegan2014 04 17 03 31 43 utcnfl s...


In [7]:
# Make sure the NLTK stop-word corpus is available locally before loading it
# (the log below reports "already up-to-date" when nothing needs fetching).
nltk.download('stopwords')
# English stop words; read by tokenizer_stem_nostop to filter tokens.
stop = stopwords.words('english')

def tokenizer_stem_nostop(text):
    """Split ``text`` on whitespace, drop stop words, and Porter-stem the rest.

    A token is kept only if it is not in the module-level ``stop`` collection
    and it *starts with* an ASCII letter (``re.match`` anchors at the start
    only, so tokens such as ``"warren2013"`` are kept while ``"2013"`` and
    emoticons are dropped).

    Parameters
    ----------
    text : str
        Whitespace-separated text, e.g. the output of ``preprocessor``.

    Returns
    -------
    list of str
        The stemmed tokens, in their original order.
    """
    porter = PorterStemmer()
    # Copy the stop words into a set once per call so each membership test is
    # O(1) instead of a linear scan over the whole collection for every token.
    stop_words = set(stop)
    # Raw string: '\s' is not a valid Python string escape (SyntaxWarning on 3.12+).
    return [porter.stem(w) for w in re.split(r'\s+', text.strip())
            if w not in stop_words and re.match('[a-zA-Z]+', w)]

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mygodimatomato/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
# Replace each cleaned text with its list of stemmed, stop-word-free tokens,
# then preview the result.
df['Page content'] = df['Page content'].apply(tokenizer_stem_nostop)
print(df.head())

   Id  Popularity                                       Page content
0   0          -1  [clara, moskowitz, space, com, utc, nasa, gran...
1   1           1  [christina, warren2013, utcgoogl, new, open, s...
2   2           1  [sam, laird2014, utcballin, nfl, draft, pick, ...
3   3          -1  [sam, laird2013, utccameraperson, fail, deliv,...
4   4          -1  [connor, finnegan2014, utcnfl, star, help, you...


In [9]:
# Recode the label -1 as 0 in the Popularity column.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df['Popularity']: an in-place method on a
# column selection is chained assignment, which raises a FutureWarning on
# pandas 2.2 and leaves df unchanged once Copy-on-Write is enabled.
df['Popularity'] = df['Popularity'].replace(-1, 0)
print(df.head(5))

   Id  Popularity                                       Page content
0   0           0  [clara, moskowitz, space, com, utc, nasa, gran...
1   1           1  [christina, warren2013, utcgoogl, new, open, s...
2   2           1  [sam, laird2014, utcballin, nfl, draft, pick, ...
3   3           0  [sam, laird2013, utccameraperson, fail, deliv,...
4   4           0  [connor, finnegan2014, utcnfl, star, help, you...


In [10]:
# Persist the processed training frame; index=False keeps the row index out
# of the file.
df.to_csv(
    '../datasets_processed/train_processed.csv',
    index=False,
)

In [11]:
# Run the test set through the same two steps applied to the training set
# (preprocessor, then tokenizer_stem_nostop) and save the result.
# Note: this rebinds `df`, so the training frame is no longer reachable by
# that name after this cell.
df = pd.read_csv('../datasets/test.csv')
df['Page content'] = (
    df['Page content']
    .apply(preprocessor)
    .apply(tokenizer_stem_nostop)
)
df.to_csv('../datasets_processed/test_processed.csv', index=False)