In [1]:
import os
import re

import pandas as pd
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import wordnet, WordNetLemmatizer
from tqdm import tqdm_notebook as tqdm

## READ DATASET AND CREATE TEXT AND TARGET

In [2]:
# Read the line-delimited JSON dump and discard the metadata columns in one
# chain, leaving the category label and the text fields.
dataset = (
    pd.read_json('News_Category_Dataset_v2.json', lines=True)
    .drop(columns=['date', 'link', 'authors'])
)
dataset.head(10)

Unnamed: 0,category,headline,short_description
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,She left her husband. He killed their children...
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Of course it has a song.
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,The actor and his longtime girlfriend Anna Ebe...
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,The actor gives Dems an ass-kicking for not fi...
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,"The ""Dietland"" actress said using the bags is ..."
5,ENTERTAINMENT,Morgan Freeman 'Devastated' That Sexual Harass...,"""It is not right to equate horrific incidents ..."
6,ENTERTAINMENT,Donald Trump Is Lovin' New McDonald's Jingle I...,"It's catchy, all right."
7,ENTERTAINMENT,What To Watch On Amazon Prime That’s New This ...,There's a great mini-series joining this week.
8,ENTERTAINMENT,Mike Myers Reveals He'd 'Like To' Do A Fourth ...,"Myer's kids may be pushing for a new ""Powers"" ..."
9,ENTERTAINMENT,What To Watch On Hulu That’s New This Week,You're getting a recent Academy Award-winning ...


In [3]:
# Label list; "THE WORLDPOST" is folded into "WORLDPOST" so both spellings share one class.
target = [
    "WORLDPOST" if label == "THE WORLDPOST" else label
    for label in dataset['category'].tolist()
]

In [4]:
# Peek at the first four labels.
target[:4]

['CRIME', 'ENTERTAINMENT', 'ENTERTAINMENT', 'ENTERTAINMENT']

In [5]:
# One document per row: headline and short description joined by a single space.
texts = dataset['headline'].str.cat(dataset['short_description'], sep=' ').tolist()

In [6]:
# Peek at the first four combined documents.
texts[:4]

['There Were 2 Mass Shootings In Texas Last Week, But Only 1 On TV She left her husband. He killed their children. Just another day in America.',
 "Will Smith Joins Diplo And Nicky Jam For The 2018 World Cup's Official Song Of course it has a song.",
 'Hugh Grant Marries For The First Time At Age 57 The actor and his longtime girlfriend Anna Eberstein tied the knot in a civil ceremony.',
 "Jim Carrey Blasts 'Castrato' Adam Schiff And Democrats In New Artwork The actor gives Dems an ass-kicking for not fighting hard enough against Donald Trump."]

## TEXT CLEANING

In [7]:
# Matches runs of non-word characters and underscores, i.e. everything that is
# not a letter or digit. Written as a raw string: in a plain literal '\W' is an
# invalid escape sequence (DeprecationWarning, SyntaxWarning on Python 3.12+).
patternAlphaNum = re.compile(r'[\W_]+')
# NLTK's English stop-word list; serves as the default `stops` argument of clean_text.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded beforehand — confirm.
stops = stopwords.words('english')
# One shared WordNet lemmatizer instance, reused by clean_text for every word.
wn = WordNetLemmatizer()

def get_wordnet_pos(treebank_tag):
    """Translate a Treebank-style POS tag into a WordNet POS code.

    The leading letter of the tag selects the code:
    'J' -> 'a', 'V' -> 'v', 'N' -> 'n', 'R' -> 'r'.
    Any other tag falls back to 'n'.
    """
    prefix_to_wordnet = (('J', 'a'), ('V', 'v'), ('N', 'n'), ('R', 'r'))
    for prefix, wordnet_code in prefix_to_wordnet:
        if treebank_tag.startswith(prefix):
            return wordnet_code
    # Unrecognised tags get the same code as nouns.
    return 'n'

def clean_text(text, pattern=patternAlphaNum, stops=stops):
    """Normalise one document into a space-separated string of lemmas.

    Steps: split on whitespace and lowercase each token, delete every match
    of `pattern` inside the token, drop empty tokens and stop words, then
    POS-tag the remaining tokens and lemmatize each one with the WordNet POS
    derived from its tag.

    Parameters
    ----------
    text : str
        Raw document text.
    pattern : compiled regex, optional
        Every match is removed from each token.
    stops : iterable of str, optional
        Words to discard; compared against the token *after* `pattern`
        has been applied to it.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces ('' if none remain).
    """
    # Hash-based lookup: the default stop list is a plain list, and scanning
    # it linearly for every token is needlessly slow over a large corpus.
    stop_lookup = frozenset(stops)

    kept_words = []
    for raw_word in text.split():
        word = pattern.sub('', raw_word.lower())
        if word and word not in stop_lookup:
            kept_words.append(word)

    # Nothing survived the filtering, so there is nothing to tag or lemmatize.
    if not kept_words:
        return ''

    # The loop variable is named `tag` so it does not shadow the imported
    # `pos_tag` tagger function.
    tagged_words = pos_tag(kept_words)
    return ' '.join(
        wn.lemmatize(word, pos=get_wordnet_pos(tag))
        for word, (_, tag) in zip(kept_words, tagged_words)
    )

In [8]:
# Clean every document in order; tqdm wraps the input to show a progress bar.
texts_cleaned = list(map(clean_text, tqdm(texts)))

HBox(children=(IntProgress(value=0, max=200853), HTML(value='')))




In [18]:
# Join the cleaned documents into one newline-separated string (one document per line).
all_cleaned_text = '\n'.join(texts_cleaned)

# Make sure the output folder exists; on a fresh checkout open() would
# otherwise raise FileNotFoundError.
os.makedirs('cleaned_text', exist_ok=True)

# Write UTF-8 bytes in binary mode so the '\n' separators are stored as-is.
# NOTE(review): no trailing newline is written, so if the last document
# cleaned to an empty string it would be lost when the file is read back
# line by line — confirm this cannot happen for this corpus.
with open('cleaned_text/cleaned_text.txt', 'wb') as f:
    f.write(all_cleaned_text.encode('utf-8'))

## FEATURE ENGINEERING

In [23]:
# Reload the cleaned corpus: each line is decoded as UTF-8 and its newline removed.
with open('cleaned_text/cleaned_text.txt', 'rb') as f:
    all_cleaned_text = [raw_line.decode('utf-8').strip('\n') for raw_line in f]