# Feature Engineering
Here, in this notebook, we will 

1. Create our final labels
2. Do some basic feature engineering 

In [1]:
import csv
import os
import pandas as pd
import re
import sys

from collections import Counter
from loguru import logger
from pathlib import Path
from time import time, strftime, gmtime

In [2]:
# Configuring the logger
config = {"handlers": [{"sink": sys.stdout,"colorize": True,
          "format": "<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>"}]}
logger.configure(**config)

[1]

In [3]:
# Data Folder
data_folder = Path.home() / 'Data' / 'cc_news'

## 1. Getting our keywords from each document 

In [4]:

start_time = time()
with open(data_folder / 'spark_output' / 'df_final.csv', 'r') as csv_file:
    reader = csv.DictReader(csv_file)
    keyword_counter = Counter()
    for i, row in enumerate(reader):
        keywords = row['keywords']
        if keywords != "" or keywords is not None:
            try:
                split_keywords = re.split('[,;-]', keywords)
            except TypeError:
                pass
            for k in split_keywords:
                if k != '' and len(k)>2:
                    keyword_counter.update({k.strip().lower(): 1})
elapsed = strftime("%H:%M:%S", gmtime(time() - start_time))
logger.info(f'It took {elapsed} to run this script!')
os.system("play /usr/share/sounds/sound-icons/trumpet-1.wav")

[32m2020-01-12 22:47:55[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m16[0m - [1mIt took 00:00:05 to run this script![0m


0

In [5]:
# Manually Create a short list of main topic keywords that we will use as a lookup
main_topics = {("government / politics", "politics", "government", "us government news", "government news"): "politics",
               ("economics", "economy", "trade", "business", "company news","finance", "market reports",
                "stock", "stocks" "company", "companies", "tech", "technology",
                "mergers / acquisitions / takeovers", "corporate events", "equity markets",
                "international trade","equities markets", "markets", "market", "emerging market countries"): "business_economy",
               ("world","world news", "europe", "western europe","asia", "asia / pacific", 
                "africa","west africa","east africa", "southern africa", "european union"
                "latin america", "americas", "mid east", "middle east"): "world",
                ("entertainment","entertainment news", "celebrity", "gossip", "music"): "entertainment",
                ('sport', 'sports', 'league', 'game', 'soccer', 'basketball', 'football', 'games', 'score', 'scores', "nfl", "premier league"): "sports",
              }

In [6]:
# Here we are going to get the most comment key words that co-exists with the main ones above.
# We will throw them into a counter based in the key  
# hard label each document if at least 2 of the keywords exist in the list above
start_time = time()
with open(data_folder / 'spark_output' / 'df_final.csv', 'r') as csv_file:
    reader = csv.DictReader(csv_file)
    counters = {"politics" : Counter(), 
                "world": Counter(),
                "business_economy":Counter(),
                'sports':Counter(),
                'entertainment':Counter(),
                'other': Counter()
               }
    
    exclude = ["news", 'local', "general news", "united states", "travel",
              "breaking news", "breaking", "major events", "major news", 
               "current events", "latest news", "current_news", "picture available", "pictures", "picture"]
    main_topic_kw = [k for key in main_topics.keys() for k in key] + exclude
    for row in reader:
        keywords = row['keywords']
        if keywords != "" or keywords is not None:
            try:
                split_keywords = re.split('[,;-]', keywords)
                split_keywords = [s.strip().lower() for s in split_keywords]
                for k, v in main_topics.items():
                    if len(set(split_keywords).intersection(set(k))) > 1:
                        for kw in split_keywords:
                            if kw != '' and len(kw)>2 and kw not in main_topic_kw:
                                counters[v].update({kw: 1})
                    else:
                        for kw in split_keywords:
                            if kw != '' and len(kw)>2 and kw not in main_topic_kw:
                                counters['other'].update({kw: 1})

            except TypeError:
                pass
    
elapsed = strftime("%H:%M:%S", gmtime(time() - start_time))
logger.info(f'It took {elapsed} to run this script!')
os.system("play /usr/share/sounds/sound-icons/trumpet-1.wav") # Beeps an alert. May not run on your machine

[32m2020-01-12 22:50:42[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m37[0m - [1mIt took 00:00:10 to run this script![0m


0

In [8]:
# Get print out of the counters
for k, counter in counters.items():
    print(k, counters[k].most_common(30), "\n")

politics [('usa', 209), ('diplomacy / foreign policy', 145), ('us house of representatives', 137), ('international / national security', 114), ('trump', 113), ('judicial process / court cases / court decisions', 113), ('impeachment', 107), ('donald trump', 104), ('us senate', 103), ('ukraine', 102), ('lawmaking', 101), ('crime / law / justice', 100), ('crime', 92), ('russia', 90), ('elections / voting', 64), ('presidential elections', 61), ('exclusive', 46), ('video', 45), ('nancy pelosi', 42), ('graphics', 41), ('stadium', 40), ('european union', 39), ('match', 38), ('biofuels', 36), ('region', 34), ('message', 32), ('health / medicine', 31), ('oil and gas (trbc)', 30), ('conflicts / war / peace', 30), ('joe biden', 30)] 

world [('united kingdom', 256), ('france', 162), ('european union', 152), ('china (prc)', 151), ('diplomacy / foreign policy', 151), ('germany', 145), ('financials (legacy)', 113), ('japan', 110), ('euro zone', 108), ('financials (trbc)', 104), ('italy', 104), ('jud

Seems legit!

## 2. Labeling our documents

In [9]:
# Switching to pandas: Reading our data
df_final = pd.read_csv(data_folder / 'spark_output' / 'df_final.csv')

In [12]:
# This function will create a hard label for each document if at least 2 keywords exist in the keyword list
def label_records_via_keywords(keywords):
    if keywords != "" or keywords is not None:
        try:
            split_keywords = re.split('[,;-]', keywords)
            split_keywords = [s.strip().lower() for s in split_keywords]
            for k, v in main_topics.items():
                if len(set(split_keywords).intersection(set(k))) > 1:
                    return v
        except:
            return None

In [13]:
df_final.head()

Unnamed: 0,url,dom_cat,title,description,keywords,news_keywords,text
0,https://chicago.cbslocal.com/2019/12/18/man-st...,sports,Man Stabbed After His Granddaughter And Three ...,A man's granddaughter is one of four teens acc...,"grandfather, granddaughter, teen, teens, home ...","grandfather, granddaughter, teen, teens, home ...",Man Stabbed After His Granddaughter And Three ...
1,http://www.releasewire.com/press-releases/rele...,news,Commence Industrial CRM Achieves Sales Optimiz...,,,,Commence Industrial CRM Achieves Sales Optimiz...
2,https://www.kark.com/news/national-news/1-dead...,news,"1 dead, 2 injured in Oregon shopping center st...",Multiple people were stabbed at the Murray Hil...,,,"1 dead, 2 injured in Oregon shopping center st..."
3,https://www.ii.co.uk/etfs/sg-wti-x2-daily-long...,financial_news,LSE:SG30 ETF Share Price | SG WTI X2 Daily Lon...,Real-time share price updates and latest news ...,,,LSE:SG30 ETF Share Price | SG WTI X2 Daily Lon...
4,http://www.peicanada.com/island_deaths/frances...,news,Frances Salsman | Island Deaths | peicanada.com,"SALSMAN, Frances (Fran) Townsend At the Prince...","island_deaths, death_notices","island_deaths, death_notices",Frances Salsman | Island Deaths | peicanada.co...


In [14]:
# Apply Our Function to each row
df_final['kw_label'] = df_final.apply(lambda row: label_records_via_keywords(row['keywords']), axis=1)

In [15]:
df_final[~df_final.kw_label.isna()]

Unnamed: 0,url,dom_cat,title,description,keywords,news_keywords,text,kw_label
64,https://www.nwitimes.com/news/local/govt-and-p...,news,Visclosky set to vote today for both impeachme...,U.S. Rep. Pete Visclosky will join the Democra...,", indiana, national-government, local-governme...",", indiana, national-government, local-governme...",Visclosky set to vote today for both impeachme...,politics
199,https://www.news4jax.com/news/politics/2019/12...,news,Sports betting's rapid expansion faces more te...,The line for the ticket windows at a Rhode Isl...,"Sports,politics,Government",,Sports betting's rapid expansion faces more te...,politics
302,https://www.reuters.com/article/us-basketball-...,news,\n Suns' Ayton begins second ac...,Phoenix Suns big man Deandre Ayton is again in...,"US,BASKETBALL,NBA,PHX,AYTON,Bahamas,Americas,S...",US;BASKETBALL;NBA;PHX;AYTON;Bahamas;Americas;S...,Suns' Ayton begins second act after suspension...,sports
309,https://www.reuters.com/article/us-usa-biofuel...,news,\n White House says it is stick...,The Trump administration plans to stick with i...,"US,USA,BIOFUELS,EXCLUSIVE,Biofuels,Advocacy Gr...",US;USA;BIOFUELS;EXCLUSIVE;Biofuels;Advocacy Gr...,White House says it is sticking with 2020 biof...,politics
342,https://coed.com/2019/11/26/dallas-cowboys-hea...,entertainment,Dallas Cowboys Head Coaching Odds 2019: Who Wi...,Who will be the Dallas Cowboys head coach next...,"Sports,Sports,Betting Odds,dallas-cowboys,NFL","Sports,Sports,Betting Odds,dallas-cowboys,NFL",Dallas Cowboys Head Coaching Odds 2019: Who Wi...,sports
...,...,...,...,...,...,...,...,...
163132,https://www.redandblack.com/sports/georgia-foo...,news,Georgia football signs 5-star outside lineback...,Mekhail Sherman signed his letter of intent to...,"mekhail sherman, linebacker, georgia, sport, b...","mekhail sherman, linebacker, georgia, sport, b...",Georgia football signs 5-star outside lineback...,sports
163221,https://6abc.com/sports/eagles-ready-to-take-o...,news,"Philadelphia Eagles ready to take on Cowboys, ...","""Embarrassing!"" That's the word Carson Wentz, ...","Philadelphia eagles, eagles, nfl, football, Da...",,"Philadelphia Eagles ready to take on Cowboys, ...",sports
163319,https://madison.com/opinion/letters/coach-gard...,news,Coach Gard must right sinking ship -- Hal Wiss...,What’s going on with the Wisconsin men’s baske...,", letter to editor, commentary, greg gard, bas...",", letter to editor, commentary, greg gard, bas...",Coach Gard must right sinking ship -- Hal Wiss...,sports
163328,https://qctimes.com/sports/high-school/basketb...,news,Central DeWitt remains No. 1 in Area Fab 5 | H...,AREA FAB 5,", northeast, central dewitt, game, sport, albu...",", northeast, central dewitt, game, sport, albu...",Central DeWitt remains No. 1 in Area Fab 5 | H...,sports


In [16]:
# Sampling for inspection
df_final[df_final.kw_label=='sports'].sample(10)

Unnamed: 0,url,dom_cat,title,description,keywords,news_keywords,text,kw_label
73277,https://trib.com/sports/college/wyoming/footba...,financial_news,2020 Wyoming signee: Braden Siders | Football ...,Wyoming football got a signed National Letter ...,", college-sports, wyoming cowboys football, re...",", college-sports, wyoming cowboys football, re...",2020 Wyoming signee: Braden Siders | Football ...,sports
107446,https://www.reuters.com/article/us-basketball-...,news,\n Knicks hire Blatt in advisor...,"David Blatt, who led the Cleveland Cavaliers t...","US,BASKETBALL,NBA,NYK,BLATT,Sport,Basketball,N...",US;BASKETBALL;NBA;NYK;BLATT;Sport;Basketball;N...,Knicks hire Blatt in advisory role - Reuters F...,sports
17083,https://www.union-bulletin.com/seattle_times/k...,sports,"Koprivica, Olejniczak power Florida State past...","TALLAHASSEE, Fla. (AP) — Balsa Koprivica score...","florida, koprivica, dominik olejniczak, sport,...","florida, koprivica, dominik olejniczak, sport,...","Koprivica, Olejniczak power Florida State past...",sports
73687,https://cumberlink.com/sports/high-school/bask...,news,HS Girls Basketball Highlights: Cedar Cliff dr...,Cedar Cliff dropped Lower Dauphin 44-39 in OT ...,", prep-sports, cliff, dauphin, sport, trey, ot...",", prep-sports, cliff, dauphin, sport, trey, ot...",HS Girls Basketball Highlights: Cedar Cliff dr...,sports
121709,https://madison.com/sports/baseball/profession...,news,Eric Sogard agrees to 1-year deal with Brewers...,"Sogard, a utility player who played for Milwau...",", pro-sports, pro-baseball, mlb, pro baseball,...",", pro-sports, pro-baseball, mlb, pro baseball,...",Eric Sogard agrees to 1-year deal with Brewers...,sports
129879,https://www.union-bulletin.com/seattle_times/w...,sports,Washington men face early struggle before pull...,"In each of their past two meetings, Seattle Un...","isaiah stewart, seattle university, quade gree...","isaiah stewart, seattle university, quade gree...",Washington men face early struggle before pull...,sports
89946,https://www.sportingnews.com/ca/soccer/news/ba...,sports,'Bang up for another derby' - Rashford excited...,Marcus Rashford says he is 'bang up for anothe...,", Football, Premier League, News, Manchester C...",,'Bang up for another derby' - Rashford excited...,sports
158393,https://www.skysports.com/football/news/11938/...,sports,Carabao Cup hits and misses: Rashford strikes ...,,"Sky, Sports, capital, one , cup, carling,footb...",,Carabao Cup hits and misses: Rashford strikes ...,sports
72570,https://cumberlink.com/mid-penn-girls-basketba...,news,Mid-Penn Girls Basketball results for Dec. 17 ...,Commonwealth Division,", prep-sports, james buchanan, dauphin, altoon...",", prep-sports, james buchanan, dauphin, altoon...",Mid-Penn Girls Basketball results for Dec. 17 ...,sports
93257,https://www.skysports.com/football/news/11095/...,sports,Mesut Ozil vs China: US Secretary of State bac...,The United States secretary of state Mike Pomp...,"News,Football,Sky,Sports,Sky Sports,sport,news",,Mesut Ozil vs China: US Secretary of State bac...,sports


## 3. Basic Cleaning Text

In [355]:
# Filling NAs
df_final['text'] = df_final.text.fillna('')

In [18]:
def make_printable(string):
    """
    Clean up Non-ASCII characters.

    Replace non-printable characters in a string.
    """
    no_print_trans_table = ''.join([chr(i) for i in range(128)] + [' '] * 128)
    
    try:
        trans = string.translate(no_print_trans_table)
    except AttributeError:
        trans = ""
    return trans

In [20]:

# Getting rid of non-ascii characters 
df_final['text'] = df_final["text"].apply(lambda text: make_printable(text))

In [21]:
df_final

Unnamed: 0,url,dom_cat,title,description,keywords,news_keywords,text,kw_label
0,https://chicago.cbslocal.com/2019/12/18/man-st...,sports,Man Stabbed After His Granddaughter And Three ...,A man's granddaughter is one of four teens acc...,"grandfather, granddaughter, teen, teens, home ...","grandfather, granddaughter, teen, teens, home ...",Man Stabbed After His Granddaughter And Three ...,
1,http://www.releasewire.com/press-releases/rele...,news,Commence Industrial CRM Achieves Sales Optimiz...,,,,Commence Industrial CRM Achieves Sales Optimiz...,
2,https://www.kark.com/news/national-news/1-dead...,news,"1 dead, 2 injured in Oregon shopping center st...",Multiple people were stabbed at the Murray Hil...,,,"1 dead, 2 injured in Oregon shopping center st...",
3,https://www.ii.co.uk/etfs/sg-wti-x2-daily-long...,financial_news,LSE:SG30 ETF Share Price | SG WTI X2 Daily Lon...,Real-time share price updates and latest news ...,,,LSE:SG30 ETF Share Price | SG WTI X2 Daily Lon...,
4,http://www.peicanada.com/island_deaths/frances...,news,Frances Salsman | Island Deaths | peicanada.com,"SALSMAN, Frances (Fran) Townsend At the Prince...","island_deaths, death_notices","island_deaths, death_notices",Frances Salsman | Island Deaths | peicanada.co...,
...,...,...,...,...,...,...,...,...
163428,https://www.bakingbusiness.com/articles/49996-...,financial_news,Dairy Farmers of America merges ingredients bu...,KANSAS CITY â Dairy Farmers of America (D.F....,"baking business, Baking & Snack, baking and sn...",,Dairy Farmers of America merges ingredients bu...,
163429,https://www.fark.com/comments/10653287?utm_sou...,news,FARK.com: (10653287) You can now watch the vi...,"Humorous views on interesting, bizarre and amu...","Fark, Fark.com, Drew Curtis, News, Community N...",,FARK.com: (10653287) You can now watch the vid...,
163430,https://www.mygrandeprairienow.com/65144/local...,news,Local man hoping to make holidays special by h...,,,,Local man hoping to make holidays special by h...,
163431,https://www.thehollywoodgossip.com/slideshows/...,entertainment,90 Day Fiance Before The 90 Days: Trailer Teas...,90 Day Fiance: Before The 90 Days just release...,,,90 Day Fiance Before The 90 Days: Trailer Teas...,


In [23]:
df_final.sample(1).iloc[0].text

"Veterans Ace nears completion | News | samessenger.com Thank you for Reading! Please log in, or sign up for a new account and purchase a subscription to continue reading. Sign Up Log In Purchase a Subscription Thank you for Reading! Please log in, or sign up for a new account and purchase a subscription to continue reading. Please purchase a subscription to continue reading. Your current subscription does not provide access to this content. Sign Up Log In Current e-Edition Subscribers Free access for current print subscribers Get Started Current Print Subscribers Free access for current print subscribers Get Started Franklin County's #1 Source for Local News Starting at $5.00 for 30 days Get Started View all rates Rate Price Duration One Month $5.00 for 30 days One Year $50.00 for 365 days You are the owner of this article. Edit Article Add New Article Subscribe | Sign In Home News Upbeat News Business Profiles Local History Sports Classifieds Employment Real Estate Property For Rent 

In [24]:
df_final.kw_label.value_counts()

business_economy    1438
sports              1199
politics             479
world                395
entertainment        158
Name: kw_label, dtype: int64

In [25]:
df_final[df_final.kw_label=='sports'].sample(10)

Unnamed: 0,url,dom_cat,title,description,keywords,news_keywords,text,kw_label
82356,https://lmtribune.com/sports/prep-roundup-lust...,news,PREP ROUNDUP: Lustig tallies 44 as Colfax boys...,BOYS BASKETBALL,"colfax, john lustig, sport, basketball, team, ...","colfax, john lustig, sport, basketball, team, ...",PREP ROUNDUP: Lustig tallies 44 as Colfax boys...,sports
118479,https://www.onenewspage.com/video/20191206/125...,news,Celtic v Spartak Moscow | Eurofan - One News P...,"Celtic v Spartak Moscow | Eurofan: Tom Deacon,...","Spartak,soccer,Ð¡Ð¿Ð°Ñ\x80Ñ\x82Ð°Ðº,Champions ...",,Celtic v Spartak Moscow | Eurofan - One News P...,sports
69366,https://www.fordcountyrecord.com/sports/pbl-fr...,news,PBL freshman boys basketball wins 49-19 over A...,DANVILLE -- The Paxton-Buckley-Loda freshman b...,"sports, prep-sports, boys-basketball","sports, prep-sports, boys-basketball",PBL freshman boys basketball wins 49-19 over A...,sports
147669,https://www.onenewspage.com/n/Technology/1zkl5...,news,The Oscars of the video game industry just cel...,Â· *The Game Awards 2019 celebrated the year's...,"oscars,video,game,industry,year,best,games,win...",,The Oscars of the video game industry just cel...,sports
73526,https://www.insidenova.com/sports/wakefield-sw...,news,Wakefield sweeps rivalry games | Sports | insi...,"Strong defense, especially in the second half,...","sports, arlington, wakefield, washington_lee, ...","sports, arlington, wakefield, washington_lee, ...",Wakefield sweeps rivalry games | Sports | insi...,sports
90582,https://www.eacourier.com/sports/eagles-apache...,news,"Eagles, Apaches set for EAC Tournament champio...","THATCHER — A sold-out gymnasium, close games, ...","game, sport, basketball, morenci, duncan, rebo...","game, sport, basketball, morenci, duncan, rebo...","Eagles, Apaches set for EAC Tournament champio...",sports
69787,https://www.heraldbulletin.com/pacers-rally-la...,news,Pacers rally late to snap Lakers' road winning...,INDIANAPOLIS — Domantas Sabonis scored 26 poin...,"indiana pacers, lakers, sport, basketball, mal...","indiana pacers, lakers, sport, basketball, mal...",Pacers rally late to snap Lakers' road winning...,sports
17759,https://lacrossetribune.com/sports/local/preps...,news,High school basketball: La Crosse Aquinas girl...,The Aquinas High School girls basketball team ...,", taylor theusch, aquinas, courtney becker, sp...",", taylor theusch, aquinas, courtney becker, sp...",High school basketball: La Crosse Aquinas girl...,sports
74320,https://www.idahostatejournal.com/sports/isu/t...,news,"Tough to swallow: ISU men lose late lead, fall...","POCATELLO — A late scoring drought, missed fre...","tarik cool, free throw, overtime, sport, baske...","tarik cool, free throw, overtime, sport, baske...","Tough to swallow: ISU men lose late lead, fall...",sports
79015,https://bleacherreport.com/articles/2867403-nf...,sports,"NFL Playoffs 2019-20: Bracket Predictions, Odd...",Four tickets to the 2019-20 NFL playoffs rem...,"Football, NFL, NFL Playoffs, Breaking News",,"NFL Playoffs 2019-20: Bracket Predictions, Odd...",sports


### Comments

Our basic labeling seems to have worked.. 

## 4. Converting to BERT format and Saving files 

In [27]:
file_path = Path.home() / 'Data' / 'cc_news' / 'model_input'
if not file_path.is_dir():
    Path.mkdir(file_path, parents=True, exist_ok=True)
    

In [28]:
# Saving to disk
df_final.to_csv(file_path / 'all_data.csv', header=True, index=False, na_rep="")

In [29]:
# Splitting our data into train and dev
from sklearn.model_selection import train_test_split
train, dev = train_test_split(df_final[~df_final.kw_label.isna()], test_size=0.1, random_state=42)

In [30]:
train.kw_label.value_counts()

business_economy    1300
sports              1085
politics             424
world                349
entertainment        144
Name: kw_label, dtype: int64

Basic file transformation for BERT

In [31]:
train = pd.DataFrame({'id': range(len(train.index)),
                          'label': train["kw_label"],
                          'alpha': ['a'] * train.shape[0],
                          'text': train["text"]
                          })
    
dev = pd.DataFrame({'id': range(len(dev.index)),
                    'label': dev['kw_label'],
                    'alpha': ['a'] * dev.shape[0],
                    'text': dev["text"]
                    })

In [33]:
train.to_csv(file_path / 'train.tsv', sep='\t', header=False, index=False)
dev.to_csv(file_path / 'dev.tsv', sep='\t', header=False, index=False)

We now have a good size corpus for training a BERT model and a very large "unlabeled" corpus

In [34]:
unlabeled = df_final[df_final.kw_label.isna()]
unlabeled = pd.DataFrame({'id': range(len(unlabeled)),
                            'label': None,
                            'alpha': ['a'] * unlabeled.shape[0],
                            'text': unlabeled['text']
                          })

In [35]:
# Save unlabeled documents as test
unlabeled.to_csv(file_path / 'test.tsv', sep='\t', header=False, index=False)

## End of Notebook

And we reached the end of another notebook. Here, we

1. Using the keywords from the meta data tags of the web pages, created class labels. 
2. Did some basic preprocessing for modeling 
3. Train and test split

In the next step, 
1. We will prepare the corpus for BERT (Input features)
2. Fine tune a BERT model using transfer learning
3. And classify our documents into their final categories. 