In [170]:
import csv
import os
import pandas as pd
import re
import sys

from collections import Counter
from loguru import logger
from pathlib import Path
from time import time, strftime, gmtime

In [4]:
# Configuring the logger
config = {"handlers": [{"sink": sys.stdout,"colorize": True,
          "format": "<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>"}]}
logger.configure(**config)

[1]

In [11]:
# Data Folder
data_folder = Path.home() / 'Data' / 'cc_news'

In [28]:
start_time = time()
with open(data_folder / 'spark_output' / 'df_final.csv', 'r') as csv_file:
    reader = csv.DictReader(csv_file)
    keyword_counter = Counter()
    for i, row in enumerate(reader):
        keywords = row['keywords']
        if keywords != "" or keywords is not None:
            try:
                split_keywords = re.split('[,;-]', keywords)
            except TypeError:
                pass
            for k in split_keywords:
                if k != '' and len(k)>2:
                    keyword_counter.update({k.strip().lower(): 1})
elapsed = strftime("%H:%M:%S", gmtime(time() - start_time))
logger.info(f'It took {elapsed} to run this script!')
os.system("play /usr/share/sounds/sound-icons/trumpet-1.wav") # Beeps an alert. May not run on your machine

[32m2020-01-08 21:54:48[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m16[0m - [1mIt took 00:00:05 to run this script![0m


0

In [328]:
# Create a short list of main topic keywords that we will use as a lookup
main_topics = {("government / politics", "politics", "government", "us government news", "government news"): "politics",
               ("economics", "economy", "trade", "business", "company news","finance", "market reports",
                "stock", "stocks" "company", "companies", "tech", "technology",
                "mergers / acquisitions / takeovers", "corporate events", "equity markets",
                "international trade","equities markets", "markets", "market", "emerging market countries"): "business_economy",
               ("world","world news", "europe", "western europe","asia", "asia / pacific", 
                "africa","west africa","east africa", "southern africa", "european union"
                "latin america", "americas", "mid east", "middle east"): "world",
                ("entertainment","entertainment news", "celebrity", "gossip", "music"): "entertainment",
                ('sport', 'sports', 'league', 'game', 'soccer', 'basketball', 'football', 'games', 'score', 'scores', "nfl", "premier league"): "sports",
              }

In [329]:
start_time = time()
with open(data_folder / 'spark_output' / 'df_final.csv', 'r') as csv_file:
    reader = csv.DictReader(csv_file)
    counters = {"politics" : Counter(), 
                "world": Counter(),
                "business_economy":Counter(),
                'sports':Counter(),
                'entertainment':Counter(),
                'other': Counter()
               }
    
    exclude = ["news", 'local', "general news", "united states", "travel",
              "breaking news", "breaking", "major events", "major news", 
               "current events", "latest news", "current_news", "picture available", "pictures", "picture"]
    main_topic_kw = [k for key in main_topics.keys() for k in key] + exclude
    for row in reader:
        keywords = row['keywords']
        if keywords != "" or keywords is not None:
            try:
                split_keywords = re.split('[,;-]', keywords)
                split_keywords = [s.strip().lower() for s in split_keywords]
                for k, v in main_topics.items():
                    if len(set(split_keywords).intersection(set(k))) > 1:
                        for kw in split_keywords:
                            if kw != '' and len(kw)>2 and kw not in main_topic_kw:
                                counters[v].update({kw: 1})
                    else:
                        for kw in split_keywords:
                            if kw != '' and len(kw)>2 and kw not in main_topic_kw:
                                counters['other'].update({kw: 1})

            except TypeError:
                pass
    
elapsed = strftime("%H:%M:%S", gmtime(time() - start_time))
logger.info(f'It took {elapsed} to run this script!')
os.system("play /usr/share/sounds/sound-icons/trumpet-1.wav") # Beeps an alert. May not run on your machine

[32m2020-01-09 16:42:26[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mIt took 00:00:10 to run this script![0m


0

In [330]:
for k, counter in counters.items():
    print(k, counters[k].most_common(30), "\n")

politics [('usa', 209), ('diplomacy / foreign policy', 145), ('us house of representatives', 137), ('international / national security', 114), ('trump', 113), ('judicial process / court cases / court decisions', 113), ('impeachment', 107), ('donald trump', 104), ('us senate', 103), ('ukraine', 102), ('lawmaking', 101), ('crime / law / justice', 100), ('crime', 92), ('russia', 90), ('elections / voting', 64), ('presidential elections', 61), ('exclusive', 46), ('video', 45), ('nancy pelosi', 42), ('graphics', 41), ('stadium', 40), ('european union', 39), ('match', 38), ('biofuels', 36), ('region', 34), ('message', 32), ('health / medicine', 31), ('oil and gas (trbc)', 30), ('conflicts / war / peace', 30), ('joe biden', 30)] 

world [('united kingdom', 256), ('france', 162), ('european union', 152), ('china (prc)', 151), ('diplomacy / foreign policy', 151), ('germany', 145), ('financials (legacy)', 113), ('japan', 110), ('euro zone', 108), ('financials (trbc)', 104), ('italy', 104), ('jud

In [347]:
df_final = pd.read_csv(data_folder / 'spark_output' / 'df_final.csv')

In [348]:
def label_records_via_keywords(keywords):
    if keywords != "" or keywords is not None:
        try:
            split_keywords = re.split('[,;-]', keywords)
            split_keywords = [s.strip().lower() for s in split_keywords]
            for k, v in main_topics.items():
                if len(set(split_keywords).intersection(set(k))) > 1:
                    return v
        except:
            return None

In [349]:
df_final.head()

Unnamed: 0,url,dom_cat,title,description,keywords,news_keywords,text
0,https://chicago.cbslocal.com/2019/12/18/man-st...,sports,Man Stabbed After His Granddaughter And Three ...,A man's granddaughter is one of four teens acc...,"grandfather, granddaughter, teen, teens, home ...","grandfather, granddaughter, teen, teens, home ...",Man Stabbed After His Granddaughter And Three ...
1,http://www.releasewire.com/press-releases/rele...,news,Commence Industrial CRM Achieves Sales Optimiz...,,,,Commence Industrial CRM Achieves Sales Optimiz...
2,https://www.kark.com/news/national-news/1-dead...,news,"1 dead, 2 injured in Oregon shopping center st...",Multiple people were stabbed at the Murray Hil...,,,"1 dead, 2 injured in Oregon shopping center st..."
3,https://www.ii.co.uk/etfs/sg-wti-x2-daily-long...,financial_news,LSE:SG30 ETF Share Price | SG WTI X2 Daily Lon...,Real-time share price updates and latest news ...,,,LSE:SG30 ETF Share Price | SG WTI X2 Daily Lon...
4,http://www.peicanada.com/island_deaths/frances...,news,Frances Salsman | Island Deaths | peicanada.com,"SALSMAN, Frances (Fran) Townsend At the Prince...","island_deaths, death_notices","island_deaths, death_notices",Frances Salsman | Island Deaths | peicanada.co...


In [350]:
df_final['kw_label'] = df_final.apply(lambda row: label_records_via_keywords(row['keywords']), axis=1)

In [351]:
df_final[~df_final.kw_label.isna()]

Unnamed: 0,url,dom_cat,title,description,keywords,news_keywords,text,kw_label
64,https://www.nwitimes.com/news/local/govt-and-p...,news,Visclosky set to vote today for both impeachme...,U.S. Rep. Pete Visclosky will join the Democra...,", indiana, national-government, local-governme...",", indiana, national-government, local-governme...",Visclosky set to vote today for both impeachme...,politics
199,https://www.news4jax.com/news/politics/2019/12...,news,Sports betting's rapid expansion faces more te...,The line for the ticket windows at a Rhode Isl...,"Sports,politics,Government",,Sports betting's rapid expansion faces more te...,politics
302,https://www.reuters.com/article/us-basketball-...,news,\n Suns' Ayton begins second ac...,Phoenix Suns big man Deandre Ayton is again in...,"US,BASKETBALL,NBA,PHX,AYTON,Bahamas,Americas,S...",US;BASKETBALL;NBA;PHX;AYTON;Bahamas;Americas;S...,Suns' Ayton begins second act after suspension...,sports
309,https://www.reuters.com/article/us-usa-biofuel...,news,\n White House says it is stick...,The Trump administration plans to stick with i...,"US,USA,BIOFUELS,EXCLUSIVE,Biofuels,Advocacy Gr...",US;USA;BIOFUELS;EXCLUSIVE;Biofuels;Advocacy Gr...,White House says it is sticking with 2020 biof...,politics
342,https://coed.com/2019/11/26/dallas-cowboys-hea...,entertainment,Dallas Cowboys Head Coaching Odds 2019: Who Wi...,Who will be the Dallas Cowboys head coach next...,"Sports,Sports,Betting Odds,dallas-cowboys,NFL","Sports,Sports,Betting Odds,dallas-cowboys,NFL",Dallas Cowboys Head Coaching Odds 2019: Who Wi...,sports
...,...,...,...,...,...,...,...,...
163132,https://www.redandblack.com/sports/georgia-foo...,news,Georgia football signs 5-star outside lineback...,Mekhail Sherman signed his letter of intent to...,"mekhail sherman, linebacker, georgia, sport, b...","mekhail sherman, linebacker, georgia, sport, b...",Georgia football signs 5-star outside lineback...,sports
163221,https://6abc.com/sports/eagles-ready-to-take-o...,news,"Philadelphia Eagles ready to take on Cowboys, ...","""Embarrassing!"" That's the word Carson Wentz, ...","Philadelphia eagles, eagles, nfl, football, Da...",,"Philadelphia Eagles ready to take on Cowboys, ...",sports
163319,https://madison.com/opinion/letters/coach-gard...,news,Coach Gard must right sinking ship -- Hal Wiss...,What’s going on with the Wisconsin men’s baske...,", letter to editor, commentary, greg gard, bas...",", letter to editor, commentary, greg gard, bas...",Coach Gard must right sinking ship -- Hal Wiss...,sports
163328,https://qctimes.com/sports/high-school/basketb...,news,Central DeWitt remains No. 1 in Area Fab 5 | H...,AREA FAB 5,", northeast, central dewitt, game, sport, albu...",", northeast, central dewitt, game, sport, albu...",Central DeWitt remains No. 1 in Area Fab 5 | H...,sports


In [352]:
# Sampling for inspection
df_final[df_final.kw_label=='sports'].sample(10)

Unnamed: 0,url,dom_cat,title,description,keywords,news_keywords,text,kw_label
67187,https://www.wsls.com/sports/2019/12/18/introdu...,news,Introducing your 2019 Coach and Player of the ...,Local high school football greats were honored...,"Sports,High School,1st and 10,Football",,Introducing your 2019 Coach and Player of the ...,sports
69872,https://www.wacotrib.com/sports/college/baylor...,news,Defense key to Baylor men's No. 10 ranking | B...,Baylor has risen to No. 10 in the country behi...,"baylor, sport, basketball, bears show, mark vi...","baylor, sport, basketball, bears show, mark vi...",Defense key to Baylor men's No. 10 ranking | B...,sports
47807,https://www.heraldandnews.com/sports/arizona-b...,news,"Arizona, Bumgarner complete deal | Sports | he...",PHOENIX (AP) — The Arizona Diamondbacks have f...,"mlb baseball, arizona diamondbacks, madison bu...","mlb baseball, arizona diamondbacks, madison bu...","Arizona, Bumgarner complete deal | Sports | he...",sports
149257,https://www.onenewspage.com/video/20191213/125...,news,David Stern Hospitalized With Brain Hemorrhage...,David Stern Hospitalized With Brain Hemorrhage...,"David Stern brain hemorrhage,basketball,sports...",,David Stern Hospitalized With Brain Hemorrhage...,sports
16829,https://www.chroniclejournal.com/sports/nation...,news,"Aho, Staal score twice as the Hurricanes doubl...",WINNIPEG - Sebastian Aho and Jordan Staal each...,"sports, carolina hurricanes, sebastian aho, po...","sports, carolina hurricanes, sebastian aho, po...","Aho, Staal score twice as the Hurricanes doubl...",sports
129523,https://www.stwnewspress.com/sports/osu_sports...,news,ORANGE PRATTLE: Why I was part of 4% not to vo...,"As an Oklahoma State beat writer, I know full ...","jalen hurts, joe burrow, justin fields, sport,...","jalen hurts, joe burrow, justin fields, sport,...",ORANGE PRATTLE: Why I was part of 4% not to vo...,sports
47664,https://www.skysports.com/football/news/11670/...,sports,Arsenal hope Mikel Arteta will be head coach f...,Arsenal remain hopeful that Mikel Arteta will ...,"Sky, Sports, Football, Premier League, Premier...",,Arsenal hope Mikel Arteta will be head coach f...,sports
82183,https://www.hitc.com/en-gb/2019/12/18/do-repor...,news,Report: Southampton want Champions League pair...,Joakim Maehle and Marin Pongracic could be hea...,"saintsfc,cpfc,southampton fc,crystal palace fc...","saintsfc,cpfc,southampton fc,crystal palace fc...",Report: Southampton want Champions League pair...,sports
80101,https://www.forbes.com/sites/mikeozanian/2019/...,news,The Worldâs Most Valuable Sports Empires: Wo...,Sports owners with great brands have increasin...,"sports, nba, nfl, mlb, nhl, premier league, ml...","sports, nba, nfl, mlb, nhl, premier league, ml...",The Worldâs Most Valuable Sports Empires: Wo...,sports
17316,https://oklahoman.com/article/5650169/ou-at-cr...,news,OU at Creighton men's basketball: Foul trouble...,Oklahoma was down by eight points with 13:31 l...,"oklahoma city,okc,Sports, College, Ou, Basketb...","oklahoma city,okc,Sports, College, Ou, Basketb...",OU at Creighton men's basketball: Foul trouble...,sports


In [353]:
df_final.iloc[47622].text


"Cameroon-Israel Relations - Ambassador Granted Farewell Audience - allAfrica.com Countries All Countries Algeria Angola Benin Botswana Burkina Faso Burundi Cameroon Cape Verde Central African Republic Chad Comoros Congo-Brazzaville Congo-Kinshasa Cote d'Ivoire Djibouti Egypt Equatorial Guinea Eritrea Ethiopia Gabon Gambia Ghana Guinea Guinea Bissau Kenya Lesotho Liberia Libya Madagascar Malawi Mali Mauritania Mauritius Morocco Mozambique Namibia Niger Nigeria Rwanda Senegal Seychelles Sierra Leone Somalia South Africa South Sudan Sudan Swaziland São Tomé and Príncipe Tanzania Togo Tunisia Uganda Western Sahara Zambia Zimbabwe Africa-Wide Central Africa Central Africa Home Angola Burundi Cameroon Central African Republic Chad Congo-Brazzaville Congo-Kinshasa Equatorial Guinea Gabon Rwanda São Tomé and Príncipe East Africa East Africa Home Burundi Comoros Djibouti Eritrea Ethiopia Kenya Madagascar Mauritius Rwanda Seychelles Somalia South Sudan Sudan Tanzania Uganda North Africa North A

In [354]:
df_final.head()

Unnamed: 0,url,dom_cat,title,description,keywords,news_keywords,text,kw_label
0,https://chicago.cbslocal.com/2019/12/18/man-st...,sports,Man Stabbed After His Granddaughter And Three ...,A man's granddaughter is one of four teens acc...,"grandfather, granddaughter, teen, teens, home ...","grandfather, granddaughter, teen, teens, home ...",Man Stabbed After His Granddaughter And Three ...,
1,http://www.releasewire.com/press-releases/rele...,news,Commence Industrial CRM Achieves Sales Optimiz...,,,,Commence Industrial CRM Achieves Sales Optimiz...,
2,https://www.kark.com/news/national-news/1-dead...,news,"1 dead, 2 injured in Oregon shopping center st...",Multiple people were stabbed at the Murray Hil...,,,"1 dead, 2 injured in Oregon shopping center st...",
3,https://www.ii.co.uk/etfs/sg-wti-x2-daily-long...,financial_news,LSE:SG30 ETF Share Price | SG WTI X2 Daily Lon...,Real-time share price updates and latest news ...,,,LSE:SG30 ETF Share Price | SG WTI X2 Daily Lon...,
4,http://www.peicanada.com/island_deaths/frances...,news,Frances Salsman | Island Deaths | peicanada.com,"SALSMAN, Frances (Fran) Townsend At the Prince...","island_deaths, death_notices","island_deaths, death_notices",Frances Salsman | Island Deaths | peicanada.co...,


In [355]:
df_final['text'] = df_final.text.fillna('')

In [356]:
def make_printable(string):
    """
    Clean up Non-ASCII characters.

    Replace non-printable characters in a string.
    """
    no_print_trans_table = ''.join([chr(i) for i in range(128)] + [' '] * 128)
    
    try:
        trans = string.translate(no_print_trans_table)
    except AttributeError:
        trans = ""
    return trans

In [357]:
df_final['text'] = df_final["text"].apply(lambda text: make_printable(text))

In [358]:
df_final

Unnamed: 0,url,dom_cat,title,description,keywords,news_keywords,text,kw_label
0,https://chicago.cbslocal.com/2019/12/18/man-st...,sports,Man Stabbed After His Granddaughter And Three ...,A man's granddaughter is one of four teens acc...,"grandfather, granddaughter, teen, teens, home ...","grandfather, granddaughter, teen, teens, home ...",Man Stabbed After His Granddaughter And Three ...,
1,http://www.releasewire.com/press-releases/rele...,news,Commence Industrial CRM Achieves Sales Optimiz...,,,,Commence Industrial CRM Achieves Sales Optimiz...,
2,https://www.kark.com/news/national-news/1-dead...,news,"1 dead, 2 injured in Oregon shopping center st...",Multiple people were stabbed at the Murray Hil...,,,"1 dead, 2 injured in Oregon shopping center st...",
3,https://www.ii.co.uk/etfs/sg-wti-x2-daily-long...,financial_news,LSE:SG30 ETF Share Price | SG WTI X2 Daily Lon...,Real-time share price updates and latest news ...,,,LSE:SG30 ETF Share Price | SG WTI X2 Daily Lon...,
4,http://www.peicanada.com/island_deaths/frances...,news,Frances Salsman | Island Deaths | peicanada.com,"SALSMAN, Frances (Fran) Townsend At the Prince...","island_deaths, death_notices","island_deaths, death_notices",Frances Salsman | Island Deaths | peicanada.co...,
...,...,...,...,...,...,...,...,...
163428,https://www.bakingbusiness.com/articles/49996-...,financial_news,Dairy Farmers of America merges ingredients bu...,KANSAS CITY â Dairy Farmers of America (D.F....,"baking business, Baking & Snack, baking and sn...",,Dairy Farmers of America merges ingredients bu...,
163429,https://www.fark.com/comments/10653287?utm_sou...,news,FARK.com: (10653287) You can now watch the vi...,"Humorous views on interesting, bizarre and amu...","Fark, Fark.com, Drew Curtis, News, Community N...",,FARK.com: (10653287) You can now watch the vid...,
163430,https://www.mygrandeprairienow.com/65144/local...,news,Local man hoping to make holidays special by h...,,,,Local man hoping to make holidays special by h...,
163431,https://www.thehollywoodgossip.com/slideshows/...,entertainment,90 Day Fiance Before The 90 Days: Trailer Teas...,90 Day Fiance: Before The 90 Days just release...,,,90 Day Fiance Before The 90 Days: Trailer Teas...,


In [359]:
df_final.text.sample(1).item()

  """Entry point for launching an IPython kernel.


'Lawsuit: Apple, Microsoft profit from child cobalt miners | KRQE News 13 Skip to content KRQE News 13 Albuquerque 21  Sponsored By Search Primary Menu Live/Video Video Center Live Broadcast KRQE Live CBSN Live Stream News Top Stories Local News Larry Barker On Special Assignment Politics – Government Elections Election Results Washington D.C. Bureau U.S. News World Entertainment Don’t Miss Weird Wildfires BorderReport.com Top Stories Company believes to have captured essence of New Mexico in candle As House convenes on impeachment, Trump declares disbelief Verizon reportedly experiencing nationwide outage Survey: Average American will be involved in 12 arguments during holiday travel Weather Radar VIDEO Full Forecast Closings & Delays Alerts Map Center Ski Conditions Winds Albuquerque Rio Rancho East Mountains/Edgewood Santa Fe/Northern Mtns Farmington/Northwest NM Durango/Southern CO Las Vegas Roswell Roads/Traffic Live Traffic Map & Cameras Road Weather Albuquerque I-40 Cameras Albu

In [360]:
df_final.kw_label.value_counts()

business_economy    1438
sports              1199
politics             479
world                395
entertainment        158
Name: kw_label, dtype: int64

In [368]:
df_final[df_final.kw_label=='sports'].sample(10)

Unnamed: 0,url,dom_cat,title,description,keywords,news_keywords,text,kw_label
72154,https://santamariatimes.com/sports/high-school...,news,Santa Ynez boys soccer hands Santa Maria its 1...,The Santa Ynez Pirates scored early Tuesday ni...,", local-sports, prep-sports, soccer, sports, s...",", local-sports, prep-sports, soccer, sports, s...",Santa Ynez boys soccer hands Santa Maria its 1...,sports
121816,https://wcfcourier.com/daniel-jackson/article_...,news,Daniel Jackson | | wcfcourier.com,Daniel Jackson,", daniel jackson, cyclone, sport, pass, score,...",", daniel jackson, cyclone, sport, pass, score,...",Daniel Jackson | | wcfcourier.com Facebook Twi...,sports
71815,https://www.hitc.com/en-gb/2019/12/18/great-im...,news,"'Great imagination, great feet': Jamie Redknap...",Liverpool's youngsters lost 5-0 at Aston Villa...,"liverpool fc,premier league,jamie redknapp,har...","liverpool fc,premier league,jamie redknapp,har...","'Great imagination, great feet': Jamie Redknap...",sports
71903,https://www.hitc.com/en-gb/2019/12/18/report-t...,news,Report: The January transfer budget Arsenal wi...,Arsenal are expected to announce the appointme...,"arsenal fc,afc,premier league,mikel arteta,sport","arsenal fc,afc,premier league,mikel arteta,sport",Report: The January transfer budget Arsenal wi...,sports
17499,https://elkodaily.com/sports/local/spartans-ta...,news,Spartans take 31st of 107 teams at RTOC | Loca...,Facing the ultimate competition at the Reno To...,", spring creek spartans, sports, wrestling, re...",", spring creek spartans, sports, wrestling, re...",Spartans take 31st of 107 teams at RTOC | Loca...,sports
51724,https://www.skysports.com/football/news/11095/...,sports,Family of England fan who died in Bulgaria sti...,The family of an England supporter who died in...,"News,Football,Sky,Sports,Sky Sports,sport,news",,Family of England fan who died in Bulgaria sti...,sports
88344,https://www.omaha.com/sports/scoreboard/thursd...,news,Thursday's Area Events | Scoreboard | omaha.com,BOYS HIGH SCHOOL BASKETBALL,"omaha, sport, basketball, malcolm, johnson cou...","omaha, sport, basketball, malcolm, johnson cou...",Thursday's Area Events | Scoreboard | omaha.co...,sports
67048,https://www.somdnews.com/enterprise/sports/new...,news,St. Mary's College of Maryland basketball team...,Women’s basketball wins third straight,"""st. marys college of maryland, mens basketbal...","""st. marys college of maryland, mens basketbal...",St. Mary's College of Maryland basketball team...,sports
120442,https://santamariatimes.com/jamarri-jackson-db...,news,Jamarri Jackson | DB | | santamariatimes.com,Signed,", tackle, jackson, sport, norcal db, san mateo...",", tackle, jackson, sport, norcal db, san mateo...",Jamarri Jackson | DB | | santamariatimes.com F...,sports
41929,https://www.hitc.com/en-gb/2019/12/18/should-l...,news,Should Leeds look at Tottenham's Juan Foyth?,The Tottenham Hotspur defender was linked with...,"leeds united fc,tottenham hotspur fc,premier l...","leeds united fc,tottenham hotspur fc,premier l...",Should Leeds look at Tottenham's Juan Foyth? S...,sports


In [374]:
from sklearn.model_selection import train_test_split
train, dev = train_test_split(df_final[~df_final.kw_label.isna()], test_size=0.1, random_state=42)

In [375]:
train.kw_label.value_counts()

business_economy    1300
sports              1085
politics             424
world                349
entertainment        144
Name: kw_label, dtype: int64

In [377]:
train = pd.DataFrame({'id': range(len(train.index)),
                          'label': train["kw_label"],
                          'alpha': ['a'] * train.shape[0],
                          'text': train["text"]
                          })
    
dev = pd.DataFrame({'id': range(len(dev.index)),
                    'label': dev['kw_label'],
                    'alpha': ['a'] * dev.shape[0],
                    'text': dev["text"]
                    })

In [378]:
file_path = Path.home() / 'Data' / 'cc_news' / 'model_input'
if not file_path.is_dir():
    Path.mkdir(file_path, parents=True, exist_ok=True)
    
train.to_csv(file_path / 'train.tsv', sep='\t', header=False, index=False)
dev.to_csv(file_path / 'dev.tsv', sep='\t', header=False, index=False)

In [383]:
unlabeled = df_final[df_final.kw_label.isna()]
unlabeled = pd.DataFrame({'id': range(len(df_final)),
                            'text': df_final['text']
                          })

In [384]:
# Save unlabeled documents
unlabeled.to_csv(file_path / 'unlabeled.tsv', sep='\t', header=False, index=False)

In [386]:
df_final.kw_label.unique().tolist()

[None, 'politics', 'sports', 'business_economy', 'world', 'entertainment']