In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import pandas as pd
import os
import sys

from utils import preprocess

In [2]:
# Read the news dataset; lines=True parses the file as one JSON record per line.
df = pd.read_json(
    path_or_buf="../DATA/News_Category_Dataset_v2.json",
    lines=True,
)
df.head(n=5)

Unnamed: 0,category,headline,authors,link,short_description,date
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,2018-05-26
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",2018-05-26


In [3]:
# Keep only the label and the headline. The explicit .copy() makes `df` an
# independent frame, so later cells can assign into it without pandas'
# SettingWithCopyWarning.
df = df[['category', 'headline']].copy()
df.head(5)

Unnamed: 0,category,headline
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...


In [4]:
def group_categories(group_list, group_name, frame=None):
    """Relabel every category listed in ``group_list`` as ``group_name``.

    Parameters
    ----------
    group_list : list of str
        Existing values of the ``category`` column that should be merged.
    group_name : str
        Label written in place of each matching value.
    frame : pandas.DataFrame, optional
        Frame to relabel. When omitted, the notebook-level ``df`` is used,
        so existing two-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in (or the notebook-level
        ``df``); its ``category`` column is modified in place.
    """
    if frame is None:
        frame = df

    # One membership mask covers all labels instead of one scan per label.
    to_merge = frame['category'].isin(list(group_list))
    frame.loc[to_merge, 'category'] = group_name

    return frame


# Coarse topic label -> the original dataset categories folded into it.
merged_categories = {
    'LIFESTYLE AND WELLNESS': ['WELLNESS', 'HEALTHY LIVING', 'HOME & LIVING', 'STYLE & BEAUTY', 'STYLE'],
    'PARENTING AND EDUCATION': ['PARENTING', 'PARENTS', 'EDUCATION', 'COLLEGE'],
    'SPORTS AND ENTERTAINMENT': ['SPORTS', 'ENTERTAINMENT', 'COMEDY', 'WEIRD NEWS', 'ARTS'],
    'TRAVEL-TOURISM & ART-CULTURE': ['TRAVEL', 'ARTS & CULTURE', 'CULTURE & ARTS', 'FOOD & DRINK', 'TASTE'],
    'EMPOWERED VOICES': ['WOMEN', 'QUEER VOICES', 'LATINO VOICES', 'BLACK VOICES'],
    'BUSINESS & MONEY': ['BUSINESS', 'MONEY'],
    'WORLDNEWS': ['THE WORLDPOST', 'WORLDPOST', 'WORLD NEWS'],
    'NATURE': ['ENVIRONMENT', 'GREEN'],
    'SCIENCE & TECH': ['SCIENCE', 'TECH'],
    'GENERAL': ['FIFTY', 'IMPACT', 'GOOD NEWS', 'CRIME'],
    'MISC': ['WEDDINGS', 'DIVORCE', 'RELIGION', 'MEDIA'],
}

# Dicts preserve insertion order, so the groups are applied in the order listed.
for merged_label, source_labels in merged_categories.items():
    df = group_categories(source_labels, merged_label)

# Number of distinct labels remaining after the merge.
df['category'].nunique()

12

In [5]:
# Give each distinct category an integer id, numbered in the order
# df['category'].unique() returns the labels.
class_labels = df['category'].unique()
encodings = dict(zip(class_labels, range(len(class_labels))))
encodings

{'GENERAL': 0,
 'SPORTS AND ENTERTAINMENT': 1,
 'WORLDNEWS': 2,
 'POLITICS': 3,
 'EMPOWERED VOICES': 4,
 'BUSINESS & MONEY': 5,
 'TRAVEL-TOURISM & ART-CULTURE': 6,
 'MISC': 7,
 'SCIENCE & TECH': 8,
 'PARENTING AND EDUCATION': 9,
 'LIFESTYLE AND WELLNESS': 10,
 'NATURE': 11}

In [6]:
# Look up the integer id of each row's category and store it in 'target'.
# An unknown label raises KeyError rather than being silently skipped.
df['target'] = [encodings[label] for label in df['category']]
df.head(5)

Unnamed: 0,category,headline,target
0,GENERAL,There Were 2 Mass Shootings In Texas Last Week...,0
1,SPORTS AND ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,1
2,SPORTS AND ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,1
3,SPORTS AND ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,1
4,SPORTS AND ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,1


In [7]:
# Count rows that repeat an earlier row in every column.
df.duplicated(keep='first').sum()

1425

In [8]:
# Remove fully duplicated rows, keeping the last occurrence of each.
# Rebinding the name (rather than inplace=True) leaves the previous frame untouched.
df = df.drop_duplicates(keep='last')

In [9]:
# Count headlines that already appeared in an earlier row.
df['headline'].duplicated().sum()

84

In [10]:
# Keep one row per headline (the last occurrence), whatever its category.
# Rebinding the name (rather than inplace=True) leaves the previous frame untouched.
df = df.drop_duplicates(subset=['headline'], keep='last')

In [11]:
# Number of rows whose headline is the empty string.
print((df['headline'] == '').sum())

1


In [12]:
# Drop rows whose headline is missing or the empty string. A single boolean
# filter removes the same rows as blanking to NaN and then calling dropna,
# without mutating the frame in place.
has_headline = df['headline'].notna() & (df['headline'] != '')
df = df.loc[has_headline].copy()
print(len(df[df['headline'] == '']))

0


In [13]:
# Run the project's preprocess helper on every headline and store the result in 'text'.
df['text'] = df['headline'].map(preprocess)
df.head(5)

Unnamed: 0,category,headline,target,text
0,GENERAL,There Were 2 Mass Shootings In Texas Last Week...,0,there were mass shootings in texas last week b...
1,SPORTS AND ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,1,will smith joins diplo and nicky jam for the w...
2,SPORTS AND ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,1,hugh grant marries for the first time at age
3,SPORTS AND ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,1,jim carrey blasts castrato adam schiff and dem...
4,SPORTS AND ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,1,julianna margulies uses donald trump poop bags...


In [14]:
# Show the last five rows of the frame.
df.iloc[-5:]

Unnamed: 0,category,headline,target,text
200848,SCIENCE & TECH,RIM CEO Thorsten Heins' 'Significant' Plans Fo...,8,rim ceo thorsten heins significant plans for b...
200849,SPORTS AND ENTERTAINMENT,Maria Sharapova Stunned By Victoria Azarenka I...,1,maria sharapova stunned by victoria azarenka i...
200850,SPORTS AND ENTERTAINMENT,"Giants Over Patriots, Jets Over Colts Among M...",1,giants over patriots jets over colts among mos...
200851,SPORTS AND ENTERTAINMENT,Aldon Smith Arrested: 49ers Linebacker Busted ...,1,aldon smith arrested ers linebacker busted for...
200852,SPORTS AND ENTERTAINMENT,Dwight Howard Rips Teammates After Magic Loss ...,1,dwight howard rips teammates after magic loss ...


In [15]:
# (row count, column count) of the cleaned frame.
(len(df.index), len(df.columns))

(199343, 4)

In [16]:
# Remove the raw label and headline columns; the remaining columns are the
# ones written to disk further down. Rebinding avoids mutating the frame in place.
df = df.drop(columns=['category', 'headline'])

print(df.shape)

(199343, 2)


In [17]:
# Hold out 2% of the rows, then halve the hold-out into validation and test.
# Both splits are stratified on 'target' so that all three sets keep
# approximately the same class proportions (the second split previously was not).
df_train, df_holdout = train_test_split(
    df, test_size=0.02, stratify=df['target'], random_state=42
)
df_val, df_test = train_test_split(
    df_holdout, test_size=0.5, stratify=df_holdout['target'], random_state=42
)


print(f'Shape of Train data : {df_train.shape}')
print(f'Shape of Validation data : {df_val.shape}')
print(f'Shape of Test data : {df_test.shape}')

Shape of Train data : (195356, 2)
Shape of Validation data : (1993, 2)
Shape of Test data : (1994, 2)


In [18]:
# Create the output directory if needed. exist_ok=True makes this safe to
# re-run and avoids the gap between checking for the directory and creating it.
os.makedirs("../DATA/News/", exist_ok=True)

In [19]:
# Write each split to its own CSV file, without the index column.
for split_frame, csv_path in (
    (df_train, "../DATA/News/train.csv"),
    (df_val, "../DATA/News/valid.csv"),
    (df_test, "../DATA/News/test.csv"),
):
    split_frame.to_csv(csv_path, index=False)



# Save the label -> integer id mapping alongside the splits.
df_encode = pd.DataFrame(list(encodings.items()), columns=["classes", "index"])
df_encode.to_csv("../DATA/News/class_list.csv", index=False)