# Organizing and Preparing Data from Raw Data

In [1]:
import os
import pandas as pd
import numpy as np

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

words = []
labels = []

categories = "dataset/20news-18828-v2/20news-18828/"
for category in os.listdir(categories):
    category_dir = f"{categories}/{category}"
    for filename in os.listdir(category_dir):
        category_filepath = f"{categories}/{category}/{filename}"
        if os.path.isfile(category_filepath):
            f = open(category_filepath, "rb")
            word = f.read() #read contents of each file
            words.append(word)
            labels.append(category)

raw_dataset = pd.DataFrame({'text': pd.Series(words,dtype="string"), 'categories': pd.Series(labels,dtype="string")})
raw_dataset.head()

Unnamed: 0,text,categories
0,"b""From: vdp@mayo.edu (Vinayak Dutt)\nSubject: ...",alt.atheism
1,b'From: I3150101@dbstu1.rz.tu-bs.de (Benedikt ...,alt.atheism
2,b'Subject: Re: Alleged Deathbed Conversions (w...,alt.atheism
3,b'From: livesey@solntze.wpd.sgi.com (Jon Lives...,alt.atheism
4,b'Subject: Re: islamic authority over women\nF...,alt.atheism


In [2]:
raw_dataset.to_csv('dataset/organized_dataset.csv', index=False, header=True)

## Splitting data to Train and Test sets

In [3]:
full_df = pd.read_csv("dataset/organized_dataset.csv")

In [4]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(full_df, 
                                     test_size=0.30, #Split dataset to 70 - 30
                                     stratify=full_df['categories'], #strategically splits the dataset based on the proportion of samples per class
                                     random_state=RANDOM_SEED)

In [5]:
train_df['categories'].value_counts()

rec.sport.hockey            699
soc.religion.christian      698
rec.sport.baseball          696
rec.motorcycles             696
sci.crypt                   694
rec.autos                   693
sci.med                     693
sci.space                   691
comp.os.ms-windows.misc     689
comp.sys.ibm.pc.hardware    687
sci.electronics             687
comp.windows.x              686
comp.graphics               681
misc.forsale                680
comp.sys.mac.hardware       673
talk.politics.mideast       658
talk.politics.guns          637
alt.atheism                 559
talk.politics.misc          542
talk.religion.misc          440
Name: categories, dtype: int64

In [6]:
test_df['categories'].value_counts()

rec.sport.hockey            300
soc.religion.christian      299
rec.sport.baseball          298
rec.motorcycles             298
sci.crypt                   297
rec.autos                   297
sci.med                     297
comp.os.ms-windows.misc     296
sci.space                   296
comp.sys.ibm.pc.hardware    295
comp.windows.x              294
sci.electronics             294
comp.graphics               292
misc.forsale                292
comp.sys.mac.hardware       288
talk.politics.mideast       282
talk.politics.guns          273
alt.atheism                 240
talk.politics.misc          233
talk.religion.misc          188
Name: categories, dtype: int64

In [7]:
train_df.to_csv("dataset/train.csv", index=False, header=True)

In [8]:
test_df.to_csv("dataset/test.csv", index=False, header=True)