# Data Preparation
In this notebook, we prepare a four-category subset of the 20 Newsgroups dataset for analysis and save it as CSV files.

In [2]:
import os

from sklearn.datasets import fetch_20newsgroups

# Parts of each post that the fetcher is asked to strip (passed as `remove`).
types_to_remove = ('headers', 'footers', 'quotes')

# The four newsgroups kept for this dataset.
newsgroups_categories = [
    'rec.sport.baseball',
    'rec.sport.hockey',
    'talk.politics.guns',
    'talk.politics.mideast',
]

# Both subsets are fetched with exactly the same filtering options.
shared_fetch_options = {
    'categories': newsgroups_categories,
    'remove': types_to_remove,
}
newsgroups_train = fetch_20newsgroups(subset='train', **shared_fetch_options)
newsgroups_test = fetch_20newsgroups(subset='test', **shared_fetch_options)

In [15]:
import pandas as pd
# Translate each numeric target id into its newsgroup name.
train_target_names = [
    newsgroups_train.target_names[label_id]
    for label_id in newsgroups_train.target
]

# One row per training post: raw text, numeric target, readable target name.
df = pd.DataFrame({
    'data': newsgroups_train.data,
    'target': newsgroups_train.target,
    'target_name': train_target_names,
})

In [16]:
# Translate each numeric target id into its newsgroup name.
test_target_names = [
    newsgroups_test.target_names[label_id]
    for label_id in newsgroups_test.target
]

# One row per test post, with the same three columns as the training frame.
df_test = pd.DataFrame({
    'data': newsgroups_test.data,
    'target': newsgroups_test.target,
    'target_name': test_target_names,
})

In [17]:
# Stack the train and test frames into a single dataset.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it, both
# halves keep their own 0-based labels, so labels repeat, and the later
# reset_index() would export those repeated labels as the 'index' column,
# where they could not identify a row uniquely.
df = pd.concat([df, df_test], ignore_index=True)

In [18]:
# Display (rows, columns) of df as a quick sanity check.
df.shape

(3843, 3)

In [19]:
def preprocess_text(txt):
    """Normalise a raw post into lower-case, single-space-separated tokens.

    Newlines, carriage returns and tabs become spaces; '?', '.' and ','
    get a space inserted in front of them so they split off from the
    preceding word. The text is then lower-cased, stripped, split on the
    space character, and re-joined with single spaces after dropping
    empty pieces.

    Args:
        txt: The raw text of one post.

    Returns:
        The normalised text as a single string ('' if nothing remains).
    """
    # Applied in this order, one after another, exactly like a chained replace.
    substitutions = (
        ('\n', ' '),
        ('\r', ' '),
        ('\t', ' '),
        ('?', ' ?'),
        ('.', ' .'),
        (',', ' ,'),
    )
    for old, new in substitutions:
        txt = txt.replace(old, new)

    pieces = txt.lower().strip().split(' ')
    # Runs of spaces produce empty pieces; filter(None, ...) discards them.
    return " ".join(filter(None, pieces))

In [20]:
# Run the text normaliser over every raw post, element by element.
df['data_processed'] = df['data'].map(preprocess_text)

In [21]:
# Character count of each processed post.
df['num_chars'] = df['data_processed'].str.len()

In [22]:
# Keep only posts that still contain text after preprocessing.
df = df.loc[df['num_chars'].gt(0)]

In [23]:
# Print the first row's raw text followed by its preprocessed form.
first_row = df.iloc[0]
for column_name in ('data', 'data_processed'):
    print(first_row[column_name])

Does anyone have the scoop on Scot Erickson?  How long is he going to be
out for?


does anyone have the scoop on scot erickson ? how long is he going to be out for ?


In [23]:
# Make sure the output folder exists so the export also works on a fresh
# checkout; to_csv does not create missing directories.
os.makedirs('data', exist_ok=True)

# Save the full cleaned dataset (all columns, without the row index).
df.to_csv('data/20newsgroups.csv', index=False)

In [25]:
# Move the current row labels into an 'index' column, rename the modelling
# columns, and keep only the three columns that get exported.
df = df.reset_index()
df = df.rename(columns={'data_processed': 'text', 'target': 'label'})
df = df[['index', 'text', 'label']]

# Training split: a seeded random sample of 70% of the rows.
df_train = df.sample(frac=0.7, random_state=0)

# The remaining rows are divided between validation and test: a seeded
# sample of half of them is validation, the rest is test.
df_holdout = df.drop(index=df_train.index)
df_val = df_holdout.sample(frac=0.5, random_state=0)
df_test = df_holdout.drop(index=df_val.index)

In [26]:
# Preview the first rows of the test split.
df_test.head()

Unnamed: 0,index,text,label
0,0,does anyone have the scoop on scot erickson ? ...,0
12,12,: it is meaningless to compare one player's pl...,1
21,21,do you have a terminal cold ? karabag is 'turk...,3
26,26,"not provable . it's about as ""provable"" as the...",2
62,62,nhl results for games played 4/05/93 . -------...,1


In [27]:
# Write each split to its own CSV file, without the row index.
split_outputs = (
    (df_train, 'data/train.csv'),
    (df_val, 'data/val.csv'),
    (df_test, 'data/test.csv'),
)
for split_frame, csv_path in split_outputs:
    split_frame.to_csv(csv_path, index=False)

In [32]:
# Write the distinct label ids, one per line, in order of first appearance.
# header=False is passed explicitly: the default for Series.to_csv changed
# from False to True across pandas versions, which would otherwise emit a
# FutureWarning on older versions and add a "0" header line (easily mistaken
# for a label) on newer ones.
pd.Series(df['label'].unique()).to_csv('data/labels.csv', index=False, header=False)

  """Entry point for launching an IPython kernel.
