### Import libraries

In [1]:
import os
import pandas as pd
import re
import string

### Custom import functions

In [2]:
# scan root folder for file paths and the folder names (categories) that contain them
def get_filepaths_and_categories(root_folder):
    """Collect every file below ``root_folder`` together with its category.

    The category of a file is the name of the directory that directly
    contains it.

    Parameters
    ----------
    root_folder : str
        Directory that is scanned recursively.

    Returns
    -------
    list of dict
        One ``{'path': ..., 'category': ...}`` entry per file found, ordered
        by directory name and then by file name.
    """
    filepaths_and_categories = []
    for root, dirs, files in os.walk(root_folder):
        # os.walk yields entries in filesystem order; sorting `dirs` in place
        # (and the files below) makes the result, and therefore the row labels
        # of any DataFrame built from it, reproducible across machines.
        dirs.sort()
        # basename understands the separators of the current platform, whereas
        # searching for a backslash only worked on Windows and returned the
        # whole path as the category everywhere else.
        directory = os.path.basename(os.path.normpath(root))
        for file_name in sorted(files):
            filepaths_and_categories.append({'path': os.path.join(root, file_name),
                                             'category': directory})
    return filepaths_and_categories

In [3]:
# import data from files and remove unnecessary headers
def import_data(filepaths_and_categories):
    """Read every listed file and join its non-header lines into one string.

    Parameters
    ----------
    filepaths_and_categories : iterable of dict
        Entries with a ``'path'`` key (file to read) and a ``'category'`` key.

    Returns
    -------
    dict
        ``{'corpus': [...], 'category': [...]}`` with one text and one
        category per input entry, in input order.
    """
    def keep_line(line):
        """Return True for lines that should stay in the text."""
        # blank lines carry no content
        if line == '':
            return False
        # lines that start with a capital letter and contain ': ' somewhere
        # (e.g. 'From: ...', 'Subject: ...') are treated as headers
        if re.match('(^[A-Z]).*(: )', line):
            return False
        # attribution lines introducing a quoted article
        if re.match('^(In article <)', line):
            return False
        return True

    texts = []
    categories = []

    for item in filepaths_and_categories:
        # the context manager closes the handle even if reading fails;
        # previously every opened file was left unclosed
        with open(item['path'], mode='r', encoding="utf8", errors='ignore') as f:
            lines = f.read().splitlines()

        texts.append(' '.join(filter(keep_line, lines)))
        categories.append(item['category'])

    return {'corpus': texts, 'category': categories}

### Import data

In [4]:
# Build the training DataFrame: list the files under the train folder,
# read and header-filter them, then wrap the resulting
# {'corpus': [...], 'category': [...]} dict in a pandas DataFrame.
root_folder_train = 'data/20news-bydate-train'
train_files_and_categories = get_filepaths_and_categories(root_folder_train)
train_data = import_data(train_files_and_categories)

train_df = pd.DataFrame(train_data)

In [5]:
# Build the test DataFrame with the same two steps used for the training data:
# list the files under the test folder, read and header-filter them, then
# wrap the resulting dict in a pandas DataFrame.
root_folder_test = 'data/20news-bydate-test'
test_files_and_categories = get_filepaths_and_categories(root_folder_test)
test_data = import_data(test_files_and_categories)

test_df = pd.DataFrame(test_data)

### Review data

In [6]:
# train data: preview the first rows (text in 'corpus', label in 'category')
train_df.head()

Unnamed: 0,corpus,category
0,|> CarolinaFan@uiuc (cka52397@uxa.cso.uiuc.edu...,Automotive
1,">Road and Track (2/88) BMW325is 0-60 7.5s,...",Automotive
2,THANKS TO ALL OF YOU WHO RESPONDED TO MY POSTI...,Automotive
3,The subject says it all. My 1984 Chev S10 Pick...,Automotive
4,wolfson@regatta.sps.mot.com (Stephen Wolfson) ...,Automotive


In [7]:
# number of training documents per category
train_df.groupby(['category']).agg(['count'])

Unnamed: 0_level_0,corpus
Unnamed: 0_level_1,count
category,Unnamed: 1_level_2
Automotive,1192
Computers,2936
Politics,1575
Religion,1456
Science,2373
Sports,1197


In [8]:
# non-missing entries per column of the training data
train_df.count()

corpus      10729
category    10729
dtype: int64

In [9]:
# test data: preview the first rows (text in 'corpus', label in 'category')
test_df.head()

Unnamed: 0,corpus,category
0,Corp. The opinions expressed are...,Automotive
1,tobias@convex.com (Allen Tobias) writes: >Bett...,Automotive
2,It is actually simple in principle. Porous ads...,Automotive
3,"I don't know about where you are, but here in ...",Automotive
4,> >In article <66758@mimsy.umd.edu> davew@cs.u...,Automotive


In [10]:
# number of test documents per category
test_df.groupby(['category']).agg(['count'])

Unnamed: 0_level_0,corpus
Unnamed: 0_level_1,count
category,Unnamed: 1_level_2
Automotive,794
Computers,1955
Politics,1050
Religion,968
Science,1579
Sports,796


In [11]:
# non-missing entries per column of the test data
test_df.count()

corpus      7142
category    7142
dtype: int64

In [12]:
# review one sample message from the train dataset before any cleaning
# (row label 8180 is fixed, not drawn at random; the same row is re-inspected
#  after each preprocessing step further down)
train_df.at[8180, 'corpus']

'Article 10886 of alt.radio.scanner: >Newsgroups: alt.radio.scanner Hello. I have a Realistic PRO-2024 scanner for sale.Here is a small desc ription: 60 programible chanels fully detailed backlighted digital display headphone jack antenna jack removable telescoping antenna auto search coverage: 30-50mHz 118-174mHz 380-512mHz It originally cost $200, but I will sell for $150. Thank You. --         / /     Buchanan in `96!       / /      Fear the goverment that fears your guns.   \\ \\/ /       Without the 2nd amendment, we cannot guarantee ou    \\/ /        r freedoms.           aj008@cleveland.freenet.edu --         / /     Buchanan in `96!       / /      Fear the goverment that fears your guns.   \\ \\/ /       Without the 2nd amendment, we cannot guarantee ou    \\/ /        r freedoms.           aj008@cleveland.freenet.edu'

### Data cleaning

In [13]:
# custom function for text cleaning
def clean_text(text):
    """Normalise a raw message into lowercase, single-space-separated words.

    Steps, in order:

    1. remove the literal markers ``'Subject: '`` and ``'Re: '``;
    2. lowercase everything;
    3. delete every character listed in ``string.punctuation``;
    4. delete tab characters;
    5. delete every word that contains at least one digit;
    6. collapse whitespace runs to single spaces and trim both ends.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The cleaned text.
    """
    # Raw strings keep regex escapes such as \w and \d out of Python's own
    # string-escape processing (non-raw '\w' triggers an invalid-escape warning).
    text = re.sub(r'(Subject: |Re: )', '', text)
    text = text.lower()
    # NOTE(review): punctuation and tabs are deleted rather than replaced by a
    # space, so neighbours get fused ('sale.Here' -> 'salehere') - confirm
    # this is intended before changing it.
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'(\t)', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return ' '.join(text.split())

In [14]:
# run clean_text over the 'corpus' column of the train and test sets
train_df['corpus'] = train_df['corpus'].apply(clean_text)
test_df['corpus'] = test_df['corpus'].apply(clean_text)

In [15]:
# review the same sample message (row label 8180) after clean_text was applied
train_df.at[8180, 'corpus']

'article of altradioscanner newsgroups altradioscanner hello i have a realistic scanner for salehere is a small desc ription programible chanels fully detailed backlighted digital display headphone jack antenna jack removable telescoping antenna auto search coverage it originally cost but i will sell for thank you buchanan in fear the goverment that fears your guns without the amendment we cannot guarantee ou r freedoms buchanan in fear the goverment that fears your guns without the amendment we cannot guarantee ou r freedoms'

### Remove stopwords

In [16]:
# load the NLTK corpus reader and the English stopwords dictionary
# (the reader is imported under an alias so the name `stopwords` only ever
#  refers to the word list that remove_stopwords reads)
from nltk.corpus import stopwords as nltk_stopwords
stopwords = nltk_stopwords.words('english')

In [17]:
# custom function to remove stopwords
def remove_stopwords(corpus, stop_words=None):
    """Drop stopwords from a whitespace-separated text.

    Parameters
    ----------
    corpus : str
        Text whose words are separated by whitespace.
    stop_words : iterable of str, optional
        Words to remove. When omitted, the notebook-level ``stopwords``
        list is used, which matches the previous behaviour.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    # Build the lookup set once per call; it used to be rebuilt for every
    # single word inside the comprehension.
    excluded = set(stopwords if stop_words is None else stop_words)
    return ' '.join(word for word in corpus.split() if word not in excluded)

In [18]:
# run remove_stopwords over the 'corpus' column of the train and test sets
train_df['corpus'] = train_df['corpus'].apply(remove_stopwords)
test_df['corpus'] = test_df['corpus'].apply(remove_stopwords)

In [19]:
# review the same sample message (row label 8180) after removing stopwords
train_df.at[8180, 'corpus']

'article altradioscanner newsgroups altradioscanner hello realistic scanner salehere small desc ription programible chanels fully detailed backlighted digital display headphone jack antenna jack removable telescoping antenna auto search coverage originally cost sell thank buchanan fear goverment fears guns without amendment cannot guarantee ou r freedoms buchanan fear goverment fears guns without amendment cannot guarantee ou r freedoms'

### Lemmatization

In [20]:
# one shared WordNetLemmatizer instance; the lemmatize() function defined
# in the next cell calls it for every word
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [21]:
# custom lemmatize function
def lemmatize(corpus):
    """Lemmatize every whitespace-separated word of ``corpus``.

    Each word is passed on its own to the notebook-level ``lemmatizer``
    object, and the results are joined back together with single spaces.

    Parameters
    ----------
    corpus : str
        Text whose words are separated by whitespace.

    Returns
    -------
    str
        The lemmatized words joined by single spaces.
    """
    return ' '.join(lemmatizer.lemmatize(token) for token in corpus.split())

In [22]:
# run lemmatize over the 'corpus' column of the train and test sets
train_df['corpus'] = train_df['corpus'].apply(lemmatize)
test_df['corpus'] = test_df['corpus'].apply(lemmatize)

In [23]:
# review the same sample message (row label 8180) after lemmatization
train_df.at[8180, 'corpus']

'article altradioscanner newsgroups altradioscanner hello realistic scanner salehere small desc ription programible chanels fully detailed backlighted digital display headphone jack antenna jack removable telescoping antenna auto search coverage originally cost sell thank buchanan fear goverment fear gun without amendment cannot guarantee ou r freedom buchanan fear goverment fear gun without amendment cannot guarantee ou r freedom'

### Save train_df and test_df variables to pickles

In [24]:
# to_pickle does not create missing folders, so make sure the output
# directory exists before writing (no-op when it is already there)
os.makedirs('pickles', exist_ok=True)
train_df.to_pickle('pickles/train_df.pkl')
test_df.to_pickle('pickles/test_df.pkl')