# Automatic Topic Labelling

In [85]:
%matplotlib inline

import os
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


In [92]:
def load_data(bbc_dir='./bbc/'):
    """Read the headline and category of every article under ``bbc_dir``.

    Every sub-directory of ``bbc_dir`` is treated as a news category, and
    every regular file inside it as one article whose first line is the
    headline.

    Args:
        bbc_dir: Root directory of the dataset. A trailing path separator
            is optional.

    Returns:
        A list of ``[headline, category]`` pairs (two strings each), in
        directory-listing order. Files whose first line cannot be decoded
        with the platform default encoding are skipped.

    Side effects:
        Prints the length of the longest headline (0 when nothing was read).
    """
    data = []
    for category in os.listdir(bbc_dir):
        category_dir = os.path.join(bbc_dir, category)
        if not os.path.isdir(category_dir):
            continue
        for file_name in os.listdir(category_dir):
            path = os.path.join(category_dir, file_name)
            # Skip nested directories and other non-file entries.
            if not os.path.isfile(path):
                continue
            with open(path, 'r') as f:
                try:
                    first_line = f.readline()
                except UnicodeDecodeError:
                    # Best effort: ignore articles that cannot be decoded.
                    continue
            # Strip only the line terminator, so a headline without a
            # trailing newline keeps its last character.
            data.append([first_line.rstrip('\n'), category])
    # Computed inline because no ``max_length`` helper is defined in this file.
    length = max((len(headline) for headline, _ in data), default=0)
    print(length)

    return data

In [106]:
def create_features(data):
    """Turn ``[headline, category]`` records into model-ready arrays.

    Args:
        data: Records with two fields each: headline text and category name.

    Returns:
        A ``(data_x, data_y, category_map)`` triple:
        data_x -- one-column array of headlines, right-padded with spaces
            to the length of the longest headline;
        data_y -- one-column array of integer category codes;
        category_map -- dict mapping each integer code to its category name.
    """
    frame = pd.DataFrame.from_records(data, columns=['headline', 'category'])

    # Encode the category names as integer codes.
    frame['category'] = pd.Categorical(frame['category'])
    frame['category_id'] = frame['category'].cat.codes
    category_map = {
        code: name
        for code, name in enumerate(frame['category'].cat.categories)
    }

    # Pad on the right so that every headline has the same width.
    widest = frame['headline'].map(len).max()
    frame['headline'] = frame['headline'].str.ljust(widest, fillchar=' ')

    data_x = frame[['headline']].values
    data_y = frame[['category_id']].values
    return data_x, data_y, category_map

In [105]:
# Load the raw records, build the feature arrays, then show the
# code -> category mapping followed by the inputs and the labels.
data = load_data()
(data_x, data_y, category_map) = create_features(data)
print(category_map, data_x, data_y, sep='\n')

52
52
{0: 'business', 1: 'entertainment', 2: 'politics', 3: 'sport', 4: 'tech'}
[['Musicians to tackle US red tape                     ']
 ["U2's desire to be number one                        "]
 ['Rocker Doherty in on-stage fight                    ']
 ...
 ['Nintendo adds media playing to DS                   ']
 ['Fast moving phone viruses appear                    ']
 ["Hacker threat to Apple's iTunes                     "]]
[[1]
 [1]
 [1]
 ...
 [4]
 [4]
 [4]]
