# Data collection: BBC Dataset

Obtained from http://mlg.ucd.ie/datasets/bbc.html
- 2225 articles from BBC in five topical areas from 2004-2005
- business, entertainment, politics, sport, tech

In [28]:
# Folder holding the extracted corpus; countWord/get_metric read "<data_dir><topic>".
data_dir = 'bbc-fulltext/bbc/'

# Topic name -> article count for that topic (the counts total 2225).
topics = dict(
    business=510,
    entertainment=386,
    politics=417,
    sport=511,
    tech=401,
)

In [20]:
# Sanity check: the per-topic counts should add up to the 2225 articles of the corpus.
sum(topics[name] for name in topics)

2225

# TF-IDF

Counting words within a directory
(Reference: https://www2.cs.duke.edu/courses/spring14/compsci290/assignments/lab02.html)

In [176]:
import os
import string
from collections import Counter
from math import log

from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [243]:
# NLTK's English stop-word list followed by a few extra words to ignore.
stop = stopwords.words('english') + ['said', 'also', 'u', 'would', 'one']

Tokenization (lemmatization and word count)

In [251]:
def preprocess(text, lemmatizer=None):
    """Convert raw text into a list of lemmatized tokens with stop words removed.

    The text is lowercased, stripped of the characters in
    ``string.punctuation``, tokenized with ``word_tokenize``, filtered against
    the module-level ``stop`` list, lemmatized, and filtered against ``stop``
    once more (presumably because a lemma can itself be a stop word).

    Parameters
    ----------
    text : str
        Raw text to process.
    lemmatizer : object, optional
        Anything exposing ``lemmatize(token)``. When omitted a
        ``WordNetLemmatizer`` is created here, so the function no longer
        depends on a global ``lmtzr`` left behind by some earlier cell.

    Returns
    -------
    list of str
        Tokens in their original order (duplicates kept).
    """
    if lemmatizer is None:
        lemmatizer = WordNetLemmatizer()

    # Set membership is O(1); `stop` itself is a list.
    stop_words = set(stop)

    strip_punct = str.maketrans('', '', string.punctuation)
    tokens = word_tokenize(text.lower().translate(strip_punct))

    lemmas = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return [lemma for lemma in lemmas if lemma not in stop_words]

In [252]:
def countWord(dir_name):
    """Count preprocessed tokens over every file in a directory.

    Parameters
    ----------
    dir_name : str
        Directory whose entries are all opened as text files.

    Returns
    -------
    collections.Counter
        Token -> number of occurrences summed over all files, where tokens
        are whatever ``preprocess`` returns for each file's content.
    """
    count = Counter()
    for fname in os.listdir(dir_name):
        with open(os.path.join(dir_name, fname), 'r') as f:
            # update() adds in place; `count += Counter(...)` would rescan the
            # whole accumulated counter for every file.
            count.update(preprocess(f.read()))
    return count

In [253]:
# One word-count Counter per topic, built from the folder "<data_dir><topic>".
counts = {
    topic: countWord(data_dir + topic)
    for topic in topics
}

In [222]:
def compute_tfidf(counter_list):
    """Score each topic's words by in-topic frequency times rarity elsewhere.

    For every topic and every word counted in it:

    * ``tf``    = count of the word in the topic / total count of the topic
    * ``idf``   = log(total count of all OTHER topics / count of the word in
      the other topics)
    * ``score`` = ``tf * idf``

    Note that this "idf" is derived from word counts in the other topics, not
    from document frequencies. A word that does not occur in any other topic
    has no idf and is left out of that topic's result.

    Parameters
    ----------
    counter_list : dict
        Topic name -> ``Counter`` of word counts (a dict, despite the name).

    Returns
    -------
    dict
        Topic name -> ``{word: score}``.
    """
    all_frequency = Counter()
    for counter in counter_list.values():
        all_frequency += counter

    return_list = {}
    for topic, counter in counter_list.items():
        # Counter subtraction keeps only words whose remaining count is
        # positive, i.e. words that also occur in some other topic.
        except_frequency = all_frequency - counter

        tf_N = sum(counter.values(), 0.0)
        idf_N = sum(except_frequency.values(), 0.0)

        tfidf = {}
        for word, occurrences in counter.items():
            if word not in except_frequency:
                # The word only appears in this topic: no idf, so no score.
                continue
            tf = occurrences / tf_N
            idf = log(idf_N / except_frequency[word])
            tfidf[word] = tf * idf

        return_list[topic] = tfidf

    return return_list

In [268]:
# Topic -> {word: score}, computed from the per-topic word counts in `counts`.
tfidf = compute_tfidf(counts)

In [247]:
import operator

# For each topic, print its 20 highest-scoring words, best first.
by_score = operator.itemgetter(1)
for topic in topics:
    print(topic, ":")
    sorted_list = sorted(tfidf[topic].items(), key=by_score, reverse=True)
    toptwenty = sorted_list[:20]
    print(' '.join(word for word, _score in toptwenty), '\n')

business :
year company firm market bank growth economy price oil share mr sale economic rate new analyst profit government last 2004 

entertainment :
film award best year music show star actor oscar band album song singer chart new director comedy nomination number first 

politics :
mr labour party blair election government minister people lord brown mp say plan howard tax prime secretary chancellor conservative public 

sport :
game england win player cup match champion first coach side rugby team world injury ireland club back time year last 

tech :
people technology mobile user phone game software computer service digital net site pc new use online network could device gadget 



These TF-IDF scores seem to represent each topic very well.

Now, I will sum up the TF-IDF scores from training data to create a new metric

In [256]:
def sum_tfidf(text, tfidf):
    """Add up, per topic, the scores of the tokens found in ``text``.

    ``text`` is run through ``preprocess``; every resulting token that has an
    entry in a topic's score table contributes that entry to the topic's
    total (repeated tokens contribute each time they occur).

    Parameters
    ----------
    text : str
        Raw text to score.
    tfidf : dict
        Topic name -> ``{word: score}``.

    Returns
    -------
    dict
        Topic name -> summed score; ``0`` for a topic with no matching token.
    """
    tokens = preprocess(text)
    totals = {}
    for label, scores in tfidf.items():
        running = 0
        # Explicit accumulation keeps the additions in token order.
        for token in tokens:
            if token in scores:
                running += scores[token]
        totals[label] = running
    return totals

In [266]:
def get_metric(topic, tfidf):
    """Share of the summed scores that each topic receives over one folder.

    Every file in the folder of ``topic`` under the module-level ``data_dir``
    is scored with ``sum_tfidf``; the per-topic totals are accumulated over
    all files and divided by their grand total.

    Parameters
    ----------
    topic : str
        Name of the sub-folder of ``data_dir`` to read.
    tfidf : dict
        Topic name -> ``{word: score}``.

    Returns
    -------
    dict
        Topic name -> fraction of the grand total, rounded to 2 decimals.
        When the grand total is zero (empty folder, or no token has a score)
        every fraction is ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    # join() works whether or not data_dir ends with a path separator.
    topic_dir = os.path.join(data_dir, topic)

    result = {key: 0 for key in tfidf}
    for fname in os.listdir(topic_dir):
        with open(os.path.join(topic_dir, fname), 'r') as f:
            metrics = sum_tfidf(f.read(), tfidf)
        for key, value in metrics.items():
            result[key] += value

    denom = sum(result.values())
    if not denom:
        # Nothing to normalise by.
        return {key: 0.0 for key in result}
    return {key: round(value / denom, 2) for key, value in result.items()}

In [267]:
# For each topic's folder, show how the summed scores split across all topics.
for topic in topics:
    print(topic)
    shares = get_metric(topic, tfidf)
    print(shares)

business
{'business': 0.35, 'entertainment': 0.14, 'politics': 0.2, 'sport': 0.13, 'tech': 0.18}
entertainment
{'business': 0.14, 'entertainment': 0.38, 'politics': 0.15, 'sport': 0.15, 'tech': 0.17}
politics
{'business': 0.18, 'entertainment': 0.13, 'politics': 0.41, 'sport': 0.12, 'tech': 0.16}
sport
{'business': 0.14, 'entertainment': 0.17, 'politics': 0.15, 'sport': 0.38, 'tech': 0.16}
tech
{'business': 0.17, 'entertainment': 0.16, 'politics': 0.17, 'sport': 0.14, 'tech': 0.36}


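The results look good: the metric for the correct class is always at least 0.35, while the metrics for the incorrect classes never exceed 0.2. Note that these figures are computed on the same articles that were used to build the TF-IDF scores, so they are not a held-out evaluation.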

# CNN

# LSTM

# Evaluation