1. Reads in the BBC topic dataset 
2. Represents its text as vectors using the TFIDF vectorizer
3. Represents the topic labels as numbers
4. Analyzes the number of texts available for each topic and the most frequent words for the 5. two most numerous topics
6. Trains a dummy classifier on the data

### Import the packages

In [1]:
import csv
import nltk
import numpy as np
from nltk.probability import FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn import preprocessing
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
import re
import math
import string

### Initialize global variables

In [2]:
# https://www.geeksforgeeks.org/snowball-stemmer-nlp/
stemmer = SnowballStemmer("english")

In [3]:
dataset = "bbc-text.csv"

In [4]:
stopwords = stopwords.words("english")

### read the csv file

In [5]:
def read_in_csv(csv_file):
    with open(csv_file, 'r', encoding='utf-8') as fp:
        reader = csv.reader(fp, delimiter=',',
                            quotechar='"')
        data_read = [row for row in reader]
    return data_read

### tokenize and stem words

In [6]:
def tokenize_and_stem(sentence):
    tokens = nltk.word_tokenize(sentence)
    filtered_tokens = [t for t in tokens if t not in
                       string.punctuation]
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

### stemmed stopwords

In [7]:
stopwords = [stemmer.stem(word) for word in stopwords]

### represent the input data as dictionary

In [8]:
def get_data(filename):
    data = read_in_csv(filename)
    data_dict = {}
    for row in data[1:]:
        category = row[0]
        text = row[1]
        if (category not in data_dict.keys()):
            data_dict[category] = []
        data_dict[category].append(text)
    return data_dict

In [9]:
data_dict = get_data(dataset)

In [10]:
for topic in data_dict.keys():
    print(topic, "\t", len(data_dict[topic]))

tech 	 401
business 	 510
sport 	 511
entertainment 	 386
politics 	 417


### get stats and use FreqDist to provide the top words

In [11]:
def get_stats(text, num_words=200):
    word_list = nltk.tokenize.word_tokenize(text)
    word_list = [word for word in word_list if word
                 not in stopwords and re.search(
                                    "[A-Za-z]", word)]
    freq_dist = FreqDist(word_list)
    print(freq_dist.most_common(num_words))
    return freq_dist

In [12]:
business_data = data_dict["business"]
business_string = " ".join(business_data)
get_stats(business_string)

[('said', 1680), ('its', 1100), ('us', 813), ('year', 637), ('mr', 600), ('would', 463), ('also', 440), ('market', 425), ('new', 416), ('company', 415), ('growth', 384), ('last', 365), ('firm', 362), ('economy', 359), ('government', 340), ('bank', 335), ('sales', 316), ('could', 311), ('economic', 310), ('oil', 294), ('shares', 265), ('however', 256), ('world', 252), ('may', 251), ('years', 247), ('prices', 246), ('one', 243), ('chief', 236), ('two', 231), ('china', 223), ('business', 218), ('companies', 212), ('analysts', 209), ('uk', 207), ('deal', 206), ('rise', 203), ('expected', 200), ('group', 199), ('financial', 197), ('yukos', 196), ('firms', 193), ('since', 183), ('dollar', 180), ('december', 173), ('country', 173), ('months', 170), ('people', 170), ('stock', 168), ('first', 165), ('president', 165), ('three', 164), ('still', 164), ('many', 163), ('time', 159), ('european', 159), ('rate', 159), ('state', 158), ('trade', 158), ('told', 155), ('investment', 153), ('demand', 151)

FreqDist({'said': 1680, 'its': 1100, 'us': 813, 'year': 637, 'mr': 600, 'would': 463, 'also': 440, 'market': 425, 'new': 416, 'company': 415, ...})

### create TFIDF vectorizer for the text

In [13]:
def create_vectorizer(text_list):
    tfidf_vectorizer = \
    TfidfVectorizer(max_df=0.90, max_features=200000,
                    min_df=0.05, stop_words='english',
                    use_idf=True,
                    tokenizer=tokenize_and_stem,
                    ngram_range=(1,3))
    tfidf_vectorizer.fit_transform(text_list)
    return tfidf_vectorizer

### split the train and test data 

In [14]:
def split_test_train(data, train_percent):
    train_test_border = \
    math.ceil(train_percent*len(data))
    train_data = data[0:train_test_border]
    test_data = data[train_test_border:]
    return (train_data, test_data)

In [15]:
sports_data = data_dict["sport"]

In [16]:
(business_train_data, business_test_data) = \
split_test_train(business_data, 0.8)
(sports_train_data, sports_test_data) = \
split_test_train(sports_data, 0.8)
train_data = business_train_data + sports_train_data
tfidf_vec = create_vectorizer(train_data)



### get labels

In [17]:
def get_labels(names):
    le = preprocessing.LabelEncoder()
    le.fit(names)
    return le

In [18]:
le = get_labels(["business", "sport"])

### create dataset to transform the text data into a numpy array

In [19]:
def create_dataset(vectorizer, data_dict, le):
    business_news = data_dict["business"]
    sports_news = data_dict["sport"]
    (sports_vectors, sports_labels) = create_data_matrix(sports_news, vectorizer, "sport", le)
    (business_vectors, business_labels) = create_data_matrix(business_news, vectorizer, "business", le)
    all_data_matrix = np.vstack((business_vectors, sports_vectors))
    labels = np.concatenate([business_labels, sports_labels])
    return (all_data_matrix, labels)

In [20]:
def create_data_matrix(input_data, vectorizer, label, le):
    vectors = vectorizer.transform(input_data).todense()
    labels = [label]*len(input_data)
    enc_labels = le.transform(labels)
    return (vectors, enc_labels)

### create two data dictionaries 

In [21]:
train_data_dict = {'business':business_train_data,
                   'sport':sports_train_data}
test_data_dict = {'business':business_test_data,
                  'sport':sports_test_data}
(X_train, y_train) = \
create_dataset(tfidf_vec, train_data_dict, le)
(X_test, y_test) = \
create_dataset(tfidf_vec, test_data_dict, le)

### create dummy classifier to establish a baseline

In [22]:
def predict_trivial(X_train, y_train, X_test, y_test, le):
    dummy_clf = DummyClassifier(strategy='uniform',
                                random_state=0)
    dummy_clf.fit(X_train, y_train)
    y_pred = dummy_clf.predict(X_test)
    print(dummy_clf.score(X_test, y_test))
    print(classification_report(y_test, y_pred,
          labels=le.transform(le.classes_),
          target_names=le.classes_))

In [23]:
predict_trivial(X_train, y_train, X_test, y_test, le)

0.44607843137254904
              precision    recall  f1-score   support

    business       0.45      0.44      0.44       102
       sport       0.45      0.45      0.45       102

    accuracy                           0.45       204
   macro avg       0.45      0.45      0.45       204
weighted avg       0.45      0.45      0.45       204

