# Data Analysis
​
Before jumping into the task of classifying documents into classes, we should always look at the underlying data. We will convert the data into a pandas DataFrame to look at the data easily.

The main fields are:
- `document_id`: The id of the document.
- `content`: The raw textual content of the document.
- `labels`: The labels of the document as a list of strings.
- `train?`: Whether the document belongs to the training set. This is determined from the prefix of the document id (e.g. `training/9865`), as returned by the `reuters.fileids()` method.


In [3]:
from nltk.corpus import reuters
import pandas as pd

def dataset_to_df():
    """Load the Reuters corpus into a pandas DataFrame.

    Returns:
        A DataFrame with one row per corpus file id and the columns:
        - 'document_id': the file id returned by ``reuters.fileids()``.
        - 'content': the raw text of the document.
        - 'labels': the categories of the document.
        - 'train?': True when the file id starts with "train".
    """
    file_ids = reuters.fileids()

    # Each column is derived independently from the same list of ids,
    # so row order always follows reuters.fileids().
    columns = {
        'document_id': file_ids,
        'content': [reuters.raw(file_id) for file_id in file_ids],
        'labels': [reuters.categories(file_id) for file_id in file_ids],
        'train?': [file_id.startswith("train") for file_id in file_ids],
    }
    return pd.DataFrame(columns)

def print_document(doc_id):
    """Print the id, labels and raw content of one Reuters document.

    Args:
        doc_id: A corpus file id, passed to ``reuters.categories`` and
            ``reuters.raw``.

    The closing separator is appended directly after the raw content,
    with no newline inserted in between.
    """
    separator = "---------------------"
    doc_labels = reuters.categories(doc_id)
    doc_text = reuters.raw(doc_id)

    print(
        f"Doc ID: {doc_id}\n"
        f"{separator}\n"
        f"LABELS: {doc_labels}\n"
        f"{separator}\n"
        "CONTENT:\n"
        f"{doc_text}{separator}"
    )

In [None]:
# Build the DataFrame once; the cells below reuse `df_data`.
df_data = dataset_to_df()

# What does a document look like? Print one training document in full.
print_document('training/9865')

# Trailing expression: the notebook displays the first 5 rows.
df_data.head(5)

In [None]:
# Basic Stats
# 'train?' holds booleans, so summing it counts the training rows directly;
# no need to compare against True/False and build filtered DataFrames.
n_documents = len(df_data)
n_train = int(df_data['train?'].sum())

print(f"Total documents: {n_documents}")
print(f"Total train documents: {n_train}")
# Every document that is not a training document is a test document.
print(f"Total test documents: {n_documents - n_train}")
# explode() yields NaN for a document with an empty label list; nunique()
# skips NaN, so such a document does not inflate the label count.
print(f"Total labels: {df_data['labels'].explode().nunique()}")


## Categories information

In [None]:
from operator import itemgetter
import pandas as pd

# Category distribution
print("\nCategory frequencies:")
# explode() turns each list of labels into one row per (document, label),
# so value_counts() gives the number of documents carrying each label.
category_counts = df_data['labels'].explode().value_counts()
# One-column DataFrame ('documents') indexed by label.
df_category_distribution = category_counts.to_frame(name='documents')
# Most frequent labels first.
# NOTE(review): value_counts() already sorts descending by default, so this
# sort looks redundant — confirm before removing.
sorted_data = df_category_distribution.sort_values(by='documents', ascending=False)
# plot() is called without arguments, so pandas uses its default chart kind.
sorted_data.plot()