# Advanced Topic Modeling: Part 2

In [1]:
from pathlib import Path

import dariah
import cophi

## Preprocessing a corpus

In [2]:
# Relative location of the corpus folder: data/british-fiction-corpus.
directory = Path("data") / "british-fiction-corpus"

In [None]:
# Build the corpus object and its metadata table from `directory`.
# NOTE(review): lemma=True together with pos=["NN"] presumably restricts the
# corpus to lemmatized nouns of English text — confirm against the cophi docs.
corpus, metadata = cophi.corpus(
    directory,
    lemma=True,
    pos=["NN"],
    language="en",
    metadata=True,
)

In [None]:
# Preview the first five rows of the metadata table.
metadata.head(5)

### Extending metadata

In [None]:
# One year per row of `metadata`, in row order.
# NOTE(review): the values are matched to documents purely by position, so
# this relies on the row order of `metadata` being stable — confirm that the
# corpus is always loaded in the same order.
years = [
    1868, 1853, 1860, 1799, 1742,
    1749, 1844, 1850, 1876, 1848,
]
metadata["year"] = years

# Preview the first five rows, now including the new "year" column.
metadata.head(5)

In [None]:
# Remove the corpus' hapax features from its document-term matrix.
# NOTE(review): `corpus.hapax` presumably lists words that occur only once
# in the whole corpus — confirm against the cophi docs.
dtm = corpus.drop(
    corpus.dtm,
    corpus.hapax,
)

In [None]:
# Attach the "year" and "title" metadata fields to the document-term matrix,
# matching on the "uuid" column and joining the fields with "_".
# NOTE(review): presumably this relabels each document row as "<year>_<title>"
# — confirm against the cophi docs.
# Missing counts are then replaced by 0 and everything is cast to int.
dtm = (
    corpus.map_metadata(
        data=dtm,
        metadata=metadata,
        uuid="uuid",
        fields=["year", "title"],
        sep="_",
    )
    .fillna(0)
    .astype(int)
)

# Preview the first five rows and first ten columns.
dtm.iloc[:5, :10]

## Training a model

In [None]:
# Configure an LDA model with 10 topics and 1000 iterations.
# NOTE(review): `mallet` is given as a relative path, so it is presumably
# resolved against the current working directory — confirm MALLET 2.0.8 is
# unpacked there before running this cell.
model = dariah.core.LDA(
    num_topics=10,
    num_iterations=1000,
    mallet="mallet-2.0.8/bin/mallet",
)

# Train the model on the document-term matrix.
model.fit(dtm)

In [None]:
# Preview all rows and the first five columns of the model's topics table.
model.topics.iloc[:, :5]

## Visualizing a model

In [None]:
# Wrap the model in dariah's visualization helper.
vis = dariah.core.Vis(model)

In [None]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

# Draw the topic-document plot.
# NOTE(review): presumably a heatmap of topic weights per document — confirm
# against the dariah docs.
vis.topic_document()