In [None]:
import pandas as pd

# Load data

In [None]:
# Location of the survey CSV; edit this if the file lives elsewhere.
data_fpath = 'bikepgh_av_survey.csv' # fill the path to this file here

# The first CSV column is used as the row index of the DataFrame.
data = pd.read_csv(data_fpath, index_col=0)

# Quick sanity check: (n_rows, n_columns), followed by the column labels.
for summary in (data.shape, data.columns):
    print(summary)

# Choose one of the text fields

In [None]:
# Free-text columns of the survey that are candidates for topic modeling.
text_colnames = [
    'interaction_details',
    'positive_av_interaction',
    'negative_av_interaction',
    'other_av_regulations',
    'elaborate_bikepgh_position',
    'other_comments',
]

# Report how many non-missing responses each candidate column contains.
# Note: the loop variable `colname` stays defined after the loop finishes.
for colname in text_colnames:
    print(f'{colname}: {data[colname].count()} entries')

# Tokenize (split text into words)
This may seem trivial, but you'll want to detach punctuation from words, since "person" and "person," aren't very different. And what about contractions such as "I'm"? Will you want to lowercase everything or is there some distinction between "polish" and "Polish" you'd want to preserve?

You'll also want to think about "stopwords": function words such as "the", "and", "or", and "that". Counts for these words are often distracting to machine learning models, so they're usually removed unless there may be important or meaningful variation in stopword usage.

In [None]:
from gensim.utils import tokenize
from gensim.parsing.preprocessing import remove_stopwords

# Column to model. Change this to any entry of `text_colnames`:
# 'interaction_details', 'other_av_regulations', 'other_comments', etc.
# A concrete default is provided so the notebook runs top to bottom.
colname = 'interaction_details'

# Keep only rows whose value in that column is a string; anything that is
# not a `str` (e.g. a missing value) is dropped.
has_text = data[colname].map(lambda value: isinstance(value, str))
filtered_data = data[has_text].copy()
print(f"Filtered to {len(filtered_data)} entries")

# Get documents (each response will be a document)
docs = filtered_data[colname].tolist()

# Lowercase, remove stopwords, then tokenize each document.
# Feel free to remove one of these steps and see what happens.
tokenized_docs = [list(tokenize(remove_stopwords(doc.lower()))) for doc in docs]

print(len(tokenized_docs))
tokenized_docs[:3]

## How many words per entry? How many total words for each text column?
Now's a good time to use our tokenized documents to calculate some basic stats.

In [None]:
# Token count summed over every tokenized response of the selected column.
total_words = sum(map(len, tokenized_docs))
# Mean number of tokens per response.
avg_words = total_words / len(tokenized_docs)

print('{}:'.format(colname))
print('\ttotal words: {}'.format(total_words))
print('\tavg words: {}'.format(avg_words))
print()

# Extract features (words to numbers)
One of the simplest ways to get documents into numeric format for machine learning is to simply count each unique word and treat each document as a collection of these counts. For example, "the dog barked loudly at the hat" would become {the: 2, dog: 1, barked: 1, loudly: 1, at: 1, hat: 1}. Each unique word in the vocabulary is usually given an ID. Because order information is lost, this is referred to as the "bag-of-words" model of documents.

In [None]:
from gensim.corpora.dictionary import Dictionary

# Build the vocabulary from the tokenized documents.
dictionary = Dictionary(tokenized_docs)
vocab_size = len(dictionary.token2id)
print("Found {} unique words".format(vocab_size))

# Bag-of-words encoding: one list of (word_id, count) tuples per document.
corpus = list(map(dictionary.doc2bow, tokenized_docs))
corpus[1]  # peek at the second document's (word_id, count) pairs

# Run LDA
Now let's let LDA find topics. Here you'll want to vary the number of topics and compare results in the interpretation later. Start with 5 or 10 and go up to as much as you feel comfortable trying to interpret.

In [None]:
from gensim.models.ldamodel import LdaModel
# Number of topics to fit; vary this (e.g. 5, 10, 20) and compare the
# interpretations below.
num_topics = 10

# LDA training is randomized. Fixing `random_state` makes a fresh
# "Restart & Run All" reproduce the same topics, so interpretations written
# about "topic 3" stay valid between runs.
lda = LdaModel(corpus, id2word=dictionary, num_topics=num_topics, random_state=42)

# Interpretation
This is one of the tougher parts. You'll examine the words and documents given the highest probability for each topic and see if they make any sense (they might not). If they don't, go back and change the number of topics, change preprocessing (tokenization, etc), or throw up your hands and tell me how terrible topic modeling is :)

## Top words/topic

In [None]:
# `show_topics()` returns at most 10 topics by default, and when the model has
# more than that it returns a subset whose list position does not match the
# topic id. A negative `num_topics` asks for every topic, in topic-id order,
# so `word_topics[i]` always describes topic i.
word_topics = lda.show_topics(num_topics=-1)
word_topics # this will print in the format prob*"word"

## Top documents/topic

In [None]:
def get_top_docs(lda, docs, n_docs=5, bow_corpus=None):
    """Return the documents with the highest proportion of each topic.

    Parameters
    ----------
    lda : fitted topic model
        Must provide `num_topics`, `get_document_topics(bow)` and
        `print_topic(topic_id)` (as gensim's `LdaModel` does).
    docs : sequence
        Original documents, aligned position-by-position with `bow_corpus`.
    n_docs : int, default 5
        Number of top-ranked documents to list per topic.
    bow_corpus : sequence of bag-of-words documents, optional
        The inputs passed to `lda.get_document_topics`. When omitted, the
        notebook-level variable `corpus` is used.

    Returns
    -------
    pandas.DataFrame
        One row per (topic, top document) with columns `topic`,
        `topic_top_words`, `document_index`, `document`, `topic_proportion`.
    """
    if bow_corpus is None:
        bow_corpus = corpus  # fall back to the notebook-level bag-of-words corpus

    # Convert each document's [(topic_id, proportion), ...] list to a dict once,
    # instead of rebuilding it for every comparison in every topic's sort.
    doc_topic_probs = [dict(lda.get_document_topics(bow)) for bow in bow_corpus]

    lines = []
    for topic_ind in range(lda.num_topics):
        # Ask the model directly for this topic's description so the label is
        # correct for any number of topics (no reliance on a separately built,
        # possibly truncated, list of topics).
        top_words = lda.print_topic(topic_ind)

        # Topics missing from a document's distribution count as proportion 0.0.
        # `sorted` is stable, so ties keep their original document order.
        ranked_doc_inds = sorted(
            range(len(doc_topic_probs)),
            key=lambda doc_ind: doc_topic_probs[doc_ind].get(topic_ind, 0.0),
            reverse=True,
        )
        for doc_ind in ranked_doc_inds[:n_docs]:
            topic_proportion = doc_topic_probs[doc_ind].get(topic_ind, 0.0)
            lines.append([topic_ind, top_words, doc_ind, docs[doc_ind], topic_proportion])

    return pd.DataFrame(lines, columns=['topic', 'topic_top_words', 'document_index', 'document', 'topic_proportion'])

In [None]:
# Show full document text instead of truncating long cells.
# `None` means "no limit"; the old `-1` spelling is deprecated since pandas 1.0
# and rejected by newer releases.
pd.set_option('display.max_colwidth', None)
get_top_docs(lda, docs, n_docs=5)

## See how distribution of other fields varies across topics
Here, you can "assign" documents to their highest-ranking topic and see how other fields vary across topics

In [None]:
# Topic distribution of every document: a list of (topic_id, proportion) pairs
# per document, in the same order as the rows of `filtered_data`.
document_topics = [lda.get_document_topics(bow) for bow in corpus]
# Index label -> row position; kept in case other cells rely on it.
id2index = dict(zip(filtered_data.index, range(len(filtered_data))))

# Assign top topics to docs.
# Assignment is positional (row i gets the best topic of document i). Looking
# rows up through an index-label dict gives wrong results when index labels
# repeat, because the dict keeps only the last position of each label.
# `max` returns the first pair with the highest proportion, matching a stable
# descending sort followed by taking the first element.
filtered_data['top_topic'] = [
    max(topic_dist, key=lambda pair: pair[1])[0] for topic_dist in document_topics
]

In [None]:
%matplotlib inline

import matplotlib.pyplot as plt

# Editable defaults so the cell runs as-is; change them to explore.
topic = 0  # topic ID: an integer in range(lda.num_topics)
# Column to compare with: try 'feel_safe_avs' or others (print all with data.columns)
compared_column = 'feel_safe_avs'

# Responses whose highest-ranking topic is `topic`.
topic_data = filtered_data[filtered_data['top_topic'] == topic].copy()

# Bar chart of how often each value of `compared_column` occurs in that topic.
ax = topic_data[compared_column].value_counts().sort_index().plot.bar()
ax.set_title(compared_column);  # trailing ';' hides the Text(...) repr