# Coding Tutorial Week 8

In this tutorial, we look at ways to combine `spaCy`, regular expressions (`re`), `pandas`, `matplotlib` and `seaborn` to analyse the text column of the BES data.

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
import numpy as np
from collections import Counter
from sklearn.cluster import AgglomerativeClustering

# Load the BES data subset saved in the Week 2 folder; the path is relative,
# so it resolves against the directory the notebook is run from.
bes_df = pd.read_feather("../Week2/data/bes_data_subset_week2.feather")

We now import spacy and our language model.

In [None]:
import spacy

In [None]:
# Load the `en_core_web_md` spaCy model used throughout this notebook.
nlp = spacy.load("en_core_web_md")
# If the line above does not work for you, look up where a model is installed:
#   1. open an IPython terminal
#   2. >>> import spacy
#      >>> nlp = spacy.load("en_core_web_sm")
#      >>> nlp._path
#      PosixPath('<COPY THIS>')
#   3. presumably the copied path is then passed to spacy.load(...) in place of
#      the model name -- TODO confirm, the original note stops at step 2.
# NOTE(review): the workaround loads "en_core_web_sm" while the cell itself loads
# "en_core_web_md" -- confirm which model the fallback is meant to locate.

To get an idea of what spacy can do, let's use it on one of the short responses.

In [None]:
# Run the language model over one answer (row label 1216 of item a01)
# and display the resulting document.
doc = nlp(bes_df.at[1216, 'a01'])
doc

In [None]:
# `Doc.print_tree()` no longer exists in current spaCy releases (2.1 and later);
# `Doc.to_json()` is its replacement and returns the parse as a plain dict
# (text, sentences, and per-token tag / pos / dep / head information).
doc.to_json()

In [None]:
# Coarse part-of-speech tag of every token in the sample document.
list(map(lambda word: word.pos_, doc))

In [None]:
def view_spacy_data(doc):
    """Tabulate the per-token annotations of a processed document.

    Parameters
    ----------
    doc : iterable of tokens
        Each token must expose the attributes listed in ``fields`` below.

    Returns
    -------
    pandas.DataFrame
        One row per token, one column per attribute, in the order
        text, lemma_, pos_, tag_, dep_, shape_, is_alpha, is_stop.
    """
    # The column names are exactly the token attribute names, so a single
    # tuple drives both the attribute lookup and the table header.
    fields = ('text', 'lemma_', 'pos_', 'tag_',
              'dep_', 'shape_', 'is_alpha', 'is_stop')
    rows = [[getattr(tok, field) for field in fields] for tok in doc]
    return pd.DataFrame(rows, columns=list(fields))

In [None]:
# Show the token-level annotation table for the sample document.
view_spacy_data(doc)

Let's do a bit of pre-processing before we apply the language model to the rest of the answers.

- NA removal
- lowercase everything
- remove consecutive spaces

In [None]:
# Inspect the 20 most frequent raw answers. '-1' looks like the code used for
# a missing answer (NA), so all rows carrying it are dropped in the next step.
bes_df['a01'].value_counts().head(20)

In [None]:
# Drop the rows whose answer is the '-1' placeholder and renumber the rest from 0.
df = bes_df[bes_df['a01'].ne('-1')].reset_index(drop=True)

In [None]:
# Which Python types occur in the answer column?
df['a01'].map(type).unique()

In [None]:
# Lower-case every answer and collapse each run of two or more whitespace
# characters into a single space.
# `regex=True` is spelled out on purpose: since pandas 2.0 `str.replace`
# defaults to literal matching and refuses a compiled pattern unless regex
# matching is requested explicitly. The keyword is also accepted by older pandas.
df.loc[:, 'a01'] = df['a01'].str.lower().str.replace(re.compile(r"\s{2,}"), " ", regex=True)

In [None]:
# Run the language model over every cleaned answer and keep the parsed
# document next to its text.
df['nlp'] = df['a01'].apply(nlp)

In [None]:
def get_nouns(doc):
    """Return the lemmas of all non-stop-word nouns in a document.

    A token counts as a noun when its ``pos_`` is 'PROPN' or 'NOUN'.
    Lemmas are returned in document order; duplicates are kept.
    """
    wanted_pos = ('PROPN', 'NOUN')
    lemmas = []
    for tok in doc:
        if tok.pos_ not in wanted_pos:
            continue
        if tok.is_stop:
            continue
        lemmas.append(tok.lemma_)
    return lemmas

In [None]:
# Store, for every answer, the list of noun lemmas extracted by get_nouns.
df['nouns'] = df['nlp'].apply(get_nouns)

In [None]:
# Count how often each noun lemma occurs across all answers, most frequent first.
# The lists are iterated directly rather than concatenated with `.sum()`:
# summing lists copies the growing result for every row (quadratic time), and
# on an empty column `.sum()` returns 0, which Counter cannot consume.
noun_frequencies = pd.Series(
    Counter(noun for nouns in df['nouns'] for noun in nouns)
).sort_values(ascending=False)

In [None]:
# Bar chart of the 50 most frequent nouns.
top_nouns = noun_frequencies.head(50)

f, ax = plt.subplots(1, 1, figsize=(15, 9))

ax.set_title("Top 50 Most Common Nouns in Item a01")
# x / y are passed by keyword: recent seaborn releases no longer accept them
# positionally. The counts go in as a plain array so nothing is re-aligned on
# the Series' string index.
sns.barplot(x=top_nouns.index, y=top_nouns.to_numpy(), ax=ax)
ax.set_xlabel("Noun (lemma)")
ax.set_ylabel("Frequency")
# Rotate the noun labels so that 50 of them fit along the axis.
ax.xaxis.set_ticklabels(ax.xaxis.get_ticklabels(), rotation=-90)
None  # suppress the repr of the last expression in the cell output

# Hierarchical Cosine Distance Clustering

This model will take more time than we have in class to evaluate, but here's a bit of code that can show you how to conduct cosine clustering on your document vectors.

In [None]:
# One document vector per answer, taken from the parsed spaCy document.
df['vector'] = df['nlp'].apply(lambda x: x.vector)
# Drop answers whose vector is all zeros.
# The index is reset afterwards (as was done after the NA filter) so the result
# is a fresh frame numbered 0..n-1: later cells add columns to it and look rows
# up by label, which fails or warns on a filtered slice with gaps in its index.
has_signal = df['vector'].apply(lambda x: np.any(x))
df = df.loc[has_signal, :].reset_index(drop=True)

In [None]:
# Stack the per-answer vectors row-wise into one 2-D array (one row per answer).
doc_vectors = np.vstack(df['vector'].tolist())

In [None]:
# Shape of a single document vector. `.iloc[0]` takes the first remaining row
# by position; a label lookup with `[0]` raises KeyError whenever the row
# labelled 0 was among the dropped zero vectors.
df['vector'].iloc[0].shape

In [None]:
# Check the Python type stored in each cell of the vector column.
df['vector'].apply(type)

In [None]:
# Single-linkage agglomerative clustering on cosine distance, cut at 30 clusters.
# scikit-learn renamed the `affinity` argument to `metric` (deprecated in 1.2,
# removed in 1.4), so try the current name first and fall back for old installs.
try:
    cosine_cluster = AgglomerativeClustering(n_clusters=30, metric="cosine", linkage="single")
except TypeError:
    # Older scikit-learn: `metric` is not a known keyword yet.
    cosine_cluster = AgglomerativeClustering(n_clusters=30, affinity="cosine", linkage="single")

In [None]:
# Fit the clustering model on the stacked document vectors.
# This is the slow step of the notebook.
cosine_cluster.fit(doc_vectors)

In [None]:
# Attach each answer's cluster label as a categorical column.
# The categorical is built and assigned in one step with `df[...] =`:
# on pandas >= 2.0, `df.loc[:, col] = ...` writes into the existing column in
# place and can leave it with its original integer dtype, silently discarding
# the conversion.
df['cos_labs'] = pd.Categorical(cosine_cluster.labels_)

In [None]:
# Number of answers assigned to each cluster label.
df['cos_labs'].value_counts()

In [None]:
# Print up to three randomly drawn answers from each of the clusters labelled 0-9.
for label in range(10):
    in_cluster = df['cos_labs'] == label
    # Never ask for more rows than the cluster holds (sample() would raise).
    n_samp = min(3, int(in_cluster.sum()))
    sample = df.loc[in_cluster, 'a01'].sample(n_samp)
    print("###### CLUSTER "+str(label)+" ######")
    # Iterate the values directly: `Series.iteritems()` was removed in pandas 2.0
    # and only the text (not the index) was printed anyway.
    for answer in sample:
        print(answer)
    print("\n")

Obviously there's a lot of work to be done here.