# Loading and processing survey data

- Load and clean up the data
- Visualise using embeddings
- Categorise using GPT-4

In [None]:
import pandas as pd
import openai

import sentence_transformers
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

import umap
import hdbscan
import altair as alt


## Load the data

In [None]:
# Path to the Excel workbook that holds the survey responses
FILEPATH = "data/KK Copy of test data for DS on options issue map.xlsx"
# Worksheets to load. process_table() treats a sheet whose name contains
# "issues" as an issues sheet and every other sheet as an interventions sheet;
# the first space-separated word of the name becomes the policy area.
SHEETS = [
    "g&p issues",
    "g&p interventions",
    "health issues",
    "health interventions",
    "inequality issues",
    "inequality interventions",
]

# Question columns that process_table() melts into long format.
# NOTE(review): "Q_issue_" is capitalised while the other two prefixes are
# lower-case — confirm this matches the workbook headers.
issue_cols = [f"Q_issue_{i}" for i in range(1, 11)]
# NOTE(review): this range yields 11 columns (1-11) whereas the other two
# yield 10 (1-10) — confirm that is intentional.
other_cols = [f"q_other_{i}" for i in range(1, 12)]
intervention_cols = [f"q_intervention_{i}" for i in range(1, 11)]

In [None]:
def process_table(data_df: pd.DataFrame, sheet_name: str) -> pd.DataFrame:
    """Reshape one survey sheet into a long table of non-empty answers.

    Sheets whose name contains 'issues' contribute the issue and "other"
    question columns; every other sheet contributes the intervention columns.
    The result has one row per answer with the columns `question`, `value`,
    `data_type` (the sheet name) and `policy_area` (the first space-separated
    word of the sheet name). Missing answers and the placeholder '-' are
    removed.
    """
    value_columns = (
        (issue_cols + other_cols) if 'issues' in sheet_name else intervention_cols
    )

    # Wide -> long: one row per (question column, answer) pair
    long_df = data_df.melt(value_vars=value_columns)
    long_df = long_df.rename(columns={"variable": "question"})

    # Tag every row with the sheet it came from and the derived policy area
    long_df["data_type"] = sheet_name
    long_df["policy_area"] = long_df["data_type"].str.split(" ").str[0]

    # Keep only real answers: drop blanks first, then the '-' placeholder
    answered = long_df.dropna(subset=['value'])
    return answered.query("value != '-'")

def load_and_process_survey() -> pd.DataFrame:
    """Read every sheet listed in SHEETS and stack the processed tables.

    Each sheet is read from FILEPATH, passed through process_table(), and the
    per-sheet results are concatenated into a single frame with a fresh
    0..n-1 index.
    """
    processed_sheets = [
        process_table(pd.read_excel(FILEPATH, name), name)
        for name in SHEETS
    ]
    return pd.concat(processed_sheets, ignore_index=True)

In [None]:
# Build the combined long-format survey table from every sheet in SHEETS
survey_df = load_and_process_survey()

In [None]:
# Display the processed table for a quick visual check
survey_df

## Embed and visualise

In [None]:
# Embed every answer text with the sentence-transformer model loaded at the top
# NOTE(review): presumably requires every `value` to be a string — confirm that
# no numeric cells survive the Excel import.
vectors = model.encode(survey_df.value.tolist(), show_progress_bar=True)

In [None]:
# Use UMAP to reduce the sentence embeddings to 25 dimensions; this reduced
# space is what HDBSCAN clusters on.
# NOTE(review): no random_state is passed, so the embedding (and therefore the
# clusters) presumably varies between runs — consider fixing a seed if the
# exported CSVs need to be reproducible.
umap_embeddings = umap.UMAP(
    n_neighbors=15,
    n_components=25,
).fit_transform(vectors)

In [None]:
# Use HDBSCAN to cluster the 25-d UMAP embeddings.
# NOTE(review): the stated intent was to assign all points to a cluster, but
# only `cluster.labels_` is used later on. HDBSCAN presumably labels noise
# points as -1, and nothing reassigns them even though prediction_data=True is
# requested — confirm whether a soft/approximate assignment step is missing.
cluster = hdbscan.HDBSCAN(
    min_cluster_size=10,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
).fit(umap_embeddings)

In [None]:
# Use UMAP a second time, directly on the original embeddings, to get a 2-d
# projection; its two columns become the x/y plot coordinates.
# This is fitted independently of the 25-d reduction used for clustering.
umap_embeddings_2d = umap.UMAP(
    n_neighbors=15,
    n_components=2,
).fit_transform(vectors)

In [None]:
# Attach each answer's cluster label and 2-d coordinates, then draw an
# interactive Altair scatter plot coloured by cluster
survey_viz_df = survey_df.assign(
    cluster=cluster.labels_,
    x=umap_embeddings_2d[:, 0],
    y=umap_embeddings_2d[:, 1],
)

# One circle per answer; hovering shows the answer text and its cluster label
fig = alt.Chart(survey_viz_df).mark_circle().encode(
    x='x',
    y='y',
    color='cluster:N',
    tooltip=['value', 'cluster'],
).interactive()

fig

In [None]:
# Preparation for TF-IDF: build one "document" per cluster by joining all of
# the cluster's answer texts with single spaces
cluster_df = (
    survey_viz_df
    .groupby('cluster')['value']
    .agg(' '.join)
    .reset_index()
)

In [None]:
# Rank the vocabulary of each cluster's joined text by TF-IDF weight
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict

# One row per cluster document; English stop words are ignored
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(cluster_df.value.tolist())

# Vocabulary terms, looked up below by tfidf_matrix column position
feature_names = vectorizer.get_feature_names_out()

# How many of the highest-weighted words to keep for each cluster
n_top_words = 5

# Cluster label -> its highest-weighted words, strongest first
top_words_per_cluster = defaultdict(list)
for row_position, row_scores in enumerate(tfidf_matrix):
    # Densify this cluster's sparse row so it can be sorted
    dense_scores = row_scores.toarray()[0]

    # Ascending argsort, reversed, then truncated -> best n_top_words columns
    best_first = dense_scores.argsort()[::-1][:n_top_words]

    # Rows of tfidf_matrix are in the same order as the rows of cluster_df
    label = cluster_df.iloc[row_position]['cluster']
    top_words_per_cluster[label] = [feature_names[j] for j in best_first]

# Human-readable name for each cluster, in the same order as the dict keys
cluster_names = [
    f"Cluster {label}: {', '.join(words)}"
    for label, words in top_words_per_cluster.items()
]



In [None]:
# Add a readable cluster name to every answer by pairing each cluster label
# with its generated name
survey_clustered_df = survey_viz_df.assign(
    cluster_name=survey_viz_df['cluster'].map(
        dict(zip(top_words_per_cluster, cluster_names))
    )
)

In [None]:
# Save the per-answer table (text, cluster label, 2-d coordinates, cluster name) to CSV
survey_clustered_df.to_csv("data/survey_clustered.csv", index=False)

In [None]:
# Count the non-missing answers for each (cluster name, policy area) pair and
# save the summary to CSV
(
    survey_clustered_df
    .groupby(['cluster_name', 'policy_area'])['value']
    .count()
    .reset_index()
    .to_csv("data/survey_clustered_counts.csv", index=False)
)