## Using vectorization to generate features


In [None]:
import pandas as pd 
import spacy
import umap
import numpy as np 
from io import BytesIO
from PIL import Image
import base64
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool, ColumnDataSource, CategoricalColorMapper
from bokeh.palettes import Spectral10, Category10
from pathlib import Path
import sys
sys.path.append("..")
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
%load_ext autoreload
%autoreload 2

from ml_editor.data_processing import format_raw_df, get_split_by_author, get_normalized_series, add_text_features_to_df
from ml_editor.data_visualization import plot_embeddings


# Raw string: the Windows path contains "\P", "\d" and "\w", which are invalid
# escape sequences in a regular literal and raise a SyntaxWarning on recent
# Python versions. The resulting path value is unchanged.
df = pd.read_csv(Path(r"D:\Project 1\data\writers.csv"))
df = format_raw_df(df.copy())

# Only question rows are split into train/test sets.
train_author, test_author = get_split_by_author(df[df["is_question"]])

questions = train_author[train_author["is_question"]]
raw_text = questions["body_text"]
# Label is True when the question has an accepted answer id.
sent_labels = questions["AcceptedAnswerId"].notna()

# Only document vectors are used below, so the parser, tagger and NER
# components are disabled when loading the model.
nlp = spacy.load("en_core_web_lg", disable=["parser", "tagger", "ner"])

# Reuse raw_text rather than filtering train_author a second time; it is the
# same "body_text" column of the same question rows.
spacy_emb = raw_text.apply(lambda x: nlp(x).vector)
embeddings = np.vstack(spacy_emb)

# Project the document vectors with UMAP's default settings.
umap_embedder = umap.UMAP()
umap_emb = umap_embedder.fit_transform(embeddings)

## Interactive plot

In [None]:
# Configure Bokeh so that show() renders plots inline in the notebook output.
output_notebook()

def get_interactive_umap_embeddings_plot(umap_vectors, labels, text, legends, tooltip_label=None):
    """Build an interactive Bokeh scatter plot of 2-D projected embeddings.

    Every row becomes one circle, coloured by its label, with a hover tooltip
    showing the row's text and tooltip label.

    Args:
        umap_vectors: two-column array-like of projected coordinates, one row
            per document (loaded into a DataFrame with columns ``x``, ``y``).
        labels: per-row labels; their ``str()`` form is matched against the
            colour mapper factors ``'True'`` and ``'False'``.
        text: per-row text displayed in the hover tooltip.
        legends: per-row truthy/falsy values, shown in the legend as
            "Answered" / "Unanswered".
        tooltip_label: optional per-row values displayed as ``got_answer`` in
            the tooltip. Falls back to ``labels`` when omitted or empty.

    Returns:
        The configured Bokeh figure; the caller is responsible for showing it.
    """
    # Check for None / emptiness explicitly: ``not tooltip_label`` raises
    # ValueError (ambiguous truth value) for a pandas Series or a multi-element
    # NumPy array, which are the natural types for per-row labels.
    if tooltip_label is None or len(tooltip_label) == 0:
        print("Using standard label")
        tooltip_label = labels
    w2v_df = pd.DataFrame(umap_vectors, columns=('x', 'y'))
    print(len(w2v_df))
    # Labels are stringified so they can be matched by the categorical mapper.
    w2v_df['label'] = [str(x) for x in labels]
    w2v_df['tooltip_label'] = [str(x) for x in tooltip_label]
    w2v_df['text'] = list(text)
    w2v_df['legends'] = ["Answered" if el else "Unanswered" for el in list(legends)]
    datasource = ColumnDataSource(w2v_df)

    color_mapping = CategoricalColorMapper(factors=['True','False'], palette=['#1f77b4', '#ff7f0e'])

    TOOLTIPS = [
        ("text", "@text"),
        ('got_answer', '@tooltip_label')
    ]
    hover = HoverTool(tooltips=TOOLTIPS)
    hover.attachment ='right'

    # NOTE(review): plot_width / plot_height and the ``legend=`` keyword below
    # belong to older Bokeh releases; confirm against the installed version
    # before upgrading Bokeh.
    plot_figure = figure(
        title='UMAP projection of questions',
        plot_width=900,
        plot_height=600,
        tools=('pan, wheel_zoom, reset', 'box_zoom', 'undo')
    )
    plot_figure.add_tools(hover)
    
    plot_figure.circle(
        'x',
        'y',
        source=datasource,
        color=dict(field='label', transform=color_mapping),
        legend='legends',
        line_alpha=0,
        fill_alpha=0.4,
        size=5
    )
    return plot_figure

# Interactive view of the projected raw embeddings; the accepted-answer labels
# drive both the point colours and the legend entries.
plot_figure = get_interactive_umap_embeddings_plot(
    umap_vectors=umap_emb,
    labels=sent_labels,
    text=raw_text,
    legends=sent_labels,
)
show(plot_figure)

In [1]:
# Replace missing question bodies with "" so the substring search below never
# has to deal with NaN. The result is assigned back to the column: calling
# fillna(inplace=True) on a selected column is chained assignment and does not
# update df when pandas copy-on-write is active.
df["body_text_question"] = df["body_text_question"].fillna("")

def show_question_features_containing(text):
    """Return selected columns of the ``df`` rows whose question body matches ``text``.

    ``text`` is handed to ``Series.str.contains`` with its default settings.
    """
    displayed_columns = [
        "body_text",
        "CommentCount",
        "body_text_question",
        "Score_question",
        "AcceptedAnswerId_question",
    ]
    is_match = df["body_text_question"].str.contains(text)
    return df.loc[is_match, displayed_columns]

# Display the rows whose question body contains this phrase; it is a good
# example of two similar questions.
show_question_features_containing("I'm an amateur writer")

NameError: name 'df' is not defined


## Potential features
#### Looking through the embeddings and associated rows of data above, it seemed like a few features were predictive of the target class of a question. The ones I identified were:

#### question length: very short questions tended to not get answers
#### presence of question mark: the absence of a question mark seemed to lower the chance of an answer
#### vocabulary associated with a clear question (action verbs, etc...): unanswered questions seemed to be missing those
#### Did you identify any others? If so, feel free to add them as well.

#### We start by creating a feature for the presence of question marks and action verbs

In [None]:
# True when the body contains any of the literal substrings "can", "What" or
# "should" (case-sensitive, plain substring match).
df["action_verb"] = (
    df["body_text"].str.contains("can", regex=False)
    | df["body_text"].str.contains("What", regex=False)
    | df["body_text"].str.contains("should", regex=False)
)
# True when the body contains a literal question mark.
df["question_mark"] = df["body_text"].str.contains("?", regex=False)
# Length of the body in characters.
df["text_len"] = df["body_text"].str.len()

In [None]:
# How many rows have / do not have the action_verb flag set.
df["action_verb"].value_counts()

In [None]:
# How many rows do / do not contain a question mark.
df["question_mark"].value_counts()

In [None]:
# Store a normalized version of text_len. The normalization is implemented in
# ml_editor.data_processing.get_normalized_series, which is not shown here.
df["norm_text_len"]= get_normalized_series(df, "text_len")

In [None]:
# Split again so train_author / test_author carry the feature columns added above.
# NOTE(review): `embeddings`, `sent_labels` and `raw_text` from the first split
# are reused below, which assumes this split selects the same rows in the same
# order — confirm that get_split_by_author is deterministic.
train_author, test_author = get_split_by_author(df[df["is_question"]])

In [None]:
# Append the three hand-crafted feature columns to each embedding row.
vectorized_features = np.append(
    np.array(embeddings),
    train_author[train_author["is_question"]][
        ["action_verb", "question_mark", "norm_text_len"]
    ],
    axis=1,
)
vectorized_features.shape

In [None]:
# Fit a fresh UMAP model (default settings) on embeddings plus hand-crafted features.
umap_embedder = umap.UMAP()
umap_features = umap_embedder.fit_transform(vectorized_features)

In [None]:

# Static plot of the projected features against the accepted-answer labels
# (plot_embeddings comes from ml_editor.data_visualization).
plot_embeddings(umap_features, sent_labels)

In [None]:

# Interactive view of the projection built from embeddings plus hand-crafted
# features; the accepted-answer labels drive colours and legend entries.
plot_figure = get_interactive_umap_embeddings_plot(
    umap_vectors=umap_features,
    labels=sent_labels,
    text=raw_text,
    legends=sent_labels,
)
show(plot_figure)

In [None]:
# 1 when the body mentions "punctuate", "capitalize" or "abbreviate"
# (literal, case-sensitive substring match), otherwise 0.
df["language_question"] = (
    df["body_text"].str.contains("punctuate", regex=False)
    | df["body_text"].str.contains("capitalize", regex=False)
    | df["body_text"].str.contains("abbreviate", regex=False)
).astype(int)

In [None]:
# Show body and title of the rows whose body contains this exact phrase.
df[df["body_text"].str.contains("Specifically, how to describe", regex=False)][["body_text", "Title"]]

In [None]:
# Join title and body with a single space; missing values are treated as "".
df["full_text"] = df["Title"].str.cat(df["body_text"], sep=' ', na_rep='')

In [None]:
# Keep only question rows (copied, so the helper does not operate on a slice of
# the original frame) and let add_text_features_to_df add its feature columns.
# NOTE(review): presumably this is where "action_verb_full" and
# "question_mark_full", used further down, are created — confirm in
# ml_editor.data_processing.
df = add_text_features_to_df(df.loc[df["is_question"]].copy())

In [None]:
# Split the enriched question frame and derive the training labels:
# True when the question has an accepted answer id.
train_author, test_author = get_split_by_author(df[df["is_question"]])
train_labels  = train_author["AcceptedAnswerId"].notna()

# One spaCy document vector per question, computed on title + body.
# NOTE(review): this assigns a new column on the frame returned by
# get_split_by_author; if that is a slice of df, pandas may flag it as
# chained assignment — confirm the helper returns a copy.
train_author["vectors"] = train_author["full_text"].apply(lambda x: nlp(x).vector)

In [None]:
# Stack the per-question vectors into a 2-D array and append the four
# hand-crafted feature columns to each row.
vectorized_features = np.append(
    np.vstack(train_author["vectors"]),
    train_author[
        ["action_verb_full", "question_mark_full", "norm_text_len", "language_question"]
    ],
    axis=1,
)

In [None]:

# Fit a fresh UMAP model (default settings) on the full-text feature matrix.
umap_embedder = umap.UMAP()
umap_features = umap_embedder.fit_transform(vectorized_features)

In [None]:
# Static plot of the new projection against the labels of the current split.
plot_embeddings(umap_features, train_labels)

In [None]:
# umap_features was computed from train_author["vectors"] of the latest split,
# so take the labels and hover text from that same frame. Reusing sent_labels /
# raw_text from the first split would mislabel points (or fail on a length
# mismatch) if the two splits ever differ.
plot_figure = get_interactive_umap_embeddings_plot(
    umap_features,
    train_labels,
    train_author["full_text"],
    legends=train_labels,
)
show(plot_figure)