In [6]:
import os

import lucem_illud
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn
import sklearn

%matplotlib inline

# Constants, Utility Functions, and Data Importing

In [4]:
# Constants
LOW_NOISE = 0.2
HIGH_NOISE = 0.45
TEST_SPLIT = 0.2

In [7]:
# Utility Functions
def plotter(df: pd.DataFrame, category_key: str = "category"):
    fig, ax = plt.subplots(figsize=(10, 10))
    pallet = seaborn.color_palette(
        palette="rainbow", n_colors=len(set(df[category_key]))
    )
    for i, cat in enumerate(set(df[category_key])):
        a = np.stack(df[df[category_key] == cat]["vect"])
        ax.scatter(a[:, 0], a[:, 1], c=pallet[i], label=cat)
    ax.legend(loc="center right", title="Categories")
    ax.axis("off")
    plt.show()
    plt.close()


def pca_split_stats(
    test_df: pd.DataFrame,
    train_df: pd.DataFrame,
    red_pca_key: str,
    category_key: str = "category",
) -> None:
    logistic = sklearn.linear_model.LogisticRegression()
    train_df[red_pca_key] = train_df["pca"].apply(lambda x: x[:400])
    test_df[red_pca_key] = test_df["pca"].apply(lambda x: x[:400])

    logistic.fit(np.stack(train_df[red_pca_key], axis=0), train_df[category_key])

    print("Training:")
    print(
        logistic.score(np.stack(train_df[red_pca_key], axis=0), train_df[category_key])
    )
    print("Testing:")
    print(logistic.score(np.stack(test_df[red_pca_key], axis=0), test_df[category_key]))

In [10]:
# Data is sourced from a personal project of mine you can find here:
# https://ucpd-incident-reporter-7cfdc3369124.herokuapp.com/
ucpd_feather_path = "data/fully_tokenized_ucpd_incidents.feather"
if os.path.isfile(ucpd_feather_path):
    ucpd_reports = pd.read_feather(ucpd_feather_path)
else:
    # WARNING: This step takes about 120 minutes, so don't run it unless you need to.
    ucpd_reports = pd.read_csv("data/incident_dump.csv")
    ucpd_reports["tokenized_text"] = ucpd_reports["comments"].apply(
        lucem_illud.word_tokenize
    )
    ucpd_reports["normalized_text"] = ucpd_reports["tokenized_sentences"].apply(
        lucem_illud.normalizeTokens
    )
    ucpd_reports.to_feather(ucpd_feather_path)
ucpd_reports.head(5)



KeyError: 'tokenized_sentences'

## <font color="red">*Exercise 1*</font>

<font color="red">Perform a content annotation survey of some kind in which at 
least 3 people evaluate and code each piece of content, using Amazon Mechanical 
Turk as described in the [MTurk slides on Canvas](https://canvas.uchicago.edu/courses/54694/files/folder/unfiled?preview=10675152), or by hand with friends.  
With the resulting data, calculate, visualize and discuss inter-coder agreement or 
co-variation with appropriate metrics. What does this means for the reliability of 
human assessments regarding content in your domain?

In [21]:
# Figure out a way to chunk up the data

## <font color="red">*Exercise 2*</font>

<font color="red">Go back through all the cells above and generate 10 distinct 
artificial datasets and classify them with all the available methods. Add a cell 
immediately below and describe which classifier(s) worked best with which 
artificially constructed data source and why. Then go through all the empirical 
datasets (i.e., Newsgroups, Senate Small, Senate Large, Email Spam) and classify 
them with all available methods. Add a second cell immediately below and describe 
which classifier(s) worked best with which data set and why.

<font color="red">***Stretch*** (but also required) Wander through the SKLearn 
documentation available [here](http://scikit-learn.org/stable/), particularly 
perusing the classifiers. In cells following, identify and implement a new classifier 
that we have not yet used (e.g., AdaBoost, CART) on one artificial dataset and one real 
dataset (used above). Then, in the next cell describe the classifier, detail how it 
compares with the approaches above, and why it performed better or worse than others.

### Generated Datasets

In [None]:
# Dataset 1
ln_random_df_train, ln_random_df_test = sklearn.model_selection.train_test_split(
    lucem_illud.random(LOW_NOISE), test_size=TEST_SPLIT
)
# Dataset 2
ln_and_split_df_train, ln_and_split_df_test = sklearn.model_selection.train_test_split(
    lucem_illud.andSplit(LOW_NOISE), test_size=TEST_SPLIT
)
# Dataset 3
ln_xor_split_df_train, ln_xor_split_df_test = sklearn.model_selection.train_test_split(
    lucem_illud.xorSplit(LOW_NOISE), test_size=TEST_SPLIT
)
# Dataset 4
(
    ln_target_split_df_train,
    ln_target_split_df_test,
) = sklearn.model_selection.train_test_split(
    lucem_illud.targetSplit(LOW_NOISE), test_size=TEST_SPLIT
)
# Dataset 5
(
    ln_multi_blobs_df_train,
    ln_multi_blobs_df_test,
) = sklearn.model_selection.train_test_split(
    lucem_illud.multiBlobs(LOW_NOISE), test_size=TEST_SPLIT
)

In [None]:
# Dataset 6
hn_random_df_train, hn_random_df_test = sklearn.model_selection.train_test_split(
    lucem_illud.random(HIGH_NOISE), test_size=TEST_SPLIT
)
# Dataset 7
hn_and_split_df_train, hn_and_split_df_test = sklearn.model_selection.train_test_split(
    lucem_illud.andSplit(HIGH_NOISE), test_size=TEST_SPLIT
)
# Dataset 8
hn_xor_split_df_train, hn_xor_split_df_test = sklearn.model_selection.train_test_split(
    lucem_illud.xorSplit(HIGH_NOISE), test_size=TEST_SPLIT
)
# Dataset 9
(
    hn_target_split_df_train,
    hn_target_split_df_test,
) = sklearn.model_selection.train_test_split(
    lucem_illud.targetSplit(HIGH_NOISE), test_size=TEST_SPLIT
)
# Dataset 10
(
    hn_multi_blobs_df_train,
    hn_multi_blobs_df_test,
) = sklearn.model_selection.train_test_split(
    lucem_illud.multiBlobs(HIGH_NOISE), test_size=TEST_SPLIT
)

### Empirical Datasets

## <font color="red">*Exercise 3*</font>

<font color="red">In the cells immediately following, perform logistic regression 
classification using training, testing and un-coded (i.e., data you didn't code by 
hand but want to use your model on) data from texts and hand-classifications 
associated with your final project (e.g., these could be crowd-sourced codes 
gathered through Amazon Mechanical Turk in Exercise 1). Visualize the confusion 
matrix for training and testing sets. Calculate precision, recall, the F-measure, 
and AUC, then perform an ROC visualization. How do these classifiers perform? 
Extrapolate code from these models to all un-coded data.

## <font color="red">*Exercise 4*</font>

<font color="red">In the cells immediately following, perform decision tree and 
random forest classification (binary, multinomial or continuous) using training, 
testing and extrapolation (un-coded) data from texts and hand-classifications 
associated with your final project. As with ***Exercise 2***, these could be 
crowdsourced codes gathered through Amazon Mechanical Turk last week. Visualize 
the classification of data points. Calculate relevant metrics (e.g., precision, 
recall, the F-measure, and AUC). Now build an ensemble classifier by bagging trees 
into a random forest. Visualize the result. How do these classifiers perform? 
What does ensemble learning do?

## <font color="red">*Exercise 6*</font>

<font color="red">In the cells immediately following, perform a neural network 
classification and calculate relevant metrics (e.g., precision, recall, the 
F-measure, and AUC). How does this classify relevant to *k*-nearest neighbor, 
logistic and decision-tree approaches?

## <font color="red">*Exercise 7*</font>

<font color="red">In the cells immediately following, use the pipeline functions 
or the word or sentence vector functions (e.g., similarity) to explore the social 
game underlying the production and meaning of texts associated with your final project. 
How does BERT help you gain insight regarding your research question that is similar 
and different from prior methods?