In [22]:
import math
from functools import reduce
from itertools import permutations

import lucem_illud
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn
import sklearn

%matplotlib inline

# Constants, Utility Functions, and Data Importing

In [18]:
# Constants

In [19]:
# Utility Functions


def pairwise_metric_average(metric, array):
    """
    Average a pairwise metric over every ordered pair of distinct columns.

    ``metric`` is called as ``metric(array[:, i], array[:, j])`` for each
    ordered pair ``(i, j)`` with ``i != j``, so every unordered pair is
    scored twice (once in each direction). Scores that are NaN are left out
    of the average.

    Parameters
    ----------
    metric : callable
        Function of two columns that returns a real number (or NaN).
    array : 2-D numpy array
        Annotation matrix; presumably one column per annotator and one row
        per annotated item -- confirm against the caller.

    Returns
    -------
    float
        Mean of the non-NaN scores, or NaN when no score is usable (fewer
        than two columns, or every pairwise score is NaN).
    """
    n_columns = array.shape[1]
    scores = [
        metric(array[:, first], array[:, second])
        for first, second in permutations(range(n_columns), 2)
    ]
    real_scores = [score for score in scores if not math.isnan(score)]
    if not real_scores:
        # Nothing to average: report "undefined" instead of failing inside
        # the reduction with an unrelated-looking TypeError.
        return math.nan
    return sum(real_scores) / len(real_scores)


def random(num_per_category=500):
    """
    Build a DataFrame of random 2-D points with alternating 0/1 labels.

    Produces ``2 * num_per_category`` rows. Each ``vect`` entry is
    ``np.random.rand(2) * 2 - 1`` and each ``category`` entry is the row
    position modulo 2, so the labels carry no information about the points.
    """
    total_points = 2 * num_per_category
    points = []
    labels = []
    for index in range(total_points):
        points.append(np.random.rand(2) * 2 - 1)
        labels.append(index % 2)

    return pd.DataFrame({"vect": points, "category": labels})


def and_split(noise=0, num_per_category=500):
    """
    Build a two-category DataFrame whose categories differ along x.

    Produces ``2 * num_per_category`` rows with alternating labels 0, 1.
    For every point, y is ``random_sample() * 2 - 1``. The x coordinate
    depends on the label and on ``noise``:

    * ``noise >= 0``: ``u1 - label - (u2 - label) * noise``
    * otherwise:      ``(1 - noise * u1) - label``

    where ``u1`` and ``u2`` are successive ``np.random.random_sample()``
    draws. Returns a DataFrame with columns ``vect`` and ``category``.
    """

    def _draw(label):
        # y is always sampled before x so the random stream is consumed in
        # a fixed order.
        vertical = np.random.random_sample() * 2 - 1
        if noise >= 0:
            base = np.random.random_sample()
            jitter = np.random.random_sample()
            horizontal = base - label - (jitter - label) * noise
        else:
            horizontal = (1 - noise * np.random.random_sample()) - label
        return np.array([horizontal, vertical])

    labels = [index % 2 for index in range(2 * num_per_category)]
    points = [_draw(label) for label in labels]

    return pd.DataFrame({"vect": points, "category": labels})


def xor_split(noise=0, num_per_category=500):
    """
    Build a two-category DataFrame laid out in an XOR (checkerboard) pattern.

    Produces ``2 * num_per_category`` rows with alternating labels 0, 1.
    Each coordinate is ``u1 - u2 * noise * 2`` or ``u1 - 1 - u2 * noise * 2``
    (``u1``, ``u2`` being successive ``np.random.random_sample()`` draws).
    A ``randint(0, 2)`` coin flip decides which: label 1 applies the same
    form to both coordinates, label 0 applies one form to each.

    Returns a DataFrame with columns ``vect`` and ``category``.
    """

    def _coordinate(shifted_down):
        # One coordinate: a unit sample, optionally shifted by -1, minus noise.
        value = np.random.random_sample()
        if shifted_down:
            value = value - 1
        return value - np.random.random_sample() * noise * 2

    def _draw(label):
        first_branch = np.random.randint(0, 2) < 1
        # x is unshifted exactly when the coin picks the first branch, for
        # both labels; y matches x for label 1 and is the opposite otherwise.
        x_shifted = not first_branch
        if label == 1:
            y_shifted = x_shifted
        else:
            y_shifted = first_branch
        # y is sampled before x to keep the random draw order fixed.
        vertical = _coordinate(y_shifted)
        horizontal = _coordinate(x_shifted)
        return np.array([horizontal, vertical])

    labels = [index % 2 for index in range(2 * num_per_category)]
    points = [_draw(label) for label in labels]

    return pd.DataFrame({"vect": points, "category": labels})


def target_split(noise=0, num_per_category=500, inner_rad=0.3):
    """
    Build a two-category DataFrame shaped like a target (disc inside a ring).

    Produces ``2 * num_per_category`` rows with alternating labels 0, 1.
    Each point is given in polar form with a random angle in ``[0, 2*pi)``
    and a radius that depends on the label (``u1``, ``u2`` are successive
    ``np.random.random_sample()`` draws):

    * label 0: ``u1 * inner_rad + (1 - inner_rad) * u2 * noise``
    * label 1: ``u1 * (1 - inner_rad) + inner_rad - inner_rad * u2 * noise``

    Returns a DataFrame with columns ``vect`` (as ``[x, y]``) and
    ``category``.
    """

    def _draw(label):
        # Both labels consume two samples for the radius, then one for the
        # angle, in that order.
        spread = np.random.random_sample()
        jitter = np.random.random_sample()
        if label == 0:
            radius = spread * inner_rad + (1 - inner_rad) * jitter * noise
        else:
            radius = spread * (1 - inner_rad) + inner_rad - inner_rad * jitter * noise
        angle = 2 * np.pi * np.random.random_sample()
        return np.array([radius * np.cos(angle), radius * np.sin(angle)])

    labels = [index % 2 for index in range(2 * num_per_category)]
    points = [_draw(label) for label in labels]

    return pd.DataFrame({"vect": points, "category": labels})


def multi_blobs(noise=0, num_per_category=500, centers=5):
    """
    Build a multi-category DataFrame of blob clusters via ``make_blobs``.

    Parameters
    ----------
    noise : number
        Widens the clusters: ``cluster_std`` is ``0.8 * (noise * 2 + 1)``.
    num_per_category : int
        Multiplied by the number of centers to get the total sample count.
    centers : int or sequence
        Either the number of centers (Python or NumPy integer) or a sequence
        of explicit centers; passed to ``make_blobs`` unchanged.

    Returns
    -------
    pd.DataFrame
        Column ``vect`` holds one feature row per sample and ``category``
        holds the label array returned by ``make_blobs``.
    """
    # Import the submodule explicitly: a bare `import sklearn` is not
    # guaranteed to expose `sklearn.datasets` unless something else has
    # already imported it.
    from sklearn.datasets import make_blobs

    # NumPy integers (e.g. from np.arange) count as a number of centers too;
    # they have no len().
    if isinstance(centers, (int, np.integer)):
        n_centers = int(centers)
    else:
        n_centers = len(centers)

    points, labels = make_blobs(
        n_samples=num_per_category * n_centers,
        centers=centers,
        cluster_std=0.8 * (noise * 2 + 1),
    )
    return pd.DataFrame({"vect": list(points), "category": labels})


def plotter(df: pd.DataFrame, category_key: str = "category"):
    """
    Scatter-plot the 2-D points in ``df["vect"]``, one colour per category.

    Parameters
    ----------
    df : pd.DataFrame
        Needs a ``vect`` column whose entries can be stacked into an
        ``(n, 2)``-indexable array, plus the column named by ``category_key``.
    category_key : str
        Column holding the category label of each row.

    The figure is shown and then closed; nothing is returned.
    """
    # First-appearance order is stable across runs, unlike iterating a set
    # (string hashes vary between kernels), so each category keeps its colour.
    categories = list(df[category_key].unique())
    palette = seaborn.color_palette(palette="rainbow", n_colors=len(categories))

    fig, ax = plt.subplots(figsize=(10, 10))
    for colour, cat in zip(palette, categories):
        points = np.stack(df[df[category_key] == cat]["vect"])
        # `color=` (not `c=`): a bare RGB tuple passed as `c` can be read as
        # per-point values when the category has exactly 3 points.
        ax.scatter(points[:, 0], points[:, 1], color=colour, label=cat)
    ax.legend(loc="center right", title="Categories")
    ax.axis("off")
    plt.show()
    plt.close(fig)


def pca_split_stats(
    test_df: pd.DataFrame,
    train_df: pd.DataFrame,
    red_pca_key: str,
    category_key: str = "category",
    pca_key: str = "pca",
    n_components: int = 400,
) -> None:
    """
    Fit a logistic regression on truncated PCA vectors and print its scores.

    Each vector in ``pca_key`` is cut to its first ``n_components`` entries
    and stored in a new ``red_pca_key`` column. A default
    ``LogisticRegression`` is fit on the training rows, and the value of its
    ``score`` method is printed for the training and testing frames.

    Parameters
    ----------
    test_df, train_df : pd.DataFrame
        Frames holding the ``pca_key`` and ``category_key`` columns.
        NOTE: both frames are modified in place (``red_pca_key`` is added or
        overwritten).
    red_pca_key : str
        Name of the column that receives the truncated vectors.
    category_key : str
        Column holding the class labels.
    pca_key : str
        Column holding the full PCA vectors (default ``"pca"``).
    n_components : int
        How many leading components to keep (default 400).
    """
    # Import the submodule explicitly: a bare `import sklearn` is not
    # guaranteed to expose `sklearn.linear_model` unless something else has
    # already imported it.
    from sklearn.linear_model import LogisticRegression

    train_df[red_pca_key] = train_df[pca_key].apply(lambda x: x[:n_components])
    test_df[red_pca_key] = test_df[pca_key].apply(lambda x: x[:n_components])

    # Stack each feature matrix once and reuse it for fitting and scoring.
    train_features = np.stack(train_df[red_pca_key], axis=0)
    test_features = np.stack(test_df[red_pca_key], axis=0)

    logistic = LogisticRegression()
    logistic.fit(train_features, train_df[category_key])

    print("Training:")
    print(logistic.score(train_features, train_df[category_key]))
    print("Testing:")
    print(logistic.score(test_features, test_df[category_key]))

In [20]:
# The incident data comes from a personal project of mine, which you can
# find here: https://ucpd-incident-reporter-7cfdc3369124.herokuapp.com/
# The CSV path is relative to the notebook's working directory.
ucpd_reports = pd.read_csv(filepath_or_buffer="data/incident_dump.csv")

# Preview the first five rows as the cell's output.
ucpd_reports.head(n=5)

Unnamed: 0,comments,disposition,incident,location,occurred,predicted_incident,reported,reported_date,ucpd_id,validated_address,validated_location
0,A person was transported to Comer Hospital by ...,Closed,Mental Health Transport,6300 S. University Ave. (S. Woodlawn Ave. Char...,2/1/24 10:10 AM,,2024-02-01T10:10:00-06:00,2024-02-01,24-00114,"6300 S UNIVERSITY AVE, CHICAGO, IL, 60637","41.78045407997166,-87.59732203570559"
1,Catalytic converter taken from a 2015 Toyota p...,Open,Theft from Motor Vehicle,1210 E. 57th St. (Public Way),1/31/24 to 2/1/24 6:00 PM to 7:00 AM,,2024-02-01T10:18:00-06:00,2024-02-01,24-00115,"1210 E 57TH ST, CHICAGO, IL, 60637","41.79150658678615,-87.59602168542061"
2,Debit and credit cards taken from wallet in un...,Open,Theft,900 E. 57th St. (Knapp Center),2/1/24 9:50 AM to 4:30 PM,,2024-02-01T17:54:00-06:00,2024-02-01,24-00116,"900 E 57TH ST, CHICAGO, IL, 60637","41.791423455510476,-87.60366291896175"
3,Boyfriend battered girlfriend in off-campus pr...,Open,Domestic Battery,6040 S. Harper Ave. (Apt. Building),2/1/24 2:45 PM,,2024-02-01T15:45:00-06:00,2024-02-01,24-00117,"6040 S HARPER AVE, CHICAGO, IL, 60637","41.78472618578524,-87.58821677767634"
4,A known suspect entered the off-campus store a...,Referred,Information / Theft,1346 E. 53rd St. (Target),1/31/24 12:15 PM,,2024-01-31T14:16:00-06:00,2024-01-31,2024-004118,"1346 E 53RD ST, CHICAGO, IL, 60615","41.79955044222366,-87.593062823983"


In [23]:
# Tokenize each report's comment text, then normalize those tokens; both
# results are stored as new columns on ucpd_reports.
comment_tokens = ucpd_reports["comments"].apply(lucem_illud.word_tokenize)
ucpd_reports["tokenized_text"] = comment_tokens
ucpd_reports["normalized_text"] = comment_tokens.apply(lucem_illud.normalizeTokens)



## <font color="red">*Exercise 1*</font>

<font color="red">Perform a content annotation survey of some kind in which at 
least 3 people evaluate and code each piece of content, using Amazon Mechanical 
Turk as described in the [MTurk slides on Canvas](https://canvas.uchicago.edu/courses/54694/files/folder/unfiled?preview=10675152), or by hand with friends.  
With the resulting data, calculate, visualize and discuss inter-coder agreement or 
co-variation with appropriate metrics. What does this mean for the reliability of 
human assessments regarding content in your domain?</font>

In [21]:
# Figure out a way to chunk up the data

## <font color="red">*Exercise 2*</font>

<font color="red">Go back through all the cells above and generate 10 distinct 
artificial datasets and classify them with all the available methods. Add a cell 
immediately below and describe which classifier(s) worked best with which 
artificially constructed data source and why. Then go through all the empirical 
datasets (i.e., Newsgroups, Senate Small, Senate Large, Email Spam) and classify 
them with all available methods. Add a second cell immediately below and describe 
which classifier(s) worked best with which data set and why.</font>

<font color="red">***Stretch*** (but also required) Wander through the SKLearn 
documentation available [here](http://scikit-learn.org/stable/), particularly 
perusing the classifiers. In cells following, identify and implement a new classifier 
that we have not yet used (e.g., AdaBoost, CART) on one artificial dataset and one real 
dataset (used above). Then, in the next cell describe the classifier, detail how it 
compares with the approaches above, and why it performed better or worse than others.</font>

## <font color="red">*Exercise 3*</font>

<font color="red">In the cells immediately following, perform logistic regression 
classification using training, testing and un-coded (i.e., data you didn't code by 
hand but want to use your model on) data from texts and hand-classifications 
associated with your final project (e.g., these could be crowd-sourced codes 
gathered through Amazon Mechanical Turk in Exercise 1). Visualize the confusion 
matrix for training and testing sets. Calculate precision, recall, the F-measure, 
and AUC, then perform an ROC visualization. How do these classifiers perform? 
Extrapolate code from these models to all un-coded data.</font>

## <font color="red">*Exercise 4*</font>

<font color="red">In the cells immediately following, perform decision tree and 
random forest classification (binary, multinomial or continuous) using training, 
testing and extrapolation (un-coded) data from texts and hand-classifications 
associated with your final project. As with ***Exercise 3***, these could be 
crowdsourced codes gathered through Amazon Mechanical Turk last week. Visualize 
the classification of data points. Calculate relevant metrics (e.g., precision, 
recall, the F-measure, and AUC). Now build an ensemble classifier by bagging trees 
into a random forest. Visualize the result. How do these classifiers perform? 
What does ensemble learning do?</font>

## <font color="red">*Exercise 6*</font>

<font color="red">In the cells immediately following, perform a neural network 
classification and calculate relevant metrics (e.g., precision, recall, the 
F-measure, and AUC). How does this classifier perform relative to *k*-nearest neighbor, 
logistic regression, and decision-tree approaches?</font>

## <font color="red">*Exercise 7*</font>

<font color="red">In the cells immediately following, use the pipeline functions 
or the word or sentence vector functions (e.g., similarity) to explore the social 
game underlying the production and meaning of texts associated with your final project. 
How does BERT help you gain insight regarding your research question in ways that are 
similar to and different from prior methods?</font>