In [36]:
import os

import lucem_illud
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn
import sklearn
import sklearn.ensemble
import sklearn.naive_bayes
import sklearn.neighbors
import sklearn.neural_network
import sklearn.tree

%matplotlib inline

# Constants, Utility Functions, and Data Importing

In [138]:
# Constants
NOISE = 0.2
TEST_SPLIT = 0.2
NUM_PER_CATEGORY = 750

In [139]:
# Utility Functions


def plotter(df: pd.DataFrame, category_key: str = "category"):
    fig, ax = plt.subplots(figsize=(10, 10))
    pallet = seaborn.color_palette(
        palette="rainbow", n_colors=len(set(df[category_key]))
    )
    for i, cat in enumerate(set(df[category_key])):
        a = np.stack(df[df[category_key] == cat]["vect"])
        ax.scatter(a[:, 0], a[:, 1], c=pallet[i], label=cat)
    ax.legend(loc="center right", title="Categories")
    ax.axis("off")
    plt.show()
    plt.close()


def pca_split_stats(
    test_df: pd.DataFrame,
    train_df: pd.DataFrame,
    red_pca_key: str,
    category_key: str = "category",
) -> None:
    logistic = sklearn.linear_model.LogisticRegression()
    train_df[red_pca_key] = train_df["pca"].apply(lambda x: x[:400])
    test_df[red_pca_key] = test_df["pca"].apply(lambda x: x[:400])

    logistic.fit(np.stack(train_df[red_pca_key], axis=0), train_df[category_key])

    print("Training:")
    print(
        logistic.score(np.stack(train_df[red_pca_key], axis=0), train_df[category_key])
    )
    print("Testing:")
    print(logistic.score(np.stack(test_df[red_pca_key], axis=0), test_df[category_key]))


def classifier_evaluation(train_df: pd.DataFrame, test_df: pd.DataFrame) -> None:
    classifiers = [
        (sklearn.naive_bayes.GaussianNB(), "Gaussian"),
        (sklearn.svm.SVC(kernel="linear", probability=True), "Linear SVC"),
        (
            sklearn.svm.SVC(kernel="poly", degree=3, probability=True),
            "Poly SVC",
        ),
        (
            sklearn.neighbors.KNeighborsClassifier(5, weights="distance"),
            "KNN",
        ),
        (sklearn.linear_model.LogisticRegression(), "Logistic Regression"),
        (sklearn.tree.DecisionTreeClassifier(), "Decision Tree"),
        (sklearn.ensemble.RandomForestClassifier(), "Random Forest"),
        (sklearn.neural_network.MLPClassifier(), "MLP"),
        (sklearn.ensemble.GradientBoostingClassifier(), "Gradient Boosting"),
    ]

    metrics_df = pd.DataFrame(
        columns=[
            "Classifier",
            "True Pos",
            "False Pos",
            "True Neg",
            "False Neg",
        ]
    )

    for clf, name in classifiers:
        clf.fit(np.stack(train_df["vect"], axis=0), train_df["category"])
        # lucem_illud.evaluateClassifier(clf, test_df)
        predictions = clf.predict(np.stack(test_df["vect"], axis=0))
        mat = sklearn.metrics.confusion_matrix(test_df["category"], predictions)
        metrics_df = pd.concat(
            [
                metrics_df,
                pd.DataFrame(
                    [
                        {
                            "Classifier": name,
                            "True Pos": mat[0][0],
                            "False Pos": mat[0][1],
                            "True Neg": mat[1][0],
                            "False Neg": mat[1][1],
                        }
                    ]
                ),
            ],
            ignore_index=True,
        )
        # lucem_illud.plotConfusionMatrix(clf, test_df)
        # lucem_illud.plotMultiROC(clf, test_df)
        # lucem_illud.plotregions(clf, train_df)
    print(metrics_df)

In [140]:
# Data is sourced from a personal project of mine you can find here:
# https://ucpd-incident-reporter-7cfdc3369124.herokuapp.com/
ucpd_feather_path = "data/fully_tokenized_ucpd_incidents.feather"
if os.path.isfile(ucpd_feather_path):
    ucpd_reports = pd.read_feather(ucpd_feather_path)
else:
    # WARNING: This step takes about 120 minutes, so don't run it unless you need to.
    ucpd_reports = pd.read_csv("data/incident_dump.csv")
    ucpd_reports["tokenized_text"] = ucpd_reports["comments"].apply(
        lucem_illud.word_tokenize
    )
    ucpd_reports["normalized_text"] = ucpd_reports["tokenized_text"].apply(
        lucem_illud.normalizeTokens
    )
    ucpd_reports.to_feather(ucpd_feather_path)
ucpd_reports.head(5)

Unnamed: 0,comments,disposition,incident,location,occurred,predicted_incident,reported,reported_date,ucpd_id,validated_address,validated_location,tokenized_text,normalized_text
0,A person was transported to Comer Hospital by ...,Closed,Mental Health Transport,6300 S. University Ave. (S. Woodlawn Ave. Char...,2/1/24 10:10 AM,,2024-02-01T10:10:00-06:00,2024-02-01,24-00114,"6300 S UNIVERSITY AVE, CHICAGO, IL, 60637","41.78045407997166,-87.59732203570559","[A, person, was, transported, to, Comer, Hospi...","[person, transport, comer, hospital, cfd, ems,..."
1,Catalytic converter taken from a 2015 Toyota p...,Open,Theft from Motor Vehicle,1210 E. 57th St. (Public Way),1/31/24 to 2/1/24 6:00 PM to 7:00 AM,,2024-02-01T10:18:00-06:00,2024-02-01,24-00115,"1210 E 57TH ST, CHICAGO, IL, 60637","41.79150658678615,-87.59602168542061","[Catalytic, converter, taken, from, a, 2015, T...","[catalytic, converter, take, toyota, park, str..."
2,Debit and credit cards taken from wallet in un...,Open,Theft,900 E. 57th St. (Knapp Center),2/1/24 9:50 AM to 4:30 PM,,2024-02-01T17:54:00-06:00,2024-02-01,24-00116,"900 E 57TH ST, CHICAGO, IL, 60637","41.791423455510476,-87.60366291896175","[Debit, and, credit, cards, taken, from, walle...","[debit, credit, card, take, wallet, unattended..."
3,Boyfriend battered girlfriend in off-campus pr...,Open,Domestic Battery,6040 S. Harper Ave. (Apt. Building),2/1/24 2:45 PM,,2024-02-01T15:45:00-06:00,2024-02-01,24-00117,"6040 S HARPER AVE, CHICAGO, IL, 60637","41.78472618578524,-87.58821677767634","[Boyfriend, battered, girlfriend, in, off, cam...","[boyfriend, batter, girlfriend, campus, privat..."
4,A known suspect entered the off-campus store a...,Referred,Information / Theft,1346 E. 53rd St. (Target),1/31/24 12:15 PM,,2024-01-31T14:16:00-06:00,2024-01-31,2024-004118,"1346 E 53RD ST, CHICAGO, IL, 60615","41.79955044222366,-87.593062823983","[A, known, suspect, entered, the, off, campus,...","[know, suspect, enter, campus, store, take, me..."


## <font color="red">*Exercise 1*</font>

<font color="red">Perform a content annotation survey of some kind in which at 
least 3 people evaluate and code each piece of content, using Amazon Mechanical 
Turk as described in the [MTurk slides on Canvas](https://canvas.uchicago.edu/courses/54694/files/folder/unfiled?preview=10675152), or by hand with friends.  
With the resulting data, calculate, visualize and discuss inter-coder agreement or 
co-variation with appropriate metrics. What does this means for the reliability of 
human assessments regarding content in your domain?

In [141]:
# Figure out a way to chunk up the data

## <font color="red">*Exercise 2*</font>

<font color="red">Go back through all the cells above and generate 10 distinct 
artificial datasets and classify them with all the available methods. Add a cell 
immediately below and describe which classifier(s) worked best with which 
artificially constructed data source and why. Then go through all the empirical 
datasets (i.e., Newsgroups, Senate Small, Senate Large, Email Spam) and classify 
them with all available methods. Add a second cell immediately below and describe 
which classifier(s) worked best with which data set and why.

<font color="red">***Stretch*** (but also required) Wander through the SKLearn 
documentation available [here](http://scikit-learn.org/stable/), particularly 
perusing the classifiers. In cells following, identify and implement a new classifier 
that we have not yet used (e.g., AdaBoost, CART) on one artificial dataset and one real 
dataset (used above). Then, in the next cell describe the classifier, detail how it 
compares with the approaches above, and why it performed better or worse than others.

### Generated Datasets

In [142]:
# Dataset 1
classifier_evaluation(
    *sklearn.model_selection.train_test_split(
        lucem_illud.random(), test_size=TEST_SPLIT
    )
)

            Classifier True Pos False Pos True Neg False Neg
0             Gaussian       57        38       59        46
1           Linear SVC       49        46       47        58
2             Poly SVC       69        26       75        30
3                  KNN       52        43       53        52
4  Logistic Regression       52        43       55        50
5        Decision Tree       53        42       56        49
6        Random Forest       50        45       62        43
7                  MLP       46        49       54        51
8    Gradient Boosting       52        43       53        52


In [143]:
# Dataset 2
classifier_evaluation(*sklearn.model_selection.train_test_split(
    lucem_illud.andSplit(NOISE), test_size=TEST_SPLIT
))

            Classifier True Pos False Pos True Neg False Neg
0             Gaussian       90        16        9        85
1           Linear SVC       88        18        7        87
2             Poly SVC       82        24        6        88
3                  KNN       84        22       12        82
4  Logistic Regression       91        15        9        85
5        Decision Tree       84        22        9        85
6        Random Forest       90        16       13        81
7                  MLP       92        14        9        85
8    Gradient Boosting       87        19       12        82


In [152]:
# Dataset 3
classifier_evaluation(*sklearn.model_selection.train_test_split(
    lucem_illud.xorSplit(NOISE), test_size=TEST_SPLIT
))

            Classifier True Pos False Pos True Neg False Neg
0             Gaussian       42        62       31        65
1           Linear SVC       38        66       13        83
2             Poly SVC       60        44       13        83
3                  KNN       95         9       13        83
4  Logistic Regression       43        61       39        57
5        Decision Tree       90        14       12        84
6        Random Forest       92        12       14        82
7                  MLP       98         6       16        80
8    Gradient Boosting       95         9       14        82




In [153]:
# Dataset 4
classifier_evaluation(*sklearn.model_selection.train_test_split(
    lucem_illud.targetSplit(NOISE), test_size=TEST_SPLIT
))

            Classifier True Pos False Pos True Neg False Neg
0             Gaussian       89         6       15        90
1           Linear SVC       92         3       79        26
2             Poly SVC       95         0       91        14
3                  KNN       87         8       15        90
4  Logistic Regression       60        35       61        44
5        Decision Tree       83        12       17        88
6        Random Forest       83        12       12        93
7                  MLP       90         5       17        88
8    Gradient Boosting       85        10       11        94




In [154]:
# Dataset 5
classifier_evaluation(*sklearn.model_selection.train_test_split(
    lucem_illud.multiBlobs(NOISE), test_size=TEST_SPLIT
))



            Classifier True Pos False Pos True Neg False Neg
0             Gaussian       91         0        0        90
1           Linear SVC       91         0        0        90
2             Poly SVC       91         0        0        90
3                  KNN       91         0        0        90
4  Logistic Regression       91         0        0        90
5        Decision Tree       91         0        0        90
6        Random Forest       91         0        0        90
7                  MLP       91         0        0        90
8    Gradient Boosting       91         0        0        90


In [147]:
# Dataset 6
classifier_evaluation(*sklearn.model_selection.train_test_split(
    lucem_illud.random(NUM_PER_CATEGORY), test_size=TEST_SPLIT
))

            Classifier True Pos False Pos True Neg False Neg
0             Gaussian       84        63       81        72
1           Linear SVC       71        76       79        74
2             Poly SVC      102        45      105        48
3                  KNN       76        71       84        69
4  Logistic Regression       86        61       87        66
5        Decision Tree       72        75       84        69
6        Random Forest       70        77       81        72
7                  MLP      107        40      103        50
8    Gradient Boosting       80        67       88        65


In [148]:
# Dataset 7
classifier_evaluation(*sklearn.model_selection.train_test_split(
    lucem_illud.andSplit(NOISE, NUM_PER_CATEGORY), test_size=TEST_SPLIT
))

            Classifier True Pos False Pos True Neg False Neg
0             Gaussian      139        12        8       141
1           Linear SVC      140        11        8       141
2             Poly SVC      134        17        5       144
3                  KNN      132        19       17       132
4  Logistic Regression      139        12        8       141
5        Decision Tree      135        16       14       135
6        Random Forest      136        15       15       134
7                  MLP      140        11        9       140
8    Gradient Boosting      138        13       13       136


In [149]:
# Dataset 8
classifier_evaluation(*sklearn.model_selection.train_test_split(
    lucem_illud.xorSplit(NOISE, NUM_PER_CATEGORY), test_size=TEST_SPLIT
))

            Classifier True Pos False Pos True Neg False Neg
0             Gaussian       70        77       69        84
1           Linear SVC       68        79       29       124
2             Poly SVC       88        59       27       126
3                  KNN      125        22       15       138
4  Logistic Regression       81        66       97        56
5        Decision Tree      124        23       17       136
6        Random Forest      125        22       14       139
7                  MLP      129        18       13       140
8    Gradient Boosting      131        16       16       137


In [150]:
# Dataset 9
classifier_evaluation(*sklearn.model_selection.train_test_split(
    lucem_illud.targetSplit(NOISE, NUM_PER_CATEGORY), test_size=TEST_SPLIT
))

            Classifier True Pos False Pos True Neg False Neg
0             Gaussian      147         3       22       128
1           Linear SVC      145         5      107        43
2             Poly SVC      150         0      130        20
3                  KNN      141         9       21       129
4  Logistic Regression       71        79       81        69
5        Decision Tree      137        13       20       130
6        Random Forest      145         5       18       132
7                  MLP      146         4       21       129
8    Gradient Boosting      146         4       21       129




In [151]:
# Dataset 10
classifier_evaluation(*sklearn.model_selection.train_test_split(
    lucem_illud.multiBlobs(NOISE, NUM_PER_CATEGORY), test_size=TEST_SPLIT
))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


            Classifier True Pos False Pos True Neg False Neg
0             Gaussian      157         0        1       127
1           Linear SVC      156         0        1       126
2             Poly SVC      157         0        5       124
3                  KNN      156         0        1       129
4  Logistic Regression      156         0        1       127
5        Decision Tree      155         0        1       122
6        Random Forest      156         0        1       124
7                  MLP      156         0        1       127
8    Gradient Boosting      156         1        1       126


### Empirical Datasets

## <font color="red">*Exercise 3*</font>

<font color="red">In the cells immediately following, perform logistic regression 
classification using training, testing and un-coded (i.e., data you didn't code by 
hand but want to use your model on) data from texts and hand-classifications 
associated with your final project (e.g., these could be crowd-sourced codes 
gathered through Amazon Mechanical Turk in Exercise 1). Visualize the confusion 
matrix for training and testing sets. Calculate precision, recall, the F-measure, 
and AUC, then perform an ROC visualization. How do these classifiers perform? 
Extrapolate code from these models to all un-coded data.

## <font color="red">*Exercise 4*</font>

<font color="red">In the cells immediately following, perform decision tree and 
random forest classification (binary, multinomial or continuous) using training, 
testing and extrapolation (un-coded) data from texts and hand-classifications 
associated with your final project. As with ***Exercise 2***, these could be 
crowdsourced codes gathered through Amazon Mechanical Turk last week. Visualize 
the classification of data points. Calculate relevant metrics (e.g., precision, 
recall, the F-measure, and AUC). Now build an ensemble classifier by bagging trees 
into a random forest. Visualize the result. How do these classifiers perform? 
What does ensemble learning do?

## <font color="red">*Exercise 6*</font>

<font color="red">In the cells immediately following, perform a neural network 
classification and calculate relevant metrics (e.g., precision, recall, the 
F-measure, and AUC). How does this classify relevant to *k*-nearest neighbor, 
logistic and decision-tree approaches?

## <font color="red">*Exercise 7*</font>

<font color="red">In the cells immediately following, use the pipeline functions 
or the word or sentence vector functions (e.g., similarity) to explore the social 
game underlying the production and meaning of texts associated with your final project. 
How does BERT help you gain insight regarding your research question that is similar 
and different from prior methods?