In [1]:
# Standard library imports
import csv
import json
from pathlib import Path
from typing import Dict, List, Tuple
# Third party imports
import pandas as pd
import pymongo
from pymongo import MongoClient
from sklearn.model_selection import train_test_split, KFold
from sklearn.tree import DecisionTreeClassifier
# Local imports
from utils import diff_cols, grid_search, prepare_dataset, pymongo_loads

# Input file
csv_src = Path("/import/SpeedDatingData.csv")

# Preprocessing inputs
# Attributes that are queried both as-is and with an "_o" suffix
# (e.g. "age" and "age_o"); this list is also handed to prepare_dataset.
diff_cols_ = ["age", "dec", "attr", "sinc", "intel", "fun", "amb", "shar"]
# Target column(s) separated from the features before training.
y_cols = ["match"]
# Field -> 1 mapping passed to prepare_dataset together with the collection
# (presumably a MongoDB projection -- confirm in utils.prepare_dataset).
# Key order: ids first, then each attribute followed by its "_o" twin, then the target.
query_cols = dict.fromkeys(
    [
        "iid",
        "pid",
        *(name for base in diff_cols_ for name in (base, f"{base}_o")),
        *y_cols,
    ],
    1,
)

# Model inputs
test_size = 0.2  # fraction of rows held out for the final test
random_state = 777  # seed for the train/test split
max_depths = list(range(1, 9))  # candidate tree depths 1..8 for the grid search



## Provide outputs from mongosh CLI

1. List databases
    ```
    test> show databases;
    admin         40.00 KiB
    config       108.00 KiB
    local         72.00 KiB
    speeddating   45.16 MiB
    ```
2. Use speeddating database
    ```
    test> use speeddating;
    switched to db speeddating
    speeddating>
    ```
3. Show collections
    ```
    speeddating> show collections;
    events
    ```
4. Show one of the collection documents
    ```
    speeddating> db.events.findOne();
    {
      _id: ObjectId("640e841e5ef008fcacb6f359"),
      iid: '1',
      id: '1',
      gender: '0',
      idg: '1',
      condtn: '1',
      wave: '1',
      round: '10',
      position: '7',
      positin1: '',
      order: '4',
      partner: '1',
      pid: '11',
      match: '0',
      int_corr: '0.14',
      samerace: '0',
      age_o: '27',
      race_o: '2',
      pf_o_att: '35',
      pf_o_sin: '20',
      pf_o_int: '20',
      pf_o_fun: '20',
      pf_o_amb: '0',
      pf_o_sha: '5',
      dec_o: '0',
      attr_o: '6',
      sinc_o: '8',
      intel_o: '8',
      fun_o: '8',
      amb_o: '8',
      shar_o: '6',
      like_o: '7',
      prob_o: '4',
      met_o: '2',
      age: '21',
      field_cd: '1',
      undergra: '',
      mn_sat: '',
      tuition: '',
      race: '4',
      imprace: '2',
      imprelig: '4',
      from: 'Chicago',
      zipcode: '60521',
      income: '69487',
      goal: '2',
      date: '7',
      go_out: '1',
      career: 'lawyer',
      career_c: '',
      sports: '9',
      tvsports: '2',
      exercise: '8',
      dining: '9',
      museums: '1',
      art: '1',
      hiking: '5',
      gaming: '1',
      clubbing: '5',
      reading: '6',
      tv: '9',
      theater: '1',
      movies: '10',
      concerts: '10',
      music: '9',
      shopping: '8',
      yoga: '1',
      exphappy: '3',
      expnum: '2',
      attr1_1: '15',
      sinc1_1: '20',
      intel1_1: '20',
      fun1_1: '15',
      amb1_1: '15',
      shar1_1: '15',
      attr4_1: '',
      sinc4_1: '',
      intel4_1: '',
      fun4_1: '',
      amb4_1: '',
      shar4_1: '',
      attr2_1: '35',
      sinc2_1: '20',
      intel2_1: '15',
      fun2_1: '20',
      amb2_1: '5',
      shar2_1: '5',
      attr3_1: '6',
      sinc3_1: '8',
      fun3_1: '8',
      intel3_1: '8',
      amb3_1: '7',
      attr5_1: '',
      sinc5_1: '',
      intel5_1: '',
      fun5_1: '',
      amb5_1: '',
      dec: '1',
      attr: '6',
      sinc: '9',
      intel: '7',
      fun: '7',
      amb: '6',
      shar: '5',
      like: '7',
      prob: '6',
      met: '2',
      match_es: '4',
      attr1_s: '',
      sinc1_s: '',
      intel1_s: '',
      fun1_s: '',
      amb1_s: '',
      shar1_s: '',
      attr3_s: '',
      sinc3_s: '',
      intel3_s: '',
      fun3_s: '',
      amb3_s: '',
      satis_2: '6',
      length: '2',
      numdat_2: '1',
      attr7_2: '',
      sinc7_2: '',
      intel7_2: '',
      fun7_2: '',
      amb7_2: '',
      shar7_2: '',
      attr1_2: '19.44',
      sinc1_2: '16.67',
      intel1_2: '13.89',
      fun1_2: '22.22',
      amb1_2: '11.11',
      shar1_2: '16.67',
      attr4_2: '',
      sinc4_2: '',
      intel4_2: '',
      fun4_2: '',
      amb4_2: '',
      shar4_2: '',
      attr2_2: '',
      sinc2_2: '',
      intel2_2: '',
      fun2_2: '',
      amb2_2: '',
      shar2_2: '',
      attr3_2: '6',
      sinc3_2: '7',
      intel3_2: '8',
      fun3_2: '7',
      amb3_2: '6',
      attr5_2: '',
      sinc5_2: '',
      intel5_2: '',
      fun5_2: '',
      amb5_2: '',
      you_call: '1',
      them_cal: '1',
      date_3: '0',
      numdat_3: '',
      num_in_3: '',
      attr1_3: '15',
      sinc1_3: '20',
      intel1_3: '20',
      fun1_3: '15',
      amb1_3: '15',
      shar1_3: '15',
      attr7_3: '',
      sinc7_3: '',
      intel7_3: '',
      fun7_3: '',
      amb7_3: '',
      shar7_3: '',
      attr4_3: '',
      sinc4_3: '',
      intel4_3: '',
      fun4_3: '',
      amb4_3: '',
      shar4_3: '',
      attr2_3: '',
      sinc2_3: '',
      intel2_3: '',
      fun2_3: '',
      amb2_3: '',
      shar2_3: '',
      attr3_3: '5',
      sinc3_3: '7',
      intel3_3: '7',
      fun3_3: '7',
      amb3_3: '7',
      attr5_3: '',
      sinc5_3: '',
      intel5_3: '',
      fun5_3: '',
      amb5_3: ''
    }
    ```

## Load data, prepare it, run cross-validation, and test model on test data

In [2]:
# End-to-end run: load the CSV into MongoDB, build the modelling frame,
# pick a tree depth by cross-validation, then evaluate once on held-out data.
print(80 * "~")
print(f"Load data from {csv_src}")
collection = pymongo_loads(csv_src, "speeddating", "events")

print(80 * "~")
print("Prepare data for classifier")
df = prepare_dataset(collection, query_cols, diff_cols_)

print(80 * "~")
print("Split into test and train sets")
# Features are every column except the target; the target is the single y_cols entry.
X_tr, X_te, y_tr, y_te = train_test_split(
    df.drop(columns=y_cols),
    df[y_cols[0]],
    random_state=random_state,
    test_size=test_size,
)

print(80 * "~")
print("Conduct cross validation")
# validation_summary is indexed by max_depth and has an "acc_val" column
# (see the printed table); validation_scores is not used further in this cell.
validation_scores, validation_summary = grid_search(X_tr, y_tr, max_depths)
print("Cross validation results")
print(validation_summary)

print(80 * "~")
print("Select best max depth")
# idxmax returns the index label (the depth) of the first maximum, so ties
# resolve to the earliest row, exactly like the positional argmax lookup did.
best_max_depth = validation_summary["acc_val"].idxmax()
print(f"Best max depth={best_max_depth}")

print(80 * "~")
print("Train model on training data using best max depth")
# Seed the tree as well: scikit-learn permutes features at every split, so
# without random_state the fitted tree (and the reported accuracy and
# feature importances) can differ between otherwise identical runs.
best_model = DecisionTreeClassifier(max_depth=best_max_depth, random_state=random_state)
best_model.fit(X_tr, y_tr)

print(80 * "~")
print("Score model on test data")
acc_te = round(best_model.score(X_te, y_te), 2)
print(f"Test accuracy={acc_te}")

print(80 * "~")
# Label each importance with its feature column and list the largest first.
feature_importances = pd.Series(best_model.feature_importances_, index=list(X_tr)).sort_values(ascending=False)
print(f"Feature importances:\n{feature_importances}")

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Load data from /import/SpeedDatingData.csv
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Prepare data for classifier
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Split into test and train sets
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Conduct cross validation
Cross validation results
           acc_tr  acc_val
max_depth                 
1           0.823    0.823
2           0.823    0.823
3           0.823    0.823
4           0.824    0.818
5           0.830    0.814
6           0.839    0.799
7           0.850    0.794
8           0.864    0.797
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Select best max depth
Best max depth=1
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Train model on training data using best max depth
~~~

# Comparison of two databases

Neo4j better exploits the relationships inherent in the speed dating dataset (who met whom, and how each person rated the other). MongoDB, which is document-based, merely stores the events as a list of documents and leaves those relationships implicit. For data that is best expressed as nodes and edges (i.e., a graph), Neo4j is the better option. On the other hand, for a collection of documents of varying lengths and metadata, MongoDB is the better option. For example, Neo4j would not do as well if I had millions of articles of varying length and metadata that I wanted to store, analyze, and organize.