In [3]:
import pymongo
import pprint
import dateparser
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

%matplotlib inline

In [4]:
course_cluster_uri = "mongodb://agg-student:agg-password@cluster0-shard-00-00-jxeqq.mongodb.net:27017,cluster0-shard-00-01-jxeqq.mongodb.net:27017,cluster0-shard-00-02-jxeqq.mongodb.net:27017/test?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin"
course_client = pymongo.MongoClient(course_cluster_uri)
titanic = course_client['coursera-agg']['titanic']

In [5]:
initial_project = {
    "$project": {
        "_id": 0,
        "name": 0,
        "point_of_embarkation": 0,
        "ticket_number": 0,
        "passenger_id": 0,
        "cabin": 0,
    }
}

In [59]:
for doc in titanic.aggregate([initial_project, {"$limit": 5}]):
    pprint.pprint(doc)

{'age': '',
 'class': 3,
 'fare_paid': 8.05,
 'gender': 'male',
 'parents_children': 0,
 'siblings_spouse': 0,
 'survived': 0}
{'age': '',
 'class': 3,
 'fare_paid': 21.6792,
 'gender': 'male',
 'parents_children': 0,
 'siblings_spouse': 2,
 'survived': 0}
{'age': 54,
 'class': 1,
 'fare_paid': 51.8625,
 'gender': 'male',
 'parents_children': 0,
 'siblings_spouse': 0,
 'survived': 0}
{'age': 21,
 'class': 2,
 'fare_paid': 73.5,
 'gender': 'male',
 'parents_children': 0,
 'siblings_spouse': 0,
 'survived': 0}
{'age': 17,
 'class': 3,
 'fare_paid': 7.925,
 'gender': 'female',
 'parents_children': 2,
 'siblings_spouse': 4,
 'survived': 1}


In [39]:
age_correction = {
    "$switch": {
        "branches": [
            {"case": {"$eq": [{"$type": "$age"}, "string"]}, "then": 0}
        ],
        "default": "$age"
    }
}

In [40]:
one_hot_female = {
    "$switch": {
        "branches": [
            {"case": {"$eq": ["$gender", "female"]}, "then": 1},
            {"case": {"$eq": ["$gender", "male"]}, "then": 0}
        ]
    }
}

In [41]:
one_hot_male = {
    "$switch": {
        "branches": [
            {"case": {"$eq": ["$gender", "female"]}, "then": 0},
            {"case": {"$eq": ["$gender", "male"]}, "then": 1}
        ]
    }
}

In [42]:
encoding_stage = {
    "$addFields": {
        "gender_female": one_hot_female,
        "gender_male": one_hot_male,
        "age": age_correction
    }
}

In [43]:
final_project = {
    "$project": {
        "gender": 0
    }
}

In [44]:
pipeline = [initial_project, encoding_stage, final_project]

In [45]:
df = pd.DataFrame.from_dict(list(titanic.aggregate(pipeline)))
df.head()

Unnamed: 0,age,class,fare_paid,gender_female,gender_male,parents_children,siblings_spouse,survived
0,0.0,3,8.05,0,1,0,0,0
1,0.0,3,21.6792,0,1,0,2,0
2,54.0,1,51.8625,0,1,0,0,0
3,21.0,2,73.5,0,1,0,0,0
4,17.0,3,7.925,1,0,2,4,1


In [46]:
X = df.drop('survived', axis=1)

In [47]:
y = df['survived']

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15)

In [49]:
dtree = DecisionTreeClassifier()

In [50]:
%%capture
dtree.fit(X_train, y_train)

In [51]:
predictions = dtree.predict(X_test)

In [52]:
print(confusion_matrix(y_test, predictions))
print("\n")
print(classification_report(y_test, predictions))

[[68 21]
 [ 8 37]]


              precision    recall  f1-score   support

           0       0.89      0.76      0.82        89
           1       0.64      0.82      0.72        45

   micro avg       0.78      0.78      0.78       134
   macro avg       0.77      0.79      0.77       134
weighted avg       0.81      0.78      0.79       134



In [53]:
rfc = RandomForestClassifier(n_estimators=20)

In [54]:
%%capture
rfc.fit(X_train, y_train)

In [55]:
rfc_pred = rfc.predict(X_test)

In [56]:
print(confusion_matrix(y_test, rfc_pred))
print("\n")
print(classification_report(y_test, rfc_pred, target_names=['test', 'predictions']))

[[73 16]
 [ 9 36]]


              precision    recall  f1-score   support

        test       0.89      0.82      0.85        89
 predictions       0.69      0.80      0.74        45

   micro avg       0.81      0.81      0.81       134
   macro avg       0.79      0.81      0.80       134
weighted avg       0.82      0.81      0.82       134



In [57]:
iterations = 1000
dtree_avg_accuracy = 0
rfc_avg_accuracy = 0
for _ in range(iterations):
    dtree.fit(X_train, y_train)
    dtree_avg_accuracy += dtree.score(X_test, y_test)
    rfc.fit(X_train, y_train)
    rfc_avg_accuracy += rfc.score(X_test, y_test)
    
print(f"""
After {iterations} iterations:
  Single Decision Tree accuracy: {dtree_avg_accuracy / iterations}
  Random Forest accuracy:        {rfc_avg_accuracy / iterations}
  
  Lab Answer:  dtree={round(dtree_avg_accuracy / iterations, 2)}, rfc={round(rfc_avg_accuracy / iterations, 2)}
""")


After 1000 iterations:
  Single Decision Tree accuracy: 0.7797985074626907
  Random Forest accuracy:        0.8338432835820865
  
  Lab Answer:  dtree=0.78, rfc=0.83



In [58]:
0.78 + 0.83

1.6099999999999999