In [32]:
import os
import pprint

import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
from pymongo import MongoClient
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [33]:
# Connection string for the course's MongoDB cluster.
# Credentials should not be baked into the notebook: set COURSE_CLUSTER_URI in the
# environment to supply your own connection string. The fallback is the URI that was
# originally embedded here, kept so that existing runs behave exactly as before.
course_cluster_uri = os.environ.get(
    "COURSE_CLUSTER_URI",
    "mongodb://agg-student:agg-password@cluster0-shard-00-00-jxeqq.mongodb.net:27017,cluster0-shard-00-01-jxeqq.mongodb.net:27017,cluster0-shard-00-02-jxeqq.mongodb.net:27017/test?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin",
)
course_client = MongoClient(course_cluster_uri)

In [34]:
# Resolve the database first, then the collection the rest of the notebook queries.
course_db = course_client['coursera-agg']
titanic = course_db['titanic']

In [35]:
# $group stage: one output document per distinct "gender" value, each with a count
# of the input documents that carried that value.
gender_group_spec = {"_id": "$gender", "count": {"$sum": 1}}
unique_gender_stage = {"$group": gender_group_spec}

In [36]:
# Keep only documents with a numeric age and a non-empty embarkation point,
# then count the remaining documents per gender.
gender_match_stage = {
    "$match": {
        "age": {"$type": "number"},
        "point_of_embarkation": {"$ne": ""},
    }
}
gender_pipeline = [gender_match_stage, unique_gender_stage]
possible_gender_values = titanic.aggregate(gender_pipeline)

In [6]:
# Drain the cursor into a list, then pretty-print the per-gender counts.
gender_counts = list(possible_gender_values)
pprint.pprint(gender_counts)

[{'_id': 'female', 'count': 259}, {'_id': 'male', 'count': 453}]


In [7]:
# $group stage: one output document per distinct "point_of_embarkation" value,
# each with a count of the input documents that carried that value.
embarkation_group_spec = {"_id": "$point_of_embarkation", "count": {"$sum": 1}}
unique_point_of_embarkation_stage = {"$group": embarkation_group_spec}

In [8]:
# Same row filter as the gender query (numeric age, non-empty embarkation point),
# followed by a count per embarkation point.
embarkation_match_stage = {
    "$match": {
        "age": {"$type": "number"},
        "point_of_embarkation": {"$ne": ""},
    }
}
embarkation_pipeline = [embarkation_match_stage, unique_point_of_embarkation_stage]
possible_point_of_embarkation_values = titanic.aggregate(embarkation_pipeline)

In [9]:
# Drain the cursor into a list, then pretty-print the per-port counts.
embarkation_counts = list(possible_point_of_embarkation_values)
pprint.pprint(embarkation_counts)

[{'_id': 'Q', 'count': 28},
 {'_id': 'C', 'count': 130},
 {'_id': 'S', 'count': 554}]


In [43]:
def _label_to_index_switch(field, labels):
    """Build a $switch expression that maps each label of `field` to its position in `labels`.

    Any value not listed in `labels` falls through to the "?" default.
    """
    return {
        "$switch": {
            "branches": [
                {"case": {"$eq": ["$" + field, label]}, "then": index}
                for index, label in enumerate(labels)
            ],
            "default": "?",
        }
    }


# Replace "gender" and "point_of_embarkation" with integer codes. Note that this is
# label (integer) encoding -- one code per category -- rather than true one-hot
# encoding, which would create a separate 0/1 column for every category.
# NOTE: the "?" default is a string, so an unexpected category would put a
# non-numeric value into an otherwise integer column.
_passthrough_fields = [
    "passenger_id",
    "survived",
    "class",
    "name",
    "age",
    "siblings_spouse",
    "parents_children",
    "ticket_number",
    "fare_paid",
    "cabin",
]

_conversion_projection = {field_name: 1 for field_name in _passthrough_fields}
_conversion_projection["gender"] = _label_to_index_switch("gender", ["female", "male"])
_conversion_projection["point_of_embarkation"] = _label_to_index_switch(
    "point_of_embarkation", ["Q", "C", "S"]
)

gender_and_point_of_embarkation_conversion_stage = {"$project": _conversion_projection}

In [44]:
# Pipeline: keep usable rows, encode the two categorical fields as integers,
# then drop "_id" and the identifier-like fields that are not used as features.
usable_rows_stage = {
    "$match": {
        "age": {"$type": "number"},
        "point_of_embarkation": {"$ne": ""},
    }
}
drop_unused_fields_stage = {
    "$project": {
        dropped_field: 0
        for dropped_field in ("_id", "ticket_number", "name", "passenger_id", "cabin")
    }
}
cursor = titanic.aggregate(
    [
        usable_rows_stage,
        gender_and_point_of_embarkation_conversion_stage,
        drop_unused_fields_stage,
    ]
)

In [45]:
# Pull every document out of the cursor so the results live in an ordinary list.
titanic_data = [document for document in cursor]

In [46]:
# Peek at the first two documents to check the projected fields and integer codes.
titanic_data[:2]

[{'age': 35,
  'class': 3,
  'fare_paid': 8.05,
  'gender': 1,
  'parents_children': 0,
  'point_of_embarkation': 2,
  'siblings_spouse': 0,
  'survived': 0},
 {'age': 22,
  'class': 3,
  'fare_paid': 7.25,
  'gender': 1,
  'parents_children': 0,
  'point_of_embarkation': 2,
  'siblings_spouse': 1,
  'survived': 0}]

In [48]:
# pandas.json_normalize() converts a list of JSON-like dicts into a pandas DataFrame.
# It is called through the top-level `pd` namespace because the older
# pandas.io.json.json_normalize entry point is deprecated since pandas 1.0.
df = pd.json_normalize(titanic_data)
df.head()

Unnamed: 0,age,class,fare_paid,gender,parents_children,point_of_embarkation,siblings_spouse,survived
0,35.0,3,8.05,1,0,2,0,0
1,22.0,3,7.25,1,0,2,1,0
2,4.0,3,16.7,0,1,2,1,1
3,27.0,3,11.1333,0,2,2,0,1
4,35.0,1,53.1,0,0,2,1,1


In [49]:
# Feature matrix: every column except the 'survived' target.
df_x = df.drop(columns=['survived'])

In [50]:
# Confirm that 'survived' is gone and only feature columns remain.
df_x.head()

Unnamed: 0,age,class,fare_paid,gender,parents_children,point_of_embarkation,siblings_spouse
0,35.0,3,8.05,1,0,2,0
1,22.0,3,7.25,1,0,2,1
2,4.0,3,16.7,0,1,2,1
3,27.0,3,11.1333,0,2,2,0
4,35.0,1,53.1,0,0,2,1


In [51]:
df_y = df['survived']  # careful, this is a pitfall: single-bracket selection gives a 1-D result

In [52]:
df_y.shape  # 1-D: the shape tuple has only one entry, the second dimension is missing

(712,)

__Pitfall__: if you get a shape like `(134,)`, be careful! For linear regression and some other models this works just fine, but for other models such as CNNs/RNNs a one-dimensional array like this can cause unexpected behaviour that is very hard to debug. As a good habit, you should always check the shape of your one-dimensional arrays and make sure that the second shape parameter is not missing.

In [54]:
# Show the first values of the 1-D target selected above.
df_y.head()

0    0
1    0
2    1
3    1
4    1
Name: survived, dtype: int64

In [55]:
df_y = df.filter(items=['survived'])  # to get the right shape, use filter(): it keeps the result two-dimensional

In [56]:
# The shape now has two entries: (rows, 1).
df_y.shape

(712, 1)

In [57]:
# Same values as before, now displayed as a single-column table.
df_y.head()

Unnamed: 0,survived
0,0
1,0
2,1
3,1
4,1


In [58]:
# Linear regression estimator created with scikit-learn's default arguments.
reg = linear_model.LinearRegression()

In [59]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    df_x,
    df_y,
    test_size=0.2,
    random_state=0,
)

In [60]:
# Fit the regression on the training split only; the test split stays unseen.
reg.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [65]:
# Predictions for the held-out rows. These are raw regression outputs, not
# probabilities: a few of the recorded values fall below 0.
reg.predict(x_test)

array([[ 0.11617702],
       [ 0.87153389],
       [ 0.59414763],
       [ 0.63423417],
       [ 0.01087813],
       [ 0.01750318],
       [ 0.06383078],
       [ 0.62229847],
       [ 0.24885281],
       [ 0.67601157],
       [ 0.99611887],
       [ 0.12202312],
       [ 0.55851268],
       [ 0.02757737],
       [ 0.8365545 ],
       [ 0.46741338],
       [ 0.15605042],
       [ 0.34671959],
       [ 0.32586636],
       [ 0.22807672],
       [ 0.44031932],
       [ 0.856371  ],
       [ 0.62404433],
       [ 0.20277336],
       [ 0.39922451],
       [ 0.80068775],
       [ 0.41696124],
       [ 0.20277451],
       [ 0.16895739],
       [ 0.09294056],
       [ 0.622996  ],
       [-0.03315727],
       [-0.09344652],
       [ 0.64547388],
       [ 0.61809328],
       [ 0.03926457],
       [ 0.09574356],
       [ 0.14527433],
       [ 0.34813906],
       [ 0.56726978],
       [ 0.86719741],
       [ 0.15706532],
       [ 0.65173015],
       [ 0.12202312],
       [ 0.12199556],
       [ 0

In [66]:
# Score the model on the held-out rows: mean squared error of predictions vs. actual labels.
y_pred = reg.predict(x_test)
mean_squared_error(y_test, y_pred)

0.13136657539406468

In [67]:
# Describe the hypothetical passenger by feature name instead of by position, so the
# values cannot be silently misaligned if the column order of df_x differs from the
# alphabetical order the original positional list assumed.
fake_passenger_features = {
    "age": 25,
    "class": 1,
    "fare_paid": 45,
    "gender": 1,                # 1 = 'male'
    "parents_children": 0,
    "point_of_embarkation": 1,  # 1 = 'C'
    "siblings_spouse": 1,
}

# Arrange the values in exactly the column order of df_x (the frame the training
# split was taken from). A feature missing from the dict raises KeyError here
# instead of producing a wrong prediction later.
fake_passenger = pd.DataFrame([fake_passenger_features])[list(df_x.columns)]

In [68]:
# Predicted value for the hypothetical passenger (a raw regression output).
reg.predict(fake_passenger)

array([[0.47230223]])