<a href="https://colab.research.google.com/github/niczky12/medium/blob/master/tech/bigquery/ML_with_Google_BigQuery_Kaggle_Titanic_end_to_end.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# run this if you need to add your kaggle keys
# make sure to choose the kaggle.json file that you received from kaggle
from google.colab import files
# prompts for a file upload; the chosen file is saved next to the notebook as kaggle.json
files.upload()

# move the key file into ~/.kaggle and restrict it to owner read/write only (mode 600)
! mkdir -p ~/.kaggle
! mv kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json


In [None]:
# check if your creds are ok
# a dataset listing (rather than an error message) is the success signal here
! kaggle datasets list

ref                                                         title                                              size  lastUpdated          downloadCount  
----------------------------------------------------------  ------------------------------------------------  -----  -------------------  -------------  
gpreda/reddit-vaccine-myths                                 Reddit Vaccine Myths                              237KB  2021-12-12 11:59:54          18602  
crowww/a-large-scale-fish-dataset                           A Large Scale Fish Dataset                          3GB  2021-04-28 17:03:01          11275  
imsparsh/musicnet-dataset                                   MusicNet Dataset                                   22GB  2021-02-18 14:12:19           5785  
dhruvildave/wikibooks-dataset                               Wikibooks Dataset                                   2GB  2021-10-22 10:48:21           3965  
nickuzmenkov/nih-chest-xrays-tfrecords                      NIH Chest X-rays

In [None]:
from google.colab import auth
from google.cloud import bigquery
from google.cloud.bigquery import magics
import os
import matplotlib.pyplot as plt
import kaggle
import pandas as pd
import numpy as np


plt.rcParams["figure.figsize"] = [14, 10]

In [None]:
# authenticate to Google Cloud
# `auth` is google.colab.auth (see the imports cell); presumably this has to run
# before any cell that loads data into or queries BigQuery
auth.authenticate_user()

In [None]:
# Google Cloud project used by the `to_gbq` uploads, the bigquery.Client instances
# and the %%bigquery cells. Replace the placeholder with your own project id.
PROJECT_ID = "YOURPROJECTID"
# NOTE(review): DATASET_ID is not referenced by any other cell shown in this notebook;
# the dataset name "ds" is hard-coded in the table names and queries instead.
DATASET_ID = "ds"
# default project for the %%bigquery cell magic
magics.context.project = PROJECT_ID

In [None]:
# download the Titanic competition files (train.csv, test.csv, gender_submission.csv)
# into the current working directory
!kaggle competitions download -c titanic

Downloading train.csv to /content
  0% 0.00/59.8k [00:00<?, ?B/s]
100% 59.8k/59.8k [00:00<00:00, 22.3MB/s]
Downloading gender_submission.csv to /content
  0% 0.00/3.18k [00:00<?, ?B/s]
100% 3.18k/3.18k [00:00<00:00, 2.75MB/s]
Downloading test.csv to /content
  0% 0.00/28.0k [00:00<?, ?B/s]
100% 28.0k/28.0k [00:00<00:00, 26.8MB/s]


In [None]:
# uploading datasets from pandas
# but you can also do this by sending the files to GCS and then loading them from there
# ask in the comments if you're interested
df = pd.read_csv("train.csv")

# write the frame to the BigQuery table ds.titanic_raw in PROJECT_ID;
# if_exists="replace" overwrites an existing table, so the cell can be re-run
df.to_gbq("ds.titanic_raw", project_id=PROJECT_ID, if_exists="replace")

In [None]:
%%bigquery
-- Preview the raw training table that was uploaded from train.csv.
SELECT
  *
FROM `ds.titanic_raw`

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,180,0,3,"Leonard, Mr. Lionel",male,36.0,0,0,LINE,0.0000,,S
1,264,0,1,"Harrison, Mr. William",male,40.0,0,0,112059,0.0000,B94,S
2,278,0,2,"Parkes, Mr. Francis ""Frank""",male,,0,0,239853,0.0000,,S
3,303,0,3,"Johnson, Mr. William Cahoone Jr",male,19.0,0,0,LINE,0.0000,,S
4,414,0,2,"Cunningham, Mr. Alfred Fleming",male,,0,0,239853,0.0000,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,456,1,3,"Jalsevac, Mr. Ivan",male,29.0,0,0,349240,7.8958,,C
887,497,1,1,"Eustis, Miss. Elizabeth Mussey",female,54.0,1,0,36947,78.2667,D20,C
888,592,1,1,"Stephenson, Mrs. Walter Bertram (Martha Eustis)",female,52.0,1,0,36947,78.2667,D20,C
889,292,1,1,"Bishop, Mrs. Dickinson H (Helen Walton)",female,19.0,1,0,11967,91.0792,B49,C


In [None]:
# do the same for test
# (read test.csv and overwrite the BigQuery table ds.titanic_test in one chained call)
pd.read_csv("test.csv").to_gbq("ds.titanic_test", project_id=PROJECT_ID, if_exists="replace")

1it [00:03,  3.20s/it]


In [None]:
# we will split the data into train and eval sets
# to do this, we'll add a pseudo-random ordering column (a hash of the passenger id) and then
# take the 80% of ids with the lowest hash values per `survived` group into the train group
# and the remaining 20% into the eval group

In [None]:
%%bigquery
-- Toy example of hashing ids with FARM_FINGERPRINT: the duplicated id 123 gets the
-- same hash both times, and appending a "seed" string to the id yields a different hash.
WITH example_users AS (
  SELECT passengerID
  FROM UNNEST([123, 123, 456]) AS passengerID
)

SELECT
  passengerID,
  FARM_FINGERPRINT(CAST(passengerID AS STRING)) AS random_hash,
  FARM_FINGERPRINT(CONCAT(CAST(passengerID AS STRING), "mario")) AS seeded_hash
FROM example_users

Unnamed: 0,passengerID,random_hash,seeded_hash
0,123,-3222588021317909685,-4285693159396640487
1,123,-3222588021317909685,-4285693159396640487
2,456,-7152823871777768794,9068623215687284528


In [None]:
%%bigquery
-- Dry run of the train/eval split: hash every passenger id, use the 80th percentile
-- of the hash within each `survived` class as the cutoff, flag rows below the cutoff
-- as train (1) and the rest as eval (0), then count rows per flag and class.
WITH hashed AS (
  SELECT
    FARM_FINGERPRINT(CAST(passengerid AS STRING)) AS farmhash,
    *
  FROM ds.titanic_raw
),
with_cutoff AS (
  SELECT
    *,
    PERCENTILE_CONT(farmhash, 0.8) OVER (PARTITION BY survived) AS cutoff
  FROM hashed
),
flagged AS (
  SELECT
    * EXCEPT (cutoff, farmhash),
    IF(farmhash < cutoff, 1, 0) AS isTrain
  FROM with_cutoff
)

SELECT
  isTrain,
  survived,
  COUNT(*) AS counts
FROM flagged
GROUP BY isTrain, survived
ORDER BY isTrain, survived

Unnamed: 0,isTrain,survived,counts
0,0,0,110
1,0,1,69
2,1,0,439
3,1,1,273


In [None]:
%%bigquery
-- Materialise the split as ds.titanic_prepped: every column of ds.titanic_raw plus
-- an integer isTrain flag (1 = train, 0 = eval).
CREATE OR REPLACE TABLE ds.titanic_prepped
-- Integer-range partitioning on the split flag. The end of the range is exclusive,
-- so it must be 2 for isTrain = 1 to get its own partition; with an end of 1 those
-- rows fall outside the range and are stored in the __UNPARTITIONED__ partition.
PARTITION BY RANGE_BUCKET(isTrain, GENERATE_ARRAY(0, 2, 1))
AS (
  WITH hashes AS (
    -- deterministic pseudo-random ordering key per passenger
    SELECT
      FARM_FINGERPRINT(CAST(passengerid AS STRING)) AS farmhash
      ,*
    FROM ds.titanic_raw
  ), percentiles AS (
    -- 80th percentile of the hash within each survived class
    SELECT
      *
      ,PERCENTILE_CONT(farmhash, 0.8) OVER (PARTITION BY survived) AS cutoff
    FROM hashes
  )

  SELECT
    * EXCEPT (cutoff, farmhash)
    -- rows below the per-class cutoff go to train, the rest to eval
    ,CASE WHEN farmhash < cutoff THEN 1 ELSE 0 END AS isTrain
  FROM percentiles
)

In [None]:
# try running this in the console to see what the queried dataset size is like!
# SELECT * FROM `ds.titanic_prepped` where istrain = 0
# SELECT * FROM `ds.titanic_prepped`

In [None]:
%%bigquery
-- Baseline logistic regression on every column of ds.titanic_prepped except the
-- passenger id, using the precomputed isTrain flag for the train/eval split.
CREATE OR REPLACE MODEL `ds.titanic_baseline`
OPTIONS(
    MODEL_TYPE='LOGISTIC_REG',
    INPUT_LABEL_COLS=['Survived'],
    DATA_SPLIT_METHOD='CUSTOM',
    DATA_SPLIT_COL='isEval',
    AUTO_CLASS_WEIGHTS=TRUE,
    EARLY_STOP=TRUE,
    L2_REG = 0.3
) AS
SELECT
    * except(passengerid, isTrain)
    -- The custom split column must be boolean, and BigQuery ML treats TRUE (or NULL)
    -- as *evaluation* rows and FALSE as *training* rows. The flag therefore has to be
    -- TRUE for the isTrain = 0 rows; passing `isTrain = 1` would train on the 20%
    -- slice and evaluate on the 80% slice.
    ,isTrain = 0 as isEval
FROM ds.titanic_prepped

In [None]:
%%bigquery
-- Score the Kaggle test table with the baseline model.
-- ML.PREDICT takes the model first and the rows to score second.
-- predicted_Survived_probs is an array with one (label, prob) entry per class:
-- unnesting it gives one row per passenger and class, and the filter keeps only
-- the label = 1 (survived) entry so that there is one row per passenger.
SELECT
  *
FROM
  ML.PREDICT(
    MODEL `ds.titanic_baseline`,
    (SELECT * FROM ds.titanic_test)
  ),
  UNNEST(predicted_Survived_probs) AS preds
WHERE
  preds.label = 1

Unnamed: 0,predicted_Survived,predicted_Survived_probs,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,label,prob
0,0,"[{'label': 1, 'prob': 0.28023416662524436}, {'...",1158,1,"Chisholm, Mr. Roderick Robert Crispin",male,,0,0,112051,0.0000,,S,1,0.280234
1,0,"[{'label': 1, 'prob': 0.21269069477885177}, {'...",1264,1,"Ismay, Mr. Joseph Bruce",male,49.0,0,0,112058,0.0000,B52 B54 B56,S,1,0.212691
2,0,"[{'label': 1, 'prob': 0.2076696884802004}, {'l...",903,1,"Jones, Mr. Charles Cresson",male,46.0,0,0,694,26.0000,,S,1,0.207670
3,0,"[{'label': 1, 'prob': 0.19385832253005314}, {'...",974,1,"Case, Mr. Howard Brown",male,49.0,0,0,19924,26.0000,,S,1,0.193858
4,0,"[{'label': 1, 'prob': 0.3972245598340023}, {'l...",986,1,"Birnbaum, Mr. Jakob",male,25.0,0,0,13905,26.0000,,C,1,0.397225
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,0,"[{'label': 1, 'prob': 0.1797377047388838}, {'l...",1043,3,"Matinoff, Mr. Nicola",male,,0,0,349255,7.8958,,C,1,0.179738
414,0,"[{'label': 1, 'prob': 0.1504774925717819}, {'l...",1101,3,"Delalic, Mr. Redjo",male,25.0,0,0,349250,7.8958,,S,1,0.150477
415,0,"[{'label': 1, 'prob': 0.1373595648010641}, {'l...",1157,3,"Lyntakoff, Mr. Stanko",male,,0,0,349235,7.8958,,S,1,0.137360
416,0,"[{'label': 1, 'prob': 0.1468452436569288}, {'l...",1187,3,"Angheloff, Mr. Minko",male,26.0,0,0,349202,7.8958,,S,1,0.146845


In [None]:
# Query that turns the baseline model's survival probability for each test
# passenger into a 0/1 `Survived` column (1 when the probability exceeds 0.4).
baseline_sql = """
WITH
scores AS (
  SELECT *
  FROM
    ML.PREDICT(
      -- first part is the model we use to predict with
      MODEL `ds.titanic_baseline`,
      -- second argument is the dataset to predict on
      (SELECT * FROM ds.titanic_test)
      )
  -- cross join explodes our array of predictions so that we have one row per class
  CROSS JOIN UNNEST(predicted_Survived_probs) AS preds
  -- we filter for the positive predictions to get 1 row per passenger
  WHERE preds.label = 1
)

SELECT
  PassengerId
  ,case when prob > 0.4 then 1 else 0 end as Survived
FROM scores
"""

# run the query through the BigQuery client and pull the result into pandas
cl = bigquery.Client(project=PROJECT_ID)
baseline_job = cl.query(baseline_sql)
preds_baseline = baseline_job.to_dataframe()



In [None]:
# display the baseline predictions (columns: PassengerId, Survived)
preds_baseline

Unnamed: 0,PassengerId,Survived
0,1158,0
1,1264,0
2,903,0
3,974,0
4,986,0
...,...,...
413,1043,0
414,1101,0
415,1157,0
416,1187,0


In [None]:
# write the PassengerId/Survived predictions to baseline.csv without the pandas index
preds_baseline.to_csv("baseline.csv", index=False)

# this gives us a 0.75 on the public leaderboard
# (-c selects the competition, -m sets the submission message, -f is the file to upload)
!kaggle competitions submit -c titanic -m "baseline BQ" -f baseline.csv

100% 2.77k/2.77k [00:01<00:00, 2.42kB/s]
Successfully submitted to Titanic - Machine Learning from Disaster

# Feature engineering

In [None]:
%%bigquery
-- Logistic regression with feature engineering done inside the model: the TRANSFORM
-- clause adds bucketised and crossed features, standard-scales age and fare, and
-- passes the remaining columns through unchanged.
CREATE OR REPLACE MODEL `ds.titanic_features`
TRANSFORM (
  -- quantile buckets: 10 for age, 20 for fare
  ML.QUANTILE_BUCKETIZE(Age, 10) OVER() as age_buckets
  ,ML.QUANTILE_BUCKETIZE(Fare, 20) OVER() as fare_buckets
  -- cross of age and passenger class, both cast to strings
  ,ML.FEATURE_CROSS(STRUCT(cast(age as string) AS age, cast(pclass as string) AS class)) AS cross_age_class
  ,pclass
  ,name
  ,sex
  ,Ml.standard_scaler(age) OVER() as age
  ,SibSp
  ,Parch
  ,Ticket
  ,Ml.standard_scaler(Fare) OVER() as fare
  ,Cabin
  ,Embarked
  -- label and split flag are passed through so that OPTIONS can refer to them
  ,survived
  ,isEval
)
OPTIONS(
    MODEL_TYPE='LOGISTIC_REG',
    INPUT_LABEL_COLS=['Survived'],
    DATA_SPLIT_METHOD='CUSTOM',
    DATA_SPLIT_COL='isEval',
    AUTO_CLASS_WEIGHTS=TRUE,
    EARLY_STOP=TRUE,
    L2_REG = 0.3
) AS
SELECT
    * except(passengerid, isTrain)
    -- The custom split column must be boolean, and BigQuery ML treats TRUE (or NULL)
    -- as *evaluation* rows and FALSE as *training* rows. The flag therefore has to be
    -- TRUE for the isTrain = 0 rows; passing `isTrain = 1` would train on the 20%
    -- slice and evaluate on the 80% slice.
    ,isTrain = 0 as isEval
FROM ds.titanic_prepped

In [None]:
%%bigquery
-- Boosted-tree classifier trained on the same engineered features: the TRANSFORM
-- clause adds bucketised and crossed features, standard-scales age and fare, and
-- passes the remaining columns through unchanged.
CREATE OR REPLACE MODEL `ds.titanic_boost`
TRANSFORM (
  -- quantile buckets: 10 for age, 20 for fare
  ML.QUANTILE_BUCKETIZE(Age, 10) OVER() as age_buckets
  ,ML.QUANTILE_BUCKETIZE(Fare, 20) OVER() as fare_buckets
  -- cross of age and passenger class, both cast to strings
  ,ML.FEATURE_CROSS(STRUCT(cast(age as string) AS age, cast(pclass as string) AS class)) AS cross_age_class
  ,pclass
  ,name
  ,sex
  ,Ml.standard_scaler(age) OVER() as age
  ,SibSp
  ,Parch
  ,Ticket
  ,Ml.standard_scaler(Fare) OVER() as fare
  ,Cabin
  ,Embarked
  -- label and split flag are passed through so that OPTIONS can refer to them
  ,survived
  ,isEval
)
OPTIONS(
    MODEL_TYPE='BOOSTED_TREE_CLASSIFIER',
    INPUT_LABEL_COLS=['Survived'],
    DATA_SPLIT_METHOD='CUSTOM',
    DATA_SPLIT_COL='isEval',
    AUTO_CLASS_WEIGHTS=TRUE,
    EARLY_STOP=TRUE,
    NUM_PARALLEL_TREE = 100,
    MAX_TREE_DEPTH = 3,
    SUBSAMPLE = 0.8
) AS
SELECT
    * except(passengerid, isTrain)
    -- The custom split column must be boolean, and BigQuery ML treats TRUE (or NULL)
    -- as *evaluation* rows and FALSE as *training* rows. The flag therefore has to be
    -- TRUE for the isTrain = 0 rows; passing `isTrain = 1` would train on the 20%
    -- slice and evaluate on the 80% slice.
    ,isTrain = 0 as isEval
FROM ds.titanic_prepped

In [None]:
%%bigquery
-- Called with the model only, ML.EVALUATE reports the classification metrics
-- on the evaluation rows that were set aside when the model was trained.
SELECT
  *
FROM
  ML.EVALUATE(MODEL `ds.titanic_boost`)

Unnamed: 0,precision,recall,accuracy,f1_score,log_loss,roc_auc
0,0.785088,0.655678,0.799157,0.714571,0.452569,0.834372


In [None]:
# Query that scores the test table with the boosted-tree model and turns the
# survival probability into a 0/1 `Survived` column (1 when it exceeds 0.3807).
# NOTE(review): the UNNEST alias and the struct field are both called `label`;
# the outer `where label = 1` presumably resolves to the field -- confirm.
boost_sql = """
WITH
scores AS (
SELECT
  *
FROM
  ML.PREDICT(
    MODEL `ds.titanic_boost`,
    (
      SELECT * FROM ds.titanic_test
    ))
cross join unnest(predicted_Survived_probs) as label
)

SELECT
  PassengerId
  ,case when prob > 0.3807 then 1 else 0 end as Survived
FROM scores
where label = 1"""

# run the query through the BigQuery client and pull the result into pandas
cl = bigquery.Client(project=PROJECT_ID)
boost_job = cl.query(boost_sql)
preds_boost = boost_job.to_dataframe()

In [None]:
# this gives us a 0.75 on the public leaderboard
# NOTE(review): this is the same score comment as in the baseline submission cell --
# confirm it reflects the boosted model's result and was not copied over.
# write the PassengerId/Survived predictions to boost.csv without the pandas index
preds_boost.to_csv("boost.csv", index=False)

# (-c selects the competition, -m sets the submission message, -f is the file to upload)
!kaggle competitions submit -c titanic -m "boost BQ" -f boost.csv

100% 2.77k/2.77k [00:00<00:00, 6.84kB/s]
Successfully submitted to Titanic - Machine Learning from Disaster