In [12]:
import os

# Third-party packages below must be installed (pickle itself is part of the standard library)
import pickle
import pandas as pd
from google.cloud import bigquery 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

import numpy as np


In [2]:
# --- Configuration: service-account key and BigQuery location of the raw data ---
project_id = 'fleet-petal-448410-u6'
dataset_id = "titanic_dataset"
raw_table = 'RAW_train_data'
file_path = "./.env/key_sa_titanic_Hugo.json"

# Export the key path before the client is created so it is available
# when the client authenticates.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = file_path
client = bigquery.Client(project=project_id)

# --- Pull the whole raw table into a DataFrame ---
table = raw_table
print(f"Processing: {table}")

query = f"SELECT * FROM {dataset_id}.{table}"
query_job = client.query(query)
# Block until the job finishes, then materialise the rows as a DataFrame.
df = query_job.result().to_dataframe()
print(df.head())






        


Processing: RAW_train_data




  PassengerId HomePlanet  CryoSleep    Cabin  Destination  Age    VIP  \
0     6145_01     Europa       <NA>  C/231/S  55 Cancri e  NaN  False   
1     0052_01      Earth      False    G/6/S  TRAPPIST-1e  NaN  False   
2     0068_01       Mars      False    E/4/S  TRAPPIST-1e  NaN  False   
3     0202_02     Europa      False    A/2/P  55 Cancri e  NaN  False   
4     0206_01     Europa      False    C/9/S  55 Cancri e  NaN  False   

   RoomService  FoodCourt  ShoppingMall     Spa  VRDeck               Name  \
0       3478.0       10.0           0.0   105.0  2383.0  Benebah Asolipery   
1          4.0        0.0           2.0  4683.0     0.0   Elaney Hubbarton   
2        793.0        0.0           2.0   253.0     0.0        Cinst Binie   
3          0.0     2433.0           NaN   878.0   443.0      Vegas Embleng   
4          2.0     1720.0          12.0  1125.0   122.0    Nuson Brugashed   

   Transported  
0        False  
1        False  
2        False  
3         True  
4      

# Data Processing

In [3]:
# Summarise the raw frame: column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   boolean
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   boolean
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   boolean
dtypes: boolean(3), float64(6), object(5)
memory usage: 798.1+ KB


In [4]:
# Fraction of missing values per column (the mean of a boolean mask is the ratio of True).
df.isna().mean()

PassengerId     0.000000
HomePlanet      0.023122
CryoSleep       0.024963
Cabin           0.022892
Destination     0.020936
Age             0.020591
VIP             0.023352
RoomService     0.020821
FoodCourt       0.021051
ShoppingMall    0.023927
Spa             0.021051
VRDeck          0.021627
Name            0.023007
Transported     0.000000
dtype: float64

In [5]:
# Remove the columns that are not used as model features.
# Rebinding through `drop` (instead of `del df[...]`) keeps this cell idempotent:
# re-running it after the columns are already gone no longer raises KeyError.
df = df.drop(columns=['PassengerId', 'Name', 'Cabin'], errors='ignore')

In [6]:
# Target is the `Transported` column; every remaining column is a feature.
y = df['Transported']
X = df.drop(columns='Transported')

In [7]:
# Numeric features: fill gaps with the column mean, then standardise.
numeric_preprocessor = Pipeline(
    steps=[
        ("imputation_mean", SimpleImputer(missing_values=np.nan, strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)


# Categorical / boolean features: fill gaps with the most frequent value,
# then one-hot encode.
categorical_preprocessor = Pipeline(
    steps=[
        (
            "imputation_mode",
            # `fill_value` is only consulted when strategy="constant", so the
            # former fill_value="missing" had no effect and has been removed.
            SimpleImputer(missing_values=pd.NA, strategy="most_frequent"),
        ),
        # "ignore" encodes a category that was not seen during fit as all zeros
        # instead of raising, so the saved pipeline does not fail at predict
        # time on new data.
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)


# Route each feature column to a preprocessor based on its dtype.
num_cols = X.select_dtypes(include='number').columns
cat_cols = X.select_dtypes(include=['object', 'boolean']).columns

preprocessor = ColumnTransformer(
    [
        ("categorical", categorical_preprocessor, cat_cols),
        ("numerical", numeric_preprocessor, num_cols),
    ]
)


# Full model: preprocessing followed by a logistic-regression classifier.
pipe = make_pipeline(preprocessor, LogisticRegression(random_state=42, max_iter=500))
pipe

In [10]:
# Hold out a quarter of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Train the pipeline on the training portion, then predict the held-out rows
predictions = pipe.fit(X_train, y_train).predict(X_test)

In [14]:
# Score the pipeline on the held-out test split (mean accuracy for a classifier).
pipe.score(X_test, y_test)

0.7838086476540939

In [None]:
# Serialise the pipeline object to disk so it can be reloaded later.
model_path = 'model_pipeline_HD.pkl'
with open(model_path, mode='wb') as model_file:
    pickle.dump(pipe, model_file)