In [41]:
import os
import pickle

import numpy as np
import pandas as pd
from google.cloud import bigquery
from google.cloud import storage
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder


In [42]:
# Credentials and BigQuery coordinates for the raw training table.
file_path = "./.env/service_account.json"
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = file_path
# NOTE(review): `project_id` is not referenced by the query below; the dataset is
# presumably resolved in the service account's own project — confirm this is intended.
project_id = 'fleet-petal-448410-u6'
dataset_id = "titanic_dataset"
raw_table = 'RAW_train_data'

# Build the client from the same `file_path` exported above so the key
# location is defined in exactly one place.
client = bigquery.Client.from_service_account_json(file_path)
table = raw_table

print(f"Processing: {table}")
query = f"SELECT * FROM {dataset_id}.{table}"
query_job = client.query(query)
results = query_job.result()  # waits for the query job to complete
# Drop the columns that are not used as model features.
df = results.to_dataframe().drop(columns=["PassengerId", 'Cabin', 'Name'])
print(df.head())
      


Processing: RAW_train_data




  HomePlanet  CryoSleep  Destination  Age    VIP  RoomService  FoodCourt  \
0     Europa       <NA>  55 Cancri e  NaN  False       3478.0       10.0   
1      Earth      False  TRAPPIST-1e  NaN  False          4.0        0.0   
2       Mars      False  TRAPPIST-1e  NaN  False        793.0        0.0   
3     Europa      False  55 Cancri e  NaN  False          0.0     2433.0   
4     Europa      False  55 Cancri e  NaN  False          2.0     1720.0   

   ShoppingMall     Spa  VRDeck  Transported  
0           0.0   105.0  2383.0        False  
1           2.0  4683.0     0.0        False  
2           2.0   253.0     0.0        False  
3           NaN   878.0   443.0         True  
4          12.0  1125.0   122.0         True  


# Data processing

In [43]:
# Optional diagnostic (disabled): fraction of missing values per column.
#df.isnull().sum()/len(df)

In [44]:
# Column selectors based on dtype: numeric columns on one side,
# every other column (strings, booleans, ...) on the other.
numerical_features = make_column_selector(dtype_include="number")
categorial_features = make_column_selector(dtype_exclude="number")

In [45]:
# Numeric columns: fill missing values with the median, then standardise.
numerical_pipeline = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler(),
)

# Non-numeric columns: replace pd.NA with the most frequent value, then one-hot
# encode into a dense array; categories unseen during fit are ignored at transform.
categorial_pipeline = make_pipeline(
    SimpleImputer(strategy='most_frequent', missing_values=pd.NA),
    OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
)


In [46]:
# Route each column group through its dedicated pipeline.
preprocessor = make_column_transformer(
    (numerical_pipeline, numerical_features),
    (categorial_pipeline, categorial_features),
)
# Have the transformer return pandas DataFrames rather than NumPy arrays.
preprocessor.set_output(transform="pandas")

In [47]:
# Separate the target column from the features, then hold out 20% of the rows
# for evaluation (fixed seed so the split is repeatable).
y = df['Transported']
X = df.drop(columns='Transported')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [48]:
# Full pipeline: preprocessing followed by a linear classifier trained with SGD.
# The classifier is seeded (same seed as the train/test split) so that training,
# and therefore the reported score, is reproducible across runs.
model = make_pipeline(preprocessor, SGDClassifier(random_state=42))
model.fit(X_train, y_train)

In [49]:
# Inspect the preprocessed training features. `preprocessor` is the object that
# was passed into `model`, which has been fitted by the time this cell runs.
X_train_preprocessed = preprocessor.transform(X_train)
X_train_preprocessed

Unnamed: 0,pipeline-1__Age,pipeline-1__RoomService,pipeline-1__FoodCourt,pipeline-1__ShoppingMall,pipeline-1__Spa,pipeline-1__VRDeck,pipeline-2__HomePlanet_Earth,pipeline-2__HomePlanet_Europa,pipeline-2__HomePlanet_Mars,pipeline-2__CryoSleep_False,pipeline-2__CryoSleep_True,pipeline-2__Destination_55 Cancri e,pipeline-2__Destination_PSO J318.5-22,pipeline-2__Destination_TRAPPIST-1e,pipeline-2__VIP_False,pipeline-2__VIP_True
2333,-0.683230,-0.330169,-0.280200,-0.277107,-0.268585,-0.264765,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
2589,-0.613080,-0.330169,-0.280200,-0.277107,-0.268585,-0.264765,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
8302,1.912315,-0.330169,-0.280200,-0.277107,-0.268585,-0.264765,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
8177,1.772015,-0.330169,-0.280200,-0.277107,-0.268585,-0.264765,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
500,-1.805627,-0.330169,-0.280200,-0.277107,-0.268585,-0.264765,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5734,0.298868,-0.330169,-0.280200,-0.157953,0.350485,-0.264765,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
5191,0.088419,-0.330169,-0.276457,-0.277107,-0.268585,0.978596,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
5390,0.158568,-0.330169,-0.280200,-0.277107,1.032423,-0.264765,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
860,-1.314578,-0.330169,-0.280200,-0.277107,-0.268585,-0.264765,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0


In [50]:
# Score of the fitted pipeline on the held-out split.
test_accuracy = model.score(X_test, y_test)
test_accuracy

0.7797584818861415

In [51]:
# Serialise the fitted pipeline (preprocessing + classifier) to a local file.
with open('model.pkl', mode='wb') as model_file:
    pickle.dump(model, model_file)

In [52]:
# Push the pickled model to Google Cloud Storage (GCP).
# The service-account key path is reused from `file_path`, defined in the
# data-loading cell, so it is not hard-coded a second time here.
bucket_name = 'titanic_model_2025_02_07'
storage_client = storage.Client.from_service_account_json(file_path)
bucket = storage_client.bucket(bucket_name)
blob = bucket.blob('model.pkl')  # object name inside the bucket
blob.upload_from_filename('model.pkl')  # local file written by the previous cell
print("Model uploaded to GCP")


Model uploaded to GCP
