In [None]:
# Import python packages
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer


from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score


from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV


from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

# We can also use Snowpark for our analyses!
# `session` is the handle the later cells use to read tables (session.table),
# run SQL statements (session.sql) and write results back (session.write_pandas).
from snowflake.snowpark.context import get_active_session
session = get_active_session()


In [None]:
# Load the labelled training table from Snowflake into a pandas DataFrame.
df = session.table('TITANIC_TRAIN').to_pandas()

# Keep the preview as the cell's last expression so the notebook renders it
# as a formatted table rather than the plain-text dump print() produces.
df.head()



In [None]:
# Hold out 20% of the labelled rows; the fixed seed keeps the split repeatable.
train_data, test_data = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Feature groups, spelled in upper case like the other column names this
# notebook reads from the loaded frames (e.g. 'SURVIVED', 'PASSENGERID').
num_features = ['AGE', 'FARE']
cat_features = ['SEX', 'EMBARKED', 'PCLASS']

# Numeric columns: fill missing values with the column median, then standardise.
num_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode. handle_unknown='ignore' is set so a category that was not
# seen during fitting does not raise at transform time.
cat_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Route each feature group through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_features),
        ('cat', cat_transformer, cat_features),
    ]
)

In [None]:
# Model inputs (selected feature columns) and target from the training split.
X_train = train_data[num_features + cat_features]
y_train = train_data['SURVIVED']

# Baseline model: the shared preprocessing followed by logistic regression.
lr_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(max_iter=1000)),
    ]
)

# Accuracy averaged over 5 cross-validation folds of the training split.
scores = cross_val_score(
    lr_pipeline,
    X_train,
    y_train,
    cv=5,
    scoring='accuracy',
)
print("Logistic Regression CV Accuracy:", scores.mean())

In [None]:
# Random forest on top of the shared preprocessing; the seed is fixed so the
# forest is rebuilt identically on re-runs.
rf_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(n_estimators=100, random_state=42)),
    ]
)

# Accuracy averaged over 5 cross-validation folds of the training split.
scores = cross_val_score(
    rf_pipeline,
    X_train,
    y_train,
    cv=5,
    scoring='accuracy',
)
print("Random Forest CV Accuracy:", scores.mean())

In [None]:
# Fit the random-forest pipeline on the whole training split (the
# cross-validation above only fitted clones, not this object).
rf_pipeline.fit(X_train, y_train)

# Score the fitted pipeline on the hold-out split created by train_test_split.
# The hold-out rows were never used for fitting, so this is the only check of
# the model on unseen labelled data before it is used for predictions.
X_holdout = test_data[num_features + cat_features]
y_holdout = test_data['SURVIVED']

holdout_accuracy = accuracy_score(y_holdout, rf_pipeline.predict(X_holdout))
# ROC AUC needs a score per row: take the predicted probability of the
# positive class (second column of predict_proba).
holdout_auc = roc_auc_score(y_holdout, rf_pipeline.predict_proba(X_holdout)[:, 1])

print("Random Forest hold-out accuracy:", holdout_accuracy)
print("Random Forest hold-out ROC AUC:", holdout_auc)

In [None]:
# XGBoost on top of the shared preprocessing.
# The deprecated `use_label_encoder` argument is intentionally not passed:
# current XGBoost releases ignore it and emit a warning on every fit, which
# would be repeated for each grid-search candidate and fold.
xgb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(eval_metric='logloss'))
])

# Hyper-parameter grid; the `classifier__` prefix addresses the pipeline step
# named 'classifier'. 2 x 2 x 2 = 8 candidate settings.
param_grid = {
    'classifier__n_estimators': [50, 100],
    'classifier__max_depth': [3, 5],
    'classifier__learning_rate': [0.05, 0.1]
}

# Exhaustive search over the grid, scoring each candidate by 5-fold
# cross-validated accuracy on the training split.
grid_search = GridSearchCV(
    xgb_pipeline,
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=1,
    verbose=1
)

grid_search.fit(X_train, y_train)
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy Score:", grid_search.best_score_)

In [None]:
# Load the unlabelled table to predict on from Snowflake into pandas.
test_df = session.table('TITANIC_TEST').to_pandas()

# Keep the preview as the cell's last expression so the notebook renders it
# as a formatted table rather than the plain-text dump print() produces.
test_df.head()

In [None]:
# Model used for the submission: the fitted random-forest pipeline.
# (Assign grid_search.best_estimator_ here instead to submit the tuned
# XGBoost model.)
best_model = rf_pipeline

# Same feature columns, in the same order, as were used for training.
X_submission = test_df[num_features + cat_features]
submission_preds = best_model.predict(X_submission)

# Two-column result: passenger identifier and predicted label.
submission = pd.DataFrame(
    {
        "PassengerId": test_df["PASSENGERID"],
        "Survived": submission_preds,
    }
)

In [None]:
# Print the role, database and schema the session is currently using.
session.sql("SELECT CURRENT_ROLE(), CURRENT_DATABASE(), CURRENT_SCHEMA()").show()

In [None]:
# Switch the session to the JR_MLE role.
# NOTE(review): presumably this role is what grants write access for the
# table write in the following cell — confirm.
session.sql("USE ROLE JR_MLE").collect()

In [None]:
# Store the predictions in the TEST_SUBMISSION table. overwrite=True is passed
# so that re-running the cell replaces the previous contents.
# NOTE(review): auto_create_table is not passed, so the target table may need
# to exist already — confirm against the Snowpark write_pandas documentation.
session.write_pandas(
    submission,
    table_name='TEST_SUBMISSION',
    overwrite=True,
)