In [None]:
# ! pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip

In [None]:
%pip install pydantic

pandas_profiling работает только при установке версии из git и только на версиях Python < 3.11.

In [None]:
import pandas as pd
import pydantic
from pandas_profiling import ProfileReport

# A bare expression is only rendered when it is the last statement of a cell,
# so the version check has to come after the imports to be displayed.
pydantic.__version__

In [None]:
# Load train.csv (path is relative to the working directory) into a DataFrame
df = pd.read_csv('train.csv')

In [None]:
df.head()

In [None]:
df.info()

In [None]:
# Create the pandas-profiling report object for the loaded frame
profile = ProfileReport(df, title='Spaceship Titanic')

In [None]:
profile.to_notebook_iframe()

In [None]:
# Exploratory Deck column: the text before the first '/' of the stringified
# cabin value. Missing cabins are stringified too, so they do not stay null.
df['Deck'] = df['Cabin'].map(lambda cabin: str(cabin).partition('/')[0])

In [None]:
def split_cabin(x):
  """Split a cabin value into its '/'-separated parts.

  The value is passed through str() first, so non-string input (NaN, None)
  is accepted. When the split yields fewer than three parts, a placeholder
  list of three 'Missing' strings is returned instead of the parts.
  """
  parts = str(x).split('/')
  if len(parts) >= 3:
    return parts
  return ['Missing'] * 3

In [None]:
# Preprocessing function that cleans a raw passenger DataFrame in place
def preprocessing(df):
    """Clean a raw passenger DataFrame in place.

    Missing values in HomePlanet, CryoSleep, Destination and VIP are replaced
    with the string 'Missing', missing Age with the mean of the Age column of
    the frame being processed, and missing spending amounts with 0. Cabin is
    replaced by two new columns, Deck and Side, and Name is dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns HomePlanet, CryoSleep, Cabin,
        Destination, Age, VIP, RoomService, FoodCourt, ShoppingMall, Spa,
        VRDeck and Name. It is modified in place.

    Returns
    -------
    None
        Callers rely on the in-place modification of ``df``.
    """
    # Assign the filled column back instead of calling
    # df[col].fillna(..., inplace=True): the chained in-place form acts on a
    # temporary object and is not guaranteed to update df (it has no effect
    # when pandas Copy-on-Write is enabled).
    for col in ('HomePlanet', 'CryoSleep', 'Destination', 'VIP'):
        df[col] = df[col].fillna('Missing')

    # Cabin is '/'-separated; the first part becomes Deck and the third Side.
    # Values with fewer than three parts (including missing cabins) get
    # 'Missing' for both.
    cabin_parts = df['Cabin'].astype(str).str.split('/')
    has_all_parts = cabin_parts.str.len() >= 3
    df['Deck'] = cabin_parts.str[0].where(has_all_parts, 'Missing')
    df['Side'] = cabin_parts.str[2].where(has_all_parts, 'Missing')
    df.drop('Cabin', axis=1, inplace=True)

    # Age: fill gaps with the column mean
    df['Age'] = df['Age'].fillna(df['Age'].mean())

    # Monetary spending columns: a missing amount is treated as no spending
    for col in ('RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'):
        df[col] = df[col].fillna(0)

    # Drop name due to high cardinality
    df.drop('Name', axis=1, inplace=True)

In [None]:
# Work on a copy so the frame loaded from train.csv is left as it is
abt = df.copy()
# preprocessing() modifies its argument in place; its return value is not used
preprocessing(abt)
abt.head()

In [None]:
abt.info()

### Modelling
- Feature and Target values - X, y
- One hot encode any categorical features
- Train, holdout split
- Train on a bunch of algos

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot as plt 
import seaborn as sns

In [None]:
# Target column
y = abt['Transported']
# Features: every column except the target and the passenger identifier,
# with the non-numeric columns one-hot encoded
X = pd.get_dummies(abt.drop(columns=['Transported', 'PassengerId']))

In [None]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.3, random_state=1234)

In [None]:
X_train.head()

### Setup ML Pipelines

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [None]:
# One scaler + classifier pipeline per algorithm, keyed by a short name.
# Both classifiers are created with the same fixed random_state.
pipelines = {
    name: make_pipeline(StandardScaler(), classifier_cls(random_state=1234))
    for name, classifier_cls in (
        ('rf', RandomForestClassifier),
        ('gb', GradientBoostingClassifier),
    )
}

In [None]:
GradientBoostingClassifier().get_params()

In [None]:
# Hyperparameter search space for each entry of `pipelines`, keyed the same
# way ('rf' / 'gb'). Parameter names use the '<step>__<parameter>' form.
grid = dict(
    rf={'randomforestclassifier__n_estimators': [100, 200, 300]},
    gb={'gradientboostingclassifier__n_estimators': [100, 200, 300]},
)
     

In [None]:
pipelines.items()

In [None]:
# Fitted grid searches, keyed by the same short names as `pipelines`
fit_models = {}
for algo, pipeline in pipelines.items():
  print(f'Training the {algo} model.')
  # 10-fold cross-validated search over this algorithm's parameter grid,
  # using all available cores
  model = GridSearchCV(estimator=pipeline, param_grid=grid[algo], cv=10, n_jobs=-1)
  # fit() returns the search object itself, so it can be stored directly
  fit_models[algo] = model.fit(X_train, y_train)

### Evaluate Performance on Test Partition

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [None]:
# Score every fitted model on the held-out partition
for alg, model in fit_models.items():
  yhat = model.predict(X_test)
  # Each metric takes (true labels, predicted labels)
  accuracy, precision, recall = (
      metric(y_test, yhat)
      for metric in (accuracy_score, precision_score, recall_score)
  )
  print(f'Metrics for {alg}: accuracy- {accuracy}, recall- {recall}, precision- {precision}')

In [None]:
import pickle

In [None]:
# Persist the fitted 'gb' entry of fit_models to gradientboosted.pkl
with open('gradientboosted.pkl', 'wb') as f: 
  pickle.dump(fit_models['gb'], f)

In [None]:
# Read the pickled object back from disk.
# NOTE: pickle.load can execute arbitrary code - only load files you created yourself.
with open('gradientboosted.pkl', 'rb') as f: 
  reloaded_model = pickle.load(f)

### Predict on Test Data

In [None]:
# Read in the test CSV dataset
test_df = pd.read_csv('test.csv')
# Work on a deep copy so test_df keeps its original columns (PassengerId is
# needed again when building the submission)
abt_test = test_df.copy()
# Run through the same preprocessing function as the training data
preprocessing(abt_test)
# One-hot encode categorical variables
abt_test = pd.get_dummies(abt_test.drop('PassengerId', axis=1))
# Align with the training feature matrix: a category that occurs in only one
# of the two files would otherwise give the model a different set or order of
# columns than it was fitted on. Columns absent from the test data are added
# as all-zero, columns unknown to the model are dropped.
abt_test = abt_test.reindex(columns=X.columns, fill_value=0)

In [None]:
abt_test.head()

In [None]:
# Predict on the preprocessed test features with the fitted 'gb' entry of fit_models
yhat_test = fit_models['gb'].predict(abt_test)

In [None]:
# Build the submission column-wise so each column keeps its own dtype.
# The id header uses the same spelling as the source column ('PassengerId');
# the previous 'PassengerID' did not match the input data.
submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Transported': yhat_test,
})

In [None]:
submission.head()

### Submit to Kaggle

In [None]:
# index=False so that the row index is not written as an extra leading column
submission.to_csv('kaggle_submission.csv', index=False)