In [15]:
import pandas as pd
!pip install joblib
import joblib
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder , StandardScaler



#**Prepare the testing dataset for the competition**

In [16]:
# Prepare the competition test set, run the saved model on it and write the
# submission CSV (PassengerId, Survived).
#
# Steps: load -> drop unused columns -> impute Age/Fare -> one-hot encode the
# object-typed columns -> standardise Age/Fare -> predict -> save.
#
# NOTE(review): the imputer, encoder and scaler below are all *fit on the test
# set itself*. If the model was trained on data transformed with statistics
# learned from the training set, the fitted training transformers should be
# reused here instead - confirm against the training notebook.

# Every input and output of this cell lives under one project folder.
PROJECT_DIR = '/content/drive/MyDrive/ML-AI projects/titanic survival prediction/Titanic_survival_prediction'

#load the testing set
test_filepath = PROJECT_DIR + '/dataset/test.csv'
df = pd.read_csv(test_filepath)

# PassengerId is dropped from the features below but is needed again for the
# submission file, so keep a copy of the column first.
passId = df['PassengerId']
df = df.drop(columns=['Name', 'Ticket', 'Cabin', 'PassengerId'])
print(df.describe())

#check for missing values
missing_values = df.isna().sum()
print("the number of missing values per feature is \n",missing_values)

#handle the missing values: replace NaN in the numeric columns with the column mean
features_to_impute = ['Age', 'Fare']
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
df[features_to_impute] = imputer.fit_transform(df[features_to_impute])
missing_values = df.isna().sum()
print("the number of missing values per feature after imputation is \n",missing_values)

#encoding the categorical values
categorical_features = df.select_dtypes(include=['object']).columns.tolist()
print("the categorical features are ",categorical_features)
# sparse_threshold=0 forces a dense ndarray. With the default (0.3) the result
# silently becomes a scipy sparse matrix whenever the stacked output is sparse
# enough, and the DataFrame construction below expects a dense 2-D array.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), categorical_features)],
    remainder='passthrough',
    sparse_threshold=0,
)
df_encoded = ct.fit_transform(df) # nb the encoder returns an array and not a dataframe

#get new column names (look the fitted encoder up by name rather than by position)
encoded_columns = ct.named_transformers_['encoder'].get_feature_names_out(categorical_features)

#combine all column names: encoder outputs come first, then the passthrough
#columns in their original order
all_columns = list(encoded_columns) + list(df.columns.drop(categorical_features))

#turn the array back to a dataframe
df_encoded = pd.DataFrame(df_encoded, columns=all_columns)
print(df_encoded.shape)
print(df_encoded.describe())

#apply feature scaling (zero mean / unit variance) to the continuous columns
features_to_scale = ['Age', 'Fare']
sc = StandardScaler()
df_encoded[features_to_scale] = sc.fit_transform(df_encoded[features_to_scale])
df_encoded.to_csv(PROJECT_DIR + '/competition/test_dataset_encoded_and_scaled.csv', index=False)

#load the model
# SECURITY: joblib.load unpickles the file and can execute arbitrary code;
# only load model files that come from a trusted source.
model_path = PROJECT_DIR + '/training/best_lgb_model.pkl'
model = joblib.load(model_path)

#get predictions
# NOTE(review): assumes the columns of df_encoded match, in name and order, the
# features the model was trained on - verify against the training notebook.
predictions = model.predict(df_encoded)
print("the predictions made by the model are ",predictions)

#write a csv made up of the passenger ids and their predictions
results_df = pd.DataFrame({'PassengerId': passId, 'Survived': predictions})
results_df.to_csv(PROJECT_DIR + '/competition/predictions.csv', index=False)




           Pclass         Age       SibSp       Parch        Fare
count  418.000000  332.000000  418.000000  418.000000  417.000000
mean     2.265550   30.272590    0.447368    0.392344   35.627188
std      0.841838   14.181209    0.896760    0.981429   55.907576
min      1.000000    0.170000    0.000000    0.000000    0.000000
25%      1.000000   21.000000    0.000000    0.000000    7.895800
50%      3.000000   27.000000    0.000000    0.000000   14.454200
75%      3.000000   39.000000    1.000000    0.000000   31.500000
max      3.000000   76.000000    8.000000    9.000000  512.329200
the number of missing values per feature is 
 Pclass       0
Sex          0
Age         86
SibSp        0
Parch        0
Fare         1
Embarked     0
dtype: int64
the number of missing values per feature after imputation is 
 Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64
the categorical features are  ['Sex', 'Embarked']
(418, 10)
       S

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



the predictions made by the model are  [0 0 0 0 0 0 1 0 1 0 0 0 1 0 1 1 0 0 0 0 1 1 1 1 1 0 1 0 0 0 0 0 1 0 1 0 1
 1 0 0 0 0 0 1 1 0 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 0 0 1 0 1 1 0 0 1 1 0 0 0
 1 1 0 1 0 1 1 0 0 0 0 0 1 1 1 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0
 1 1 1 1 0 0 1 1 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0
 0 0 1 0 0 1 0 0 1 1 0 0 1 1 1 0 0 0 0 0 1 1 0 0 0 0 0 1 1 0 1 1 0 1 1 0 1
 0 1 0 0 0 0 0 0 0 1 0 1 0 0 1 1 1 0 1 0 0 0 0 1 0 0 0 0 1 0 0 1 0 1 0 1 0
 1 0 1 1 0 1 0 0 0 1 0 0 1 0 0 0 1 1 1 1 0 0 0 0 1 0 1 1 1 0 1 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 1 0 0 0 0
 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0
 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0 0 0 1 1 0 1 0 0 0 1 0
 0 1 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0
 0 1 1 1 1 1 0 1 0 0 0]
