#**Importing necessary libraries**

In [62]:
import os

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder ,LabelEncoder, StandardScaler

In [1]:
# Make Google Drive available to the Colab runtime. The dataset path and the
# output folder used by this notebook both live under this mount point.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


#**Importing the dataset**

In [108]:
# Read the training CSV from the mounted Drive folder into a DataFrame.
df_filepath='/content/drive/MyDrive/ML-AI projects/titanic survival prediction/Titanic_survival_prediction/dataset/train.csv'
df = pd.read_csv(filepath_or_buffer=df_filepath)

# Separate the target column ('Survived') from the feature columns.
y = df['Survived']
X = df.drop(columns=['Survived'])

# Quick look at the raw frame: per-column dtypes, summary statistics of the
# numeric columns, and the number of distinct values in each column.
print(df.dtypes,"\n")
print(df.describe())
print("\n","the number of unique values per feature is \n",df.nunique())

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object 

       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.486592    0.836071   14.526497    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%     223.500000    0.000000    2.000000   20.125000    0.000000   
50%     446.000000    0.000000    3.000000   28.000000    0.000000   
75%     668.500000    1.000000    3.000000   38.000000    1.000000   
max     891.000000    1.000000    3.000000   80.000000    8.000000   

            Parch        Fare  
count  891.000000  891.000000  
mean     0.38

#**Missing data**

In [109]:
# Report how many values are missing in each column of the raw frame.
missing_count = df.isnull().sum()
print("missing values per feature are \n",missing_count)

# 'Cabin' is removed outright instead of being imputed.
X = X.drop(columns=['Cabin'])

# Fill the remaining gaps: 'Age' with the column mean, 'Embarked' with its
# most frequent value.
# NOTE(review): both imputers are fitted on the full X, i.e. before the
# train/test split done later in this notebook, so the fill values are computed
# from rows that end up in the test set as well — confirm this is acceptable.
for strategy, features_to_impute in (('mean', ['Age']), ('most_frequent', ['Embarked'])):
    imputer = SimpleImputer(missing_values=np.nan, strategy=strategy)
    X[features_to_impute] = imputer.fit_transform(X[features_to_impute])

# Re-count to confirm nothing is missing any more.
missing_count = X.isnull().sum()
print("missing values per feature after imputation are \n",missing_count)

# Drop the remaining columns that are not used as features.
X = X.drop(columns=['PassengerId', 'Name', 'Ticket'])


missing values per feature are 
 PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
missing values per feature after imputation are 
 PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64


#**Encoding categorical data**

In [127]:

# One-hot encode every object-dtype column of X and pass all other columns
# through unchanged, then rebuild a labelled DataFrame from the result.
categorical_features = X.select_dtypes(include=['object']).columns.tolist()
print("Categorical features:", categorical_features)

# sparse_threshold=0 forces a dense array: OneHotEncoder emits a sparse matrix
# by default, and ColumnTransformer returns a sparse result when the stacked
# output is sparse enough, which the pd.DataFrame(..., columns=...) call below
# is not written to handle.
# verbose_feature_names_out=False keeps plain output names ("Sex_female",
# "Pclass") instead of prefixed ones ("encoder__Sex_female", "remainder__Pclass").
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), categorical_features)],
    remainder='passthrough',
    sparse_threshold=0,
    verbose_feature_names_out=False,
)
X_encoded = ct.fit_transform(X)

# Names of the one-hot columns alone (kept available for later inspection).
encoded_columns = ct.named_transformers_['encoder'].get_feature_names_out(categorical_features)

# Ask the fitted transformer for the output column names instead of rebuilding
# them by hand, so the labels always follow the real output column order.
all_columns = list(ct.get_feature_names_out())

# Convert the result to a DataFrame with proper column names.
X_encoded = pd.DataFrame(X_encoded, columns=all_columns)

print(X_encoded.shape)
print(X_encoded.describe())



Categorical features: ['Sex', 'Embarked']
(891, 10)
       Sex_female    Sex_male  Embarked_C  Embarked_Q  Embarked_S      Pclass  \
count  891.000000  891.000000  891.000000  891.000000  891.000000  891.000000   
mean     0.352413    0.647587    0.188552    0.086420    0.725028    2.308642   
std      0.477990    0.477990    0.391372    0.281141    0.446751    0.836071   
min      0.000000    0.000000    0.000000    0.000000    0.000000    1.000000   
25%      0.000000    0.000000    0.000000    0.000000    0.000000    2.000000   
50%      0.000000    1.000000    0.000000    0.000000    1.000000    3.000000   
75%      1.000000    1.000000    0.000000    0.000000    1.000000    3.000000   
max      1.000000    1.000000    1.000000    1.000000    1.000000    3.000000   

              Age       SibSp       Parch        Fare  
count  891.000000  891.000000  891.000000  891.000000  
mean    29.699118    0.523008    0.381594   32.204208  
std     13.002015    1.102743    0.806057   49.693

#**Splitting the dataset**

In [129]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# train_test_split already returns pandas objects for pandas inputs, so no
# array-to-DataFrame conversion is needed. Explicit copies make the feature
# splits independent frames that later cells can assign columns on safely.
X_train = X_train.copy()
X_test = X_test.copy()

# Store the targets as single-column DataFrames named 'Survived'.
y_train = y_train.to_frame(name='Survived')
y_test = y_test.to_frame(name='Survived')



#**Feature scaling**

In [130]:
# Summary statistics of the training features before scaling.
print(X_train.describe())

# Standardise the 'Age' and 'Fare' columns. The scaler is fitted on the
# training split only and the same fitted scaler is then applied to the test
# split.
sc = StandardScaler()
features_to_scale = ['Age', "Fare"]

sc.fit(X_train[features_to_scale])
X_train[features_to_scale] = sc.transform(X_train[features_to_scale])
X_test[features_to_scale] = sc.transform(X_test[features_to_scale])



       Sex_female    Sex_male  Embarked_C  Embarked_Q  Embarked_S      Pclass  \
count  712.000000  712.000000  712.000000  712.000000  712.000000  712.000000   
mean     0.344101    0.655899    0.175562    0.084270    0.740169    2.330056   
std      0.475408    0.475408    0.380714    0.277987    0.438850    0.824584   
min      0.000000    0.000000    0.000000    0.000000    0.000000    1.000000   
25%      0.000000    0.000000    0.000000    0.000000    0.000000    2.000000   
50%      0.000000    1.000000    0.000000    0.000000    1.000000    3.000000   
75%      1.000000    1.000000    0.000000    0.000000    1.000000    3.000000   
max      1.000000    1.000000    1.000000    1.000000    1.000000    3.000000   

              Age       SibSp       Parch        Fare  
count  712.000000  712.000000  712.000000  712.000000  
mean    29.538225    0.553371    0.379213   32.586276  
std     12.994548    1.176404    0.791669   51.969529  
min      0.420000    0.000000    0.000000    0

#**Save preprocessed dataset**

In [131]:
# Write the four preprocessed splits as CSV files (without the index column)
# into the preprocessing folder on Drive.
folder_path='/content/drive/MyDrive/ML-AI projects/titanic survival prediction/Titanic_survival_prediction/preprocessing'

# to_csv does not create missing parent directories, so make sure the target
# folder exists before writing; exist_ok avoids an error on re-runs.
os.makedirs(folder_path, exist_ok=True)

X_train_path=folder_path+'/X_train.csv'
y_train_path=folder_path+'/y_train.csv'
X_test_path=folder_path+'/X_test.csv'
y_test_path=folder_path+'/y_test.csv'

for split_frame, split_path in (
    (X_train, X_train_path),
    (y_train, y_train_path),
    (X_test, X_test_path),
    (y_test, y_test_path),
):
    split_frame.to_csv(split_path, index=False)