In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.exceptions import NotFittedError


In [None]:
data = pd.read_csv('/content/archive (3).zip', encoding='latin1')
data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,0,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,0,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,0,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [None]:
#Data exploration and preprocessing
print(data.head())

   PassengerId  Survived  Pclass  \
0          892         0       3   
1          893         1       3   
2          894         0       2   
3          895         0       3   
4          896         1       3   

                                           Name     Sex   Age  SibSp  Parch  \
0                              Kelly, Mr. James    male  34.5      0      0   
1              Wilkes, Mrs. James (Ellen Needs)  female  47.0      1      0   
2                     Myles, Mr. Thomas Francis    male  62.0      0      0   
3                              Wirz, Mr. Albert    male  27.0      0      0   
4  Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female  22.0      1      1   

    Ticket     Fare Cabin Embarked  
0   330911   7.8292   NaN        Q  
1   363272   7.0000   NaN        S  
2   240276   9.6875   NaN        Q  
3   315154   8.6625   NaN        S  
4  3101298  12.2875   NaN        S  


In [None]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Survived     418 non-null    int64  
 2   Pclass       418 non-null    int64  
 3   Name         418 non-null    object 
 4   Sex          418 non-null    object 
 5   Age          332 non-null    float64
 6   SibSp        418 non-null    int64  
 7   Parch        418 non-null    int64  
 8   Ticket       418 non-null    object 
 9   Fare         417 non-null    float64
 10  Cabin        91 non-null     object 
 11  Embarked     418 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 39.3+ KB
None


In [None]:
print(data.describe())

       PassengerId    Survived      Pclass         Age       SibSp  \
count   418.000000  418.000000  418.000000  332.000000  418.000000   
mean   1100.500000    0.363636    2.265550   30.272590    0.447368   
std     120.810458    0.481622    0.841838   14.181209    0.896760   
min     892.000000    0.000000    1.000000    0.170000    0.000000   
25%     996.250000    0.000000    1.000000   21.000000    0.000000   
50%    1100.500000    0.000000    3.000000   27.000000    0.000000   
75%    1204.750000    1.000000    3.000000   39.000000    1.000000   
max    1309.000000    1.000000    3.000000   76.000000    8.000000   

            Parch        Fare  
count  418.000000  417.000000  
mean     0.392344   35.627188  
std      0.981429   55.907576  
min      0.000000    0.000000  
25%      0.000000    7.895800  
50%      0.000000   14.454200  
75%      0.000000   31.500000  
max      9.000000  512.329200  


In [None]:
# Handle missing values
data['Age'].fillna(data['Age'].median(), inplace=True)
data['Embarked'].fillna(data['Embarked'].mode()[0], inplace=True)

In [None]:

# Verify no NaNs remain in the dataset
print("\nChecking for NaNs in the dataset after filling missing values:")
print(data.isnull().sum())


Checking for NaNs in the dataset after filling missing values:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


In [None]:
# Drop unnecessary columns
data.drop(['Name', 'Ticket', 'Cabin'], axis=1, inplace=True)

In [None]:
# Encode categorical variables
label_encoder = LabelEncoder()
data['Sex'] = label_encoder.fit_transform(data['Sex'])
data['Embarked'] = label_encoder.fit_transform(data['Embarked'])

In [None]:
# Verify no NaNs remain after encoding
print("\nChecking for NaNs after encoding categorical variables:")
print(data.isnull().sum())


Checking for NaNs after encoding categorical variables:
PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           1
Embarked       0
dtype: int64


In [None]:
# Feature and target variable
X = data.drop('Survived', axis=1)
y = data['Survived']

In [None]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Check for NaNs in the train and test sets
print("\nChecking for NaNs in X_train and X_test:")
print("X_train NaNs:", pd.DataFrame(X_train).isnull().sum())
print("X_test NaNs:", pd.DataFrame(X_test).isnull().sum())


Checking for NaNs in X_train and X_test:
X_train NaNs: PassengerId    0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           1
Embarked       0
dtype: int64
X_test NaNs: PassengerId    0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64


In [None]:
# Ensure no NaNs before fitting
print("\nChecking for NaNs in X_train before fitting:")
print(np.isnan(X_train).sum())
print("Checking for NaNs in X_test before fitting:")
print(np.isnan(X_test).sum())


Checking for NaNs in X_train before fitting:
PassengerId    0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           1
Embarked       0
dtype: int64
Checking for NaNs in X_test before fitting:
PassengerId    0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64


In [None]:
# Standardize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Double-check for NaNs after scaling
print("\nChecking for NaNs in X_train and X_test after scaling:")
print("X_train NaNs:", np.isnan(X_train).sum())
print("X_test NaNs:", np.isnan(X_test).sum())


Checking for NaNs in X_train and X_test after scaling:
X_train NaNs: 1
X_test NaNs: 0


In [None]:
# Identify rows with NaNs in X_train
if np.isnan(X_train).sum() > 0:
    print("Rows in X_train with NaNs:")
    print(np.argwhere(np.isnan(X_train)))

Rows in X_train with NaNs:
[[48  6]]


In [None]:
# Identify rows with NaNs in X_test
if np.isnan(X_test).sum() > 0:
    print("Rows in X_test with NaNs:")
    print(np.argwhere(np.isnan(X_test)))

In [None]:
# Train a RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, random_state=42)

In [None]:
# Ensure no NaNs before fitting
print("\nFinal check for NaNs in X_train before fitting:")
print(np.isnan(X_train).sum())
print("Final check for NaNs in X_test before fitting:")
print(np.isnan(X_test).sum())


Final check for NaNs in X_train before fitting:
1
Final check for NaNs in X_test before fitting:
0


In [None]:

# Check the shapes of X_train and y_train
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of y_train: {y_train.shape}")

Shape of X_train: (334, 8)
Shape of y_train: (334,)


In [None]:
# Check the balance of the target variable
print("Class distribution in y_train:")
print(y_train.value_counts())

Class distribution in y_train:
Survived
0    216
1    118
Name: count, dtype: int64


In [None]:
# Fit the model
try:
    model.fit(X_train, y_train)
    print("Model fitting successful!")
except ValueError as e:
    print(f"Error in model fitting: {e}")
except Exception as e:
    print(f"Unexpected error in model fitting: {e}")

Error in model fitting: Input X contains NaN.
RandomForestClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values


In [None]:
try:
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'\nAccuracy: {accuracy}')
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))
    print('Classification Report:')
    print(classification_report(y_test, y_pred))

    # Feature importance
    feature_importances = model.feature_importances_
    features = X.columns
    importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
    importance_df = importance_df.sort_values(by='Importance', ascending=False)
    plt.figure(figsize=(10, 6))
    sns.barplot(x='Importance', y='Feature', data=importance_df)
    plt.title('Feature Importance')
    plt.show()
except NotFittedError as e:
    print(f"Model is not fitted: {e}")
except ValueError as e:
    print(f"Error in model prediction: {e}")
except Exception as e:
    print(f"Unexpected error in model prediction: {e}")

Model is not fitted: This RandomForestClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.
