##  Step 1: Load and Explore the Dataset

In [23]:
import pandas as pd

# Load the Titanic dataset (assuming it's available as 'titanic.csv')
data = pd.read_csv('tested.csv')

# Drop irrelevant or high-cardinality columns
data.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=True)

# Display the first few rows of the dataset
print(data.head())

# Check for missing values
missing_values = data.isnull().sum()
print("Missing Values in the Dataset:")
print(missing_values)


   Survived  Pclass     Sex   Age  SibSp  Parch     Fare Embarked
0         0       3    male  34.5      0      0   7.8292        Q
1         1       3  female  47.0      1      0   7.0000        S
2         0       2    male  62.0      0      0   9.6875        Q
3         0       3    male  27.0      0      0   8.6625        S
4         1       3  female  22.0      1      1  12.2875        S
Missing Values in the Dataset:
Survived     0
Pclass       0
Sex          0
Age         86
SibSp        0
Parch        0
Fare         1
Embarked     0
dtype: int64


##  Step 2: Data Preprocessing and Feature Engineering

In [24]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

# Handling missing values
# Fill missing values in 'Age' with the median age
# Fill missing values in 'Fare' with the median fare
data['Age'].fillna(data['Age'].median(), inplace=True)
data['Fare'].fillna(data['Fare'].median(), inplace=True)

# Separate features and target variable
X = data.drop('Survived', axis=1)  # Features
y = data['Survived']  # Target variable

# Encoding categorical variables
X['Sex'] = LabelEncoder().fit_transform(X['Sex'])  # Encode 'Sex' as numerical
X = pd.get_dummies(X, columns=['Embarked'], drop_first=True)  # One-hot encode 'Embarked'


## Step 3: Data Splitting

In [25]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


## Step 4: Choose a Classification Algorithm

In [26]:
from sklearn.linear_model import LogisticRegression

# Initialize the Logistic Regression classifier
clf = LogisticRegression()

# Train the classifier on the training data
clf.fit(X_train, y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


##  Step 4: Model Evaluation

In [27]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Make predictions on the test data
y_pred = clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Generate a classification report with precision, recall, and F1-score
report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)

# Calculate and display the confusion matrix
confusion = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(confusion)


Accuracy: 1.00
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        50
           1       1.00      1.00      1.00        34

    accuracy                           1.00        84
   macro avg       1.00      1.00      1.00        84
weighted avg       1.00      1.00      1.00        84

Confusion Matrix:
[[50  0]
 [ 0 34]]
