In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import plot_tree,DecisionTreeClassifier
from sklearn.svm import SVC

In [8]:
df = sns.load_dataset("titanic")

In [9]:
df.drop_duplicates(inplace=True)
df.drop("deck",inplace=True,axis=1)
categorical= df.select_dtypes(include=["object","category"]).columns
numerical= df.select_dtypes(include=["float64","int64"]).columns
df[numerical] = df[numerical].fillna(df[numerical].mean())
df[categorical] = df[categorical].fillna(df[categorical].mode().iloc[0])

In [10]:
df = sns.load_dataset("titanic")

In [11]:
df.drop_duplicates(inplace=True)
df.drop("deck",inplace=True,axis=1)
categorical= df.select_dtypes(include=["object","category"]).columns
numerical= df.select_dtypes(include=["float64","int64"]).columns
df[numerical] = df[numerical].fillna(df[numerical].mean())
df[categorical] = df[categorical].fillna(df[categorical].mode().iloc[0])

In [12]:
df = sns.load_dataset("titanic")

In [13]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [14]:
df.isnull().sum()

Unnamed: 0,0
survived,0
pclass,0
sex,0
age,177
sibsp,0
parch,0
fare,0
embarked,2
class,0
who,0


In [15]:
df.duplicated().sum()

np.int64(107)

In [16]:
df.drop_duplicates(inplace=True)

In [17]:
df.drop("deck",inplace=True,axis=1)

In [18]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,Southampton,no,True


In [19]:
categorical= df.select_dtypes(include=["object","category"]).columns
numerical= df.select_dtypes(include=["float64","int64"]).columns

In [20]:
df[numerical] = df[numerical].fillna(df[numerical].mean())

In [21]:
df[categorical] = df[categorical].fillna(df[categorical].mode().iloc[0])

In [22]:
df.isnull().sum()

Unnamed: 0,0
survived,0
pclass,0
sex,0
age,0
sibsp,0
parch,0
fare,0
embarked,0
class,0
who,0


In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 784 entries, 0 to 890
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     784 non-null    int64   
 1   pclass       784 non-null    int64   
 2   sex          784 non-null    object  
 3   age          784 non-null    float64 
 4   sibsp        784 non-null    int64   
 5   parch        784 non-null    int64   
 6   fare         784 non-null    float64 
 7   embarked     784 non-null    object  
 8   class        784 non-null    category
 9   who          784 non-null    object  
 10  adult_male   784 non-null    bool    
 11  embark_town  784 non-null    object  
 12  alive        784 non-null    object  
 13  alone        784 non-null    bool    
dtypes: bool(2), category(1), float64(2), int64(4), object(5)
memory usage: 75.9+ KB


# Task
Train a machine learning model to predict the `price` column using the data in the DataFrame `df`.

## Identify features and target

### Subtask:
Separate the DataFrame into features (X) and the target variable (y).


**Reasoning**:
Create the feature dataframe X by dropping the target variable column 'survived' from the dataframe df, and create the target series y containing only the 'survived' column.



In [24]:
X = df.drop('survived', axis=1)
y = df['survived']

## Encode categorical features

### Subtask:
Convert categorical features into a numerical format suitable for machine learning models.


**Reasoning**:
Convert the categorical features in X to numerical format using one-hot encoding and then combine them with the numerical features.



In [25]:
categorical_cols = X.select_dtypes(include=["object", "category"]).columns
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

## Split data

### Subtask:
Split the data into training and testing sets.


**Reasoning**:
Split the data into training and testing sets using train_test_split.



In [26]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Train model

### Subtask:
Choose and train a machine learning model on the training data.


**Reasoning**:
Instantiate and train a Logistic Regression model using the training data.



In [32]:
models = {
    "LogisticRegression":LogisticRegression(),
    "KNN":KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier(),
    "SVM":SVC()
}

## Evaluate model

### Subtask:
Evaluate the performance of the trained model on the testing data.


**Reasoning**:
Make predictions on the test set, calculate accuracy, generate a classification report and confusion matrix, and print the results.



In [33]:
for name,model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    matrix = confusion_matrix(y_test, y_pred)

    print(f"--- {name} ---")
    print(f"Accuracy: {accuracy}")
    print("Classification Report:")
    print(report)
    print("Confusion Matrix:")
    print(matrix)
    print("\n")

--- LogisticRegression ---
Accuracy: 1.0
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        88
           1       1.00      1.00      1.00        69

    accuracy                           1.00       157
   macro avg       1.00      1.00      1.00       157
weighted avg       1.00      1.00      1.00       157

Confusion Matrix:
[[88  0]
 [ 0 69]]


--- KNN ---
Accuracy: 0.6815286624203821
Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.75      0.73        88
           1       0.65      0.59      0.62        69

    accuracy                           0.68       157
   macro avg       0.68      0.67      0.67       157
weighted avg       0.68      0.68      0.68       157

Confusion Matrix:
[[66 22]
 [28 41]]


--- Naive Bayes ---
Accuracy: 1.0
Classification Report:
              precision    recall  f1-score   support

           0       1.00     

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
