# Kaggle Tutorial
## Steps

### Read data
### EDA - Exploratory Data Analysis
### Preprocessing
### Model fit
### Model Validation
### Metrics

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import os
# Walk the Kaggle input directory recursively and print the full path of
# every file found under it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
np.array([1, 2, 3])

## Read Data

In [None]:
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")

In [None]:
train_data.tail(5)

In [None]:
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")

In [None]:
test_data.head(5)

In [None]:
# Survived is a 0/1 flag, so its mean is the survival rate. Series.mean() is
# vectorised, unlike the builtin sum(), which iterates the Series in Python.
survival_rate = train_data['Survived'].mean()
print(f"% of people who survived: {survival_rate * 100:.2f}%")

In [None]:
# Survival rate among male passengers: select rows and column in a single
# .loc call (no chained indexing) and take the mean of the 0/1 flag.
train_data.loc[train_data['Sex'] == 'male', 'Survived'].mean()

In [None]:
# Survival rate among female passengers: select rows and column in a single
# .loc call (no chained indexing) and take the mean of the 0/1 flag.
train_data.loc[train_data['Sex'] == 'female', 'Survived'].mean()

In [None]:
train_data.loc[train_data['Sex']=='male']

In [None]:
# Fraction of training rows whose Sex is 'male'; the mean of a boolean mask
# is the share of True values.
male_share = (train_data['Sex'] == 'male').mean()
print(f"% of male {male_share * 100:.2f}%")

## EDA - Exploratory Data Analysis

## Visualize analytics for all features

In [None]:
!pip install ydata-profiling

In [None]:
from ydata_profiling import ProfileReport

In [None]:
report = ProfileReport(train_data, title="Titanic Train Report", correlations = {
    "pearson": {"calculate": True},
    "spearman": {"calculate": True},
    "kendall": {"calculate": False},
  },)

In [None]:
report

In [None]:
train_data.head(5)

In [None]:
train_data['Age'].hist();

## Create a new feature FamilySize

In [None]:
train_data['FamilySize'] = train_data['SibSp'] + train_data['Parch']

In [None]:
train_data.head(5)

### Every change or new feature added to the training data must also be applied to the test data

In [None]:
test_data['FamilySize'] = test_data['SibSp'] + test_data['Parch']

In [None]:
test_data.head(5)

## Extract Title component from Name feature

### Any change to be persistent must be assigned to the dataframe
### Changes are not in place

In [None]:
# Preview only (the result is not assigned): extract the word that is
# preceded by a space and followed by a period from each Name.
# The pattern is a raw string so that "\." reaches the regex engine as an
# escaped dot instead of being an invalid string escape sequence.
train_data['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

In [None]:
# Store the extracted title as a new column. The pattern is a raw string so
# that "\." is a regex-escaped dot rather than an invalid string escape.
train_data['Title'] = train_data['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

In [None]:
train_data

In [None]:
# Extract the title from Name and normalise it identically for train and test.
# Raw string: "\." must reach the regex engine as an escaped dot.
title_pattern = r' ([A-Za-z]+)\.'

# Infrequent titles are merged into a single 'Rare' category.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
# Alternative spellings are mapped onto the common title.
title_aliases = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}

# One loop keeps both frames on exactly the same transformation.
for frame in (train_data, test_data):
    titles = frame['Name'].str.extract(title_pattern, expand=False)
    titles = titles.replace(rare_titles, 'Rare')
    frame['Title'] = titles.replace(title_aliases)

In [None]:
train_data

## Save the test data IDs for the submission data frame

In [None]:
test_ids = test_data['PassengerId']

In [None]:
test_ids

## Drop unnecessary columns from the model

### Once again, changes are not persistent and must be assigned to the original dataframe

In [None]:
train_data.drop(["Ticket", "Name", "PassengerId", "SibSp", "Parch"], axis=1)

In [None]:
train_data

In [None]:
def drop_unnecessary_columns(data):
    """Return a new frame without the columns the model does not use.

    ``DataFrame.drop`` returns a new frame, so the ``data`` argument itself
    is not modified; callers must use the returned value.
    """
    unused = ["Ticket", "Name", "PassengerId", "SibSp", "Parch"]
    return data.drop(columns=unused)

train_data = drop_unnecessary_columns(train_data)
test_data = drop_unnecessary_columns(test_data)

In [None]:
train_data

## Split data into train and validation sets

In [None]:
X = train_data.drop(['Survived'], axis=1)
y = train_data['Survived']

In [None]:
X

In [None]:
y

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

### Stratify ensures that train and validation have the same proportion of 0 and 1 in the Survived target

In [None]:
sum(y_train) / len(y_train)

In [None]:
sum(y_val) / len(y_val)

In [None]:
sum(train_data['Survived']) / len(train_data)

In [None]:
pd.isna(X_train['Age'])

In [None]:
X_train['Age']

## Data imputation - Replace missing values with the median of the matching subgroup

### Example: Age median based on Sex and Pclass

In [None]:
X_train.groupby(['Sex', 'Pclass'])['Age'].median()

In [None]:
print("Yes") if 1 == 1 else print("No")

In [None]:
X_train['Age'].fillna(X_train['Age'].median())

In [None]:
# Compute median Age for each (Sex, Pclass) group from the training split
# only; the same values are then reused for validation and test.
age_medians = X_train.groupby(['Sex', 'Pclass'])['Age'].median()
# Fallback for a (Sex, Pclass) group that is absent from the training split
# or whose training ages are all missing.
overall_age_median = X_train['Age'].median()


def fill_missing_age(frame):
    """Return frame['Age'] with missing values filled from the training medians.

    Each row is matched to its (Sex, Pclass) training median through a left
    join, which keeps the frame's own index and row order. Rows whose group
    has no usable median fall back to the overall training median instead
    of raising KeyError or staying NaN.
    """
    group_median = frame[['Sex', 'Pclass']].join(
        age_medians.rename('group_median'), on=['Sex', 'Pclass']
    )['group_median']
    return frame['Age'].fillna(group_median).fillna(overall_age_median)


# Fill missing Age values in train, validation and test data using the training medians
X_train['Age'] = fill_missing_age(X_train)
X_val['Age'] = fill_missing_age(X_val)
test_data['Age'] = fill_missing_age(test_data)

In [None]:
X_train.isnull().any()

In [None]:
X_train.groupby(['FamilySize', 'Pclass'])['Fare'].median()

In [None]:
# Compute median Fare for each (FamilySize, Pclass) group from the training
# split only; the same values are then reused for validation and test.
fare_medians = X_train.groupby(['FamilySize', 'Pclass'])['Fare'].median()
# Fallback for a (FamilySize, Pclass) group that is absent from the training
# split or whose training fares are all missing.
overall_fare_median = X_train['Fare'].median()


def fill_missing_fare(frame):
    """Return frame['Fare'] with missing values filled from the training medians.

    Each row is matched to its (FamilySize, Pclass) training median through
    a left join, which keeps the frame's own index and row order. Rows whose
    group has no usable median fall back to the overall training median
    instead of raising KeyError or staying NaN.
    """
    group_median = frame[['FamilySize', 'Pclass']].join(
        fare_medians.rename('group_median'), on=['FamilySize', 'Pclass']
    )['group_median']
    return frame['Fare'].fillna(group_median).fillna(overall_fare_median)


# Fill missing Fare values in train, validation and test data using the training medians
X_train['Fare'] = fill_missing_fare(X_train)
X_val['Fare'] = fill_missing_fare(X_val)
test_data['Fare'] = fill_missing_fare(test_data)

In [None]:
X_train

In [None]:
X_train

In [None]:
def fill_missing_values_with_zero(data):
    """Replace each listed column with a 0/1 flag, modifying ``data`` in place.

    Missing entries are first set to 0; afterwards every entry that is a
    plain ``str`` becomes 1 and everything else becomes 0. The same frame
    object is returned.
    """
    cols = ['Cabin']  # 77% of the data is missing

    def _text_flag(value):
        # Only exact str instances count as "value present".
        if type(value) is str:
            return 1
        return 0

    for col in cols:
        without_gaps = data[col].fillna(0)
        data[col] = without_gaps.apply(_text_flag)

    return data

def fill_missing_values_with_mode(train_data, validation_data, test_data):
    """Fill missing values in the listed columns with the training mode.

    The most frequent value is taken from ``train_data`` only and applied to
    all three frames. The frames are modified in place and returned in the
    same order they were passed.

    Raises:
        KeyError: if ``train_data`` has no non-missing value in a listed
            column, because ``mode()`` is then empty.
    """
    cols = ['Embarked']  # 2 values missing

    for col in cols:
        # Compute the mode once, before any frame is modified, instead of
        # recomputing it from the already-filled training column.
        most_frequent = train_data[col].mode()[0]
        for frame in (train_data, validation_data, test_data):
            frame[col] = frame[col].fillna(most_frequent)

    return train_data, validation_data, test_data


X_train = fill_missing_values_with_zero(X_train)
X_val = fill_missing_values_with_zero(X_val)
test_data = fill_missing_values_with_zero(test_data)

X_train, X_val, test_data = fill_missing_values_with_mode(X_train, X_val, test_data)

In [None]:
X_train

## Replace text/string values in columns with numeric labels

### female -> 0
### male -> 1

In [None]:
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()

In [None]:
label_encoder#.transform

### fit_transform - learns the label-to-integer mapping from the data and returns the encoded values
### transform - applies the already learned mapping to new data

In [None]:


# Columns to encode. A list (not a set) fixes the processing order, so the
# classes are printed in the same order on every run.
cols = ['Embarked', 'Title', 'Sex']

for col in cols:
    # Learn the mapping on the training split, then reuse that same mapping
    # for validation and test so the integer codes mean the same thing.
    X_train[col] = label_encoder.fit_transform(X_train[col])
    X_val[col] = label_encoder.transform(X_val[col])
    test_data[col] = label_encoder.transform(test_data[col])
    # classes_ lists the original labels; a label's position is its code.
    print(label_encoder.classes_)

In [None]:
(train_data.loc[train_data['Title']=='Master'])

In [None]:
(X_train['FamilySize'] == 0).astype(int)

## Create new feature - IsAlone
### Single people on Titanic have lower survival rate

In [None]:
X_train['IsAlone'] = (X_train['FamilySize'] == 0).astype(int)
X_val['IsAlone'] = (X_val['FamilySize'] == 0).astype(int)
test_data['IsAlone'] = (test_data['FamilySize'] == 0).astype(int)

In [None]:
X_train.columns

## features list specifies the features on which the algorithm is applied

### We can remove some features to see how the metrics are changed: increase/decrease

In [None]:
features = ['Pclass', 'Sex', 'Age', 'Fare', 'Cabin', 'Embarked',
       'FamilySize', 'Title', 'IsAlone']

from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
from sklearn.naive_bayes import GaussianNB


# Train GaussianNB on the columns listed in `features`
gnb = GaussianNB()
gnb.fit(X_train[features], y_train) # fit = training
# NOTE: despite the name, gnb_probs holds the output of predict() on the
# training split, not the output of predict_proba().
gnb_probs = gnb.predict(X_train[features])

In [None]:
gnb_probs

In [None]:
# Evaluate performance
accuracy = accuracy_score(y_train, gnb_probs)
print(f"Naïve Bayes Train Accuracy: {accuracy:.4f}")

In [None]:
confusion_matrix(y_train, gnb_probs)

In [None]:
# Compute Confusion Matrix on the training split and flatten it into the
# four counts unpacked below as tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(y_train, gnb_probs).ravel()

# Compute FPR & FNR
fpr = fp / (fp + tn)  # False Positive Rate: false positives over all actual negatives
fnr = fn / (fn + tp)  # False Negative Rate: false negatives over all actual positives

print(f"False Positive Rate (FPR): {fpr:.4f}")
print(f"False Negative Rate (FNR): {fnr:.4f}")

In [None]:
# Predict on the validation split with the already fitted model.
gnb_probs_val = gnb.predict(X_val[features])

# Evaluate performance
accuracy_val = accuracy_score(y_val, gnb_probs_val)
print(f"Naïve Bayes Validation Accuracy: {accuracy_val:.4f}")

# Compute Confusion Matrix on the validation split and flatten it into the
# four counts unpacked below as tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(y_val, gnb_probs_val).ravel()

# Compute FPR & FNR
fpr = fp / (fp + tn)  # False Positive Rate: false positives over all actual negatives
fnr = fn / (fn + tp)  # False Negative Rate: false negatives over all actual positives

print(f"False Positive Rate (FPR): {fpr:.4f}")
print(f"False Negative Rate (FNR): {fnr:.4f}")

In [None]:
test_data

In [None]:
gnb_probs_test = gnb.predict(test_data[features])

In [None]:
gnb_probs_test

In [None]:
test_ids.values

## Write results into a new dataframe

In [None]:
result_df = pd.DataFrame({"PassengerId": test_ids.values})

In [None]:
result_df['Survived'] = gnb_probs_test

In [None]:
result_df.to_csv("NaiveBayes.csv", index=False)

## 1) Women first!

In [None]:
test_data['Survived'] = 0

In [None]:
# Set Survived to 1 for every row whose Sex value equals 0.
# NOTE(review): Sex is label-encoded at this point; this assumes 'female' was
# mapped to 0 — confirm against the printed encoder classes.
test_data.loc[test_data['Sex'] == 0, "Survived"] = 1

In [None]:
submission_women = pd.DataFrame({
    'PassengerId': test_ids,
    'Survived': test_data['Survived']
})
submission_women.to_csv('submission_women_first.csv', index=False)

In [None]:
test_data

In [None]:
# Reset the rule-based prediction, then set Survived to 1 for every row whose
# Sex value equals 0 or whose Age is below 12.
# NOTE(review): Sex is label-encoded at this point; this assumes 'female' was
# mapped to 0 — confirm against the printed encoder classes.
test_data['Survived'] = 0
test_data.loc[(test_data['Sex'] == 0) | (test_data['Age'] < 12), 'Survived'] = 1

In [None]:
submission_women_children = pd.DataFrame({
    'PassengerId': test_ids,
    'Survived': test_data['Survived']
})
submission_women_children.to_csv('submission_women_and_children_first.csv', index=False)

In [None]:
test_data