In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [None]:
# Location of the Titanic CSV files on the local machine.
home_folder = "/home/ninja"
data_path = f"{home_folder}/datasets/titanic"

# Read the three CSVs in one pass; the order of the file names fixes the
# order in which the frames are unpacked.
train_data_df, test_data_df, gender_data_df = (
    pd.read_csv(f"{data_path}/{csv_name}")
    for csv_name in ("train.csv", "test.csv", "gender_submission.csv")
)

### Plano

- `PassengerId`: descartar
- `Survived`: rótulo, `y`
- `Pclass`: fica igual
- `Name`: trocar pelas colunas do `bag-of-words`
- `Sex`: transformar em binário
- `Age`: trocar `NaN` pela média
- `SibSp`: fica igual
- `Parch`: fica igual
- `Ticket`: trocar pelas colunas do `bag-of-words`
- `Fare`: transformar em `float`
- `Cabin`: trocar `NaN` por string vazia e trocar pelas colunas do `bag-of-words`
- `Embarked`: `one-hot encoding` e excluir os `NaN`

In [None]:
def simple_preprocess(df):
    """Impute and encode the non-text columns of a passenger frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns ``Fare``, ``Age``, ``Sex``,
        ``Cabin`` and ``Embarked``.

    Returns
    -------
    pandas.DataFrame
        A new frame where ``Fare`` and ``Age`` are floats with missing values
        replaced by the column mean, ``Sex`` is 1 for ``"male"`` and 0
        otherwise, missing ``Cabin`` is the empty string, missing ``Embarked``
        is ``"S"``, and one ``embarked_<value>`` indicator column is appended
        per distinct ``Embarked`` value. The original ``Embarked`` column is
        kept.

    Notes
    -----
    The means are computed over every row of ``df``; callers that pass train
    and test rows together get a mean that includes the test rows.
    """
    # Work on a copy so the column assignments below never leak into the
    # frame owned by the caller.
    df = df.copy()

    for column in ("Fare", "Age"):
        values = df[column].astype(float)
        df[column] = values.fillna(values.mean())

    # Anything that is not exactly "male" (including missing values) maps to 0.
    df["Sex"] = (df["Sex"] == "male").astype("int64")

    df["Cabin"] = df["Cabin"].fillna("")
    df["Embarked"] = df["Embarked"].fillna("S")

    embarked_dummies = pd.get_dummies(df["Embarked"], prefix="embarked")
    return pd.concat([df, embarked_dummies], axis=1)

# bow
def _create_bow(df,column,min_df,vectorizer_model):
    """Turn one text column into a dense bag-of-words frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column.
    column : str
        Name of the text column to vectorize.
    min_df : int or float
        Forwarded to the vectorizer's ``min_df`` argument.
    vectorizer_model : str
        ``"tfidf"`` selects ``TfidfVectorizer``; any other value selects
        ``CountVectorizer``.

    Returns
    -------
    pandas.DataFrame
        One column per vocabulary term and one row per row of ``df``,
        carrying ``df``'s index so that a later column-wise concat lines up
        by label rather than by position.

    Notes
    -----
    The vectorizer is fit on every row of ``df`` passed in.
    """
    vectorizer_cls = TfidfVectorizer if vectorizer_model == "tfidf" else CountVectorizer
    vectorizer = vectorizer_cls(
        analyzer='word',
        ngram_range=(1,1),
        max_df=1.0,
        min_df=min_df,
    )
    docs = vectorizer.fit_transform(df[column])
    return pd.DataFrame(
        data=docs.toarray(),
        columns=vectorizer.get_feature_names_out(),
        index=df.index,  # keep row labels aligned with the source frame
    )

def create_bows(df,vectorizer_model):
    """Build the bag-of-words columns for ``Name``, ``Ticket`` and ``Cabin``.

    Each column is vectorized separately by ``_create_bow`` and the resulting
    frames are placed side by side, in that column order.
    """
    # Minimum document frequency used for each text column.
    min_df_by_column = {"Name": 1, "Ticket": 2, "Cabin": 1}

    combined = pd.DataFrame()
    for text_column, threshold in min_df_by_column.items():
        column_bow = _create_bow(df, text_column, threshold, vectorizer_model)
        combined = pd.concat([combined, column_bow], axis=1)
    return combined

def _run_preprocess(df,vectorizer_model):
    """Preprocess a combined frame and split it back into train and test.

    ``df`` must carry a ``dataset`` column whose values are ``"train"`` or
    ``"test"``. The simple preprocessing and the bag-of-words extraction are
    applied to all rows at once; the raw identifier/text columns are then
    dropped and the rows are separated by their ``dataset`` value.

    Returns a ``(train_df, test_df)`` tuple, both without the ``dataset``
    column.
    """
    raw_columns = ["PassengerId", "Name", "Ticket", "Cabin", "Embarked"]

    processed = simple_preprocess(df)
    token_features = create_bows(processed, vectorizer_model)
    features = pd.concat(
        [processed.drop(columns=raw_columns), token_features],
        axis=1,
    )

    def _subset(label):
        # Rows belonging to one split, with the marker column removed.
        return features.loc[features["dataset"] == label].drop(columns="dataset")

    return _subset("train"), _subset("test")

def prepare_initial_datasets(train_data_df,test_data_df,gender_data_df):
    """Stack the train and test rows into one frame tagged by origin.

    The test rows are inner-joined with ``gender_data_df`` on ``PassengerId``
    and then restricted to the train frame's columns, in the train frame's
    order. Test passengers with no match in ``gender_data_df`` are dropped by
    the inner join.

    Parameters
    ----------
    train_data_df, test_data_df, gender_data_df : pandas.DataFrame
        None of the three inputs is modified.

    Returns
    -------
    pandas.DataFrame
        Train rows followed by test rows, with a fresh 0..n-1 index and an
        extra ``dataset`` column holding ``"train"`` or ``"test"``.

    Notes
    -----
    NOTE(review): ``gender_submission.csv`` is presumably a sample submission
    rather than ground-truth labels; confirm before treating test metrics as
    real accuracy.
    """
    labelled_test = pd.merge(
        left=test_data_df,
        right=gender_data_df,
        how="inner",
        on="PassengerId",
    )[train_data_df.columns]

    # ``assign`` returns new frames, so the caller's inputs stay untouched.
    return pd.concat(
        [
            train_data_df.assign(dataset="train"),
            labelled_test.assign(dataset="test"),
        ],
        axis=0,
    ).reset_index(drop=True)

def run_preprocess(train_data_df,test_data_df,gender_data_df,vectorizer_model="tfidf"):
    """Full pipeline: combine the raw frames, preprocess, split again.

    Returns the ``(train_df, test_df)`` pair produced by ``_run_preprocess``.
    ``vectorizer_model`` is passed through unchanged.
    """
    combined = prepare_initial_datasets(train_data_df, test_data_df, gender_data_df)
    return _run_preprocess(combined, vectorizer_model)

def compute_cm(y_true,y_pred):
    """Return the 2x2 confusion matrix as a labelled DataFrame.

    Rows are the predicted class and columns are the true class, so for
    example the cell at (``"Pred. positive"``, ``"True negative"``) is the
    number of false positives.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels; exactly two distinct classes are expected,
        since the frame is built with two row and two column labels.
    """
    # scikit-learn returns C[i, j] = count of samples whose true class is i
    # and predicted class is j (true on rows). Transpose so that the values
    # match the row/column labels used below (predictions on rows).
    matrix = confusion_matrix(y_true, y_pred).T
    return pd.DataFrame(
        matrix,
        columns=["True negative","True positive"],
        index=["Pred. negative","Pred. positive"]
    )

In [None]:
# Build the train/test feature frames using raw token counts for the text columns.
train_df, test_df = run_preprocess(
    train_data_df.copy(),
    test_data_df.copy(),
    gender_data_df.copy(),
    vectorizer_model="count",
)

In [None]:
# (rows, columns) of the train frame followed by the test frame.
tuple(frame.shape for frame in (train_df, test_df))

In [None]:
# sparsity: share of cells in the training frame that are zero
# (every column of train_df is counted, including the label).
1 - np.count_nonzero(train_df.values) / train_df.size

### ML

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score, confusion_matrix

In [None]:
# Non-text feature columns: the passenger attributes plus the three
# one-hot columns derived from `Embarked`.
numeric_cols = (
    ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare"]
    + [f"embarked_{port}" for port in ("C", "Q", "S")]
)

In [None]:
# Baseline: random forest trained on the non-text columns only.
# Seeded so the reported metrics do not change from run to run.
model_poor = RandomForestClassifier(random_state=42)

X_train = train_df[numeric_cols].copy()
y_train = train_df["Survived"].copy()
X_test = test_df[numeric_cols].copy()
y_test = test_df["Survived"].copy()
model_poor.fit(X_train,y_train)

print("Test")
y_pred = model_poor.predict(X_test)
print(classification_report(y_test,y_pred))
print(f"F1 score: {f1_score(y_test,y_pred):.3f}")
compute_cm(y_test,y_pred)

In [None]:
# Random forest trained on every feature column (non-text columns plus the
# bag-of-words columns). Seeded so the metrics are reproducible and can be
# compared against the baseline.
model_rich = RandomForestClassifier(random_state=42)

X_train = train_df.drop("Survived",axis=1).copy()
y_train = train_df["Survived"].copy()
X_test = test_df.drop("Survived",axis=1).copy()
y_test = test_df["Survived"].copy()
model_rich.fit(X_train,y_train)

print("Test")
y_pred = model_rich.predict(X_test)
print(classification_report(y_test,y_pred))
print(f"F1 score: {f1_score(y_test,y_pred):.3f}")
compute_cm(y_test,y_pred)

### Tf-Idf

In [None]:
# Rebuild the train/test feature frames, this time with TF-IDF weights for the text columns.
train_df, test_df = run_preprocess(
    train_data_df.copy(),
    test_data_df.copy(),
    gender_data_df.copy(),
    vectorizer_model="tfidf",
)

In [None]:
# Baseline on the non-text columns, refit on the TF-IDF preprocessing run.
# Seeded so the reported metrics do not change from run to run.
model_poor = RandomForestClassifier(random_state=42)

X_train = train_df[numeric_cols].copy()
y_train = train_df["Survived"].copy()
X_test = test_df[numeric_cols].copy()
y_test = test_df["Survived"].copy()
model_poor.fit(X_train,y_train)

print("Test")
y_pred = model_poor.predict(X_test)
print(classification_report(y_test,y_pred))
print(f"F1 score: {f1_score(y_test,y_pred):.3f}")
compute_cm(y_test,y_pred)

In [None]:
# Random forest on every feature column, with TF-IDF weighted text columns.
# Seeded so the metrics are reproducible and comparable with the other runs.
model_rich = RandomForestClassifier(random_state=42)

X_train = train_df.drop("Survived",axis=1).copy()
y_train = train_df["Survived"].copy()
X_test = test_df.drop("Survived",axis=1).copy()
y_test = test_df["Survived"].copy()
model_rich.fit(X_train,y_train)

print("Test")
y_pred = model_rich.predict(X_test)
print(classification_report(y_test,y_pred))
print(f"F1 score: {f1_score(y_test,y_pred):.3f}")
compute_cm(y_test,y_pred)

## Outras estratégias

In [None]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier

In [None]:
def train_models(X_train,y_train):
    """Fit a fixed set of eight classifiers on the same training data.

    The list holds, in order, three ``LinearSVC`` variants, two
    ``SGDClassifier`` variants and three ``KNeighborsClassifier`` variants.

    Returns the fitted estimators as a list, in that same order.
    """
    candidates = [
        LinearSVC(C=0.01,max_iter=10000),
        LinearSVC(C=0.1,max_iter=10000),
        LinearSVC(C=0.1,max_iter=10000,penalty='l1',dual=False),
        # NOTE(review): per the scikit-learn docs `epsilon` is only used by the
        # huber / epsilon-insensitive losses, so these two log-loss models
        # presumably differ only through random initialisation - confirm.
        SGDClassifier(loss="log_loss",penalty='l1',epsilon=0.01),
        SGDClassifier(loss="log_loss",penalty='l1',epsilon=0.1),
        # NOTE(review): `leaf_size` is documented as a speed/memory setting of
        # the neighbour search; the first two KNN entries likely predict the
        # same labels - confirm.
        KNeighborsClassifier(n_neighbors=2,leaf_size=5,p=1),
        KNeighborsClassifier(n_neighbors=2,leaf_size=10,p=1),
        KNeighborsClassifier(n_neighbors=3,leaf_size=2,p=1),
    ]
    # `fit` trains in place; collect the same estimator objects afterwards.
    for estimator in candidates:
        estimator.fit(X_train, y_train)
    return list(candidates)

def feature_from_preds(trained_models,X):
    """Collect each model's predictions on ``X`` as one column per model.

    Columns are named ``clf_1``, ``clf_2``, ... following the order of
    ``trained_models``. The returned frame has a default 0..n-1 index.
    """
    return pd.DataFrame({
        f"clf_{position}": fitted.predict(X)
        for position, fitted in enumerate(trained_models, start=1)
    })

def eval_model(model,X,y):
    """Print the F1 score of ``model``'s predictions on ``X`` against ``y``."""
    print(f1_score(y, model.predict(X)))

In [None]:
# Separate the label column from the features for both splits.
X_train, y_train = train_df.drop(columns="Survived").copy(), train_df["Survived"].copy()
X_test, y_test = test_df.drop(columns="Survived").copy(), test_df["Survived"].copy()

In [None]:
# Fit every candidate classifier, then print one test-set F1 score per model
# (same order as the list built in train_models).
trained_models = train_models(X_train, y_train)
for clf in trained_models:
    eval_model(model=clf, X=X_test, y=y_test)

In [None]:
# Append the predictions of the first five fitted models as extra feature
# columns. The index is reset so the rows line up with the prediction frame,
# which uses a default 0..n-1 index.
# NOTE(review): the training-set predictions come from models fitted on those
# same rows, so they are presumably optimistic - confirm this is intended.
X_train_ = pd.concat(
    [
        X_train.reset_index(drop=True),
        feature_from_preds(trained_models[:5], X_train),
    ],
    axis="columns",
)
X_test_ = pd.concat(
    [
        X_test.reset_index(drop=True),
        feature_from_preds(trained_models[:5], X_test),
    ],
    axis="columns",
)

In [None]:
# Random forest on the original features plus the base-model predictions.
# Seeded so the metrics are reproducible across runs.
model_rich = RandomForestClassifier(random_state=42)
model_rich.fit(X_train_,y_train)

print("Test")
y_pred = model_rich.predict(X_test_)
print(classification_report(y_test,y_pred))
print(f"F1 score: {f1_score(y_test,y_pred):.3f}")
compute_cm(y_test,y_pred)

In [None]:
# Use only the predictions of the first five fitted models as features.
X_train_, X_test_ = (
    feature_from_preds(trained_models[:5], split_features)
    for split_features in (X_train, X_test)
)

In [None]:
# Random forest trained on the base-model predictions alone.
# Seeded so the metrics are reproducible across runs.
model_rich = RandomForestClassifier(random_state=42)
model_rich.fit(X_train_,y_train)

print("Test")
y_pred = model_rich.predict(X_test_)
print(classification_report(y_test,y_pred))
print(f"F1 score: {f1_score(y_test,y_pred):.3f}")
compute_cm(y_test,y_pred)

### Fatorização de matrizes

In [None]:
from sklearn.decomposition import NMF, MiniBatchNMF, LatentDirichletAllocation

In [None]:
# Rebuild the count-based feature frames for the factorization experiment.
train_df, test_df = run_preprocess(
    train_data_df.copy(),
    test_data_df.copy(),
    gender_data_df.copy(),
    vectorizer_model="count",
)

# Separate the label column from the features for both splits.
X_train, y_train = train_df.drop(columns="Survived").copy(), train_df["Survived"].copy()
X_test, y_test = test_df.drop(columns="Survived").copy(), test_df["Survived"].copy()

In [None]:
# Fit a 20-component NMF on the training features only.
nmf = NMF(
    # model size and reproducibility
    n_components=20,
    init=None,
    random_state=1,
    # optimisation
    solver="mu",
    beta_loss="kullback-leibler",
    max_iter=1000,
    # regularisation
    alpha_W=0.00005,
    alpha_H=0.00005,
    l1_ratio=0.1,
).fit(X_train)

In [None]:
# Project both splits onto the fitted NMF components.
X_train_nmf = pd.DataFrame(
    nmf.transform(X_train),
    columns=nmf.get_feature_names_out(),
)
X_test_nmf = pd.DataFrame(
    nmf.transform(X_test),
    columns=nmf.get_feature_names_out(),
)

# Original features with the NMF components appended; the index is reset so
# the rows line up with the default 0..n-1 index of the NMF frames.
X_train_ = pd.concat([X_train.reset_index(drop=True), X_train_nmf], axis="columns")
X_test_ = pd.concat([X_test.reset_index(drop=True), X_test_nmf], axis="columns")

In [None]:
# Random forest on the original features plus the NMF components.
# Seeded so this run can be compared fairly with the run without NMF features.
model_rich = RandomForestClassifier(random_state=42)
model_rich.fit(X_train_,y_train)

print("Test")
y_pred = model_rich.predict(X_test_)
print(classification_report(y_test,y_pred))
print(f"F1 score: {f1_score(y_test,y_pred):.3f}")
compute_cm(y_test,y_pred)

In [None]:
# Reference run: the same random forest on the original features, without
# the NMF components. Seeded so the comparison is not driven by random noise.
model_rich = RandomForestClassifier(random_state=42)
model_rich.fit(X_train,y_train)

print("Test")
y_pred = model_rich.predict(X_test)
print(classification_report(y_test,y_pred))
print(f"F1 score: {f1_score(y_test,y_pred):.3f}")
compute_cm(y_test,y_pred)