In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier

In [2]:
# data / dados
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
train.isna().sum() / len(train)

PassengerId    0.000000
Survived       0.000000
Pclass         0.000000
Name           0.000000
Sex            0.000000
Age            0.198653
SibSp          0.000000
Parch          0.000000
Ticket         0.000000
Fare           0.000000
Cabin          0.771044
Embarked       0.002245
dtype: float64

### Data to clean / transform

Useless columns:
- **PassengerId**, **Ticket** and **Name**: do not have any importance for the classification
- **Cabin**: since about 70% of this column are missing, the best solution is to eliminate it

Useless rows:
- **Embarked**: less than 1% of this column is missing, so we just get rid of it without big consequences for the classification

Missing values:
- **Age**: some age values are missing. In order to get as close as possible to the real ones, the best solution is to replace this values for the average or median age of the people in the same class and sex as the missing.

In [4]:
def cleanData(data):
    '''Function to clean the datasets'''
    
#     useless columns
    data.drop(columns=['PassengerId', 'Name', 'Cabin', 'Ticket'], inplace=True)
    
#     useless rows
    data.dropna(subset=['Embarked'], inplace=True)
    
#     missing values
    data['Age'] = data.groupby(['Pclass','Sex'])['Age'].transform(lambda x: x.fillna(x.median()))
    
    data['Fare'] = data.groupby(['Pclass','Sex'])['Fare'].transform(lambda x: x.fillna(x.median()))
    
#     transforming categoric into numeric data
    data['Sex'].replace({'male':0, 'female':1}, inplace=True)
    
    data['Embarked'].replace({'S':0, 'C':1, 'Q':2}, inplace=True)
    
    return data



In [5]:
clean_train = cleanData(train)
clean_test = cleanData(test)

In [6]:
models = [DecisionTreeClassifier(), SVC(), RandomForestClassifier(), AdaBoostClassifier(), 
          GradientBoostingClassifier(), LogisticRegression(), SGDClassifier()]

In [7]:
X = train.drop(columns='Survived')
y = train['Survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=42)

In [13]:
# testing models / testando os modelos

for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    print(f"model: {model}, accuracy: {acc}\n")


model: DecisionTreeClassifier(), accuracy: 0.7757847533632287

model: SVC(), accuracy: 0.6905829596412556

model: RandomForestClassifier(), accuracy: 0.7668161434977578

model: AdaBoostClassifier(), accuracy: 0.8251121076233184

model: GradientBoostingClassifier(), accuracy: 0.8251121076233184

model: LogisticRegression(), accuracy: 0.820627802690583

model: SGDClassifier(), accuracy: 0.7354260089686099



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [9]:
# the most accurate model / o modelo mais acurado
model = GradientBoostingClassifier()
model.fit(X_train, y_train)

survived = model.predict(clean_test)

In [10]:
test = pd.read_csv('test.csv')

In [11]:
submission = pd.DataFrame({"PassengerId": test.PassengerId, "Survived": survived})
submission

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [15]:
submission.to_csv("submission.csv", index=False)