<h1>Titanic Dataframe analysis</h1>

<h2>Importação de dados</h2>

In [117]:
import pandas as pd
import numpy as np
from scipy import stats
df = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")
passengers = df_test["PassengerId"]
df.head(15)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


<h2>Tratamento dos dados</h2>

In [118]:
def separa_titulo(nome):
    titulo = nome.split(",")[1].split(".")[0].strip()
    return titulo

df['Title'] = df['Name'].apply(separa_titulo)

df_test['Title'] = df_test['Name'].apply(separa_titulo)

In [119]:
normalized_titles = {
    "Capt":       "Officer",
    "Col":        "Officer",
    "Major":      "Officer",
    "Jonkheer":   "Royalty",
    "Don":        "Royalty",
    "Sir" :       "Royalty",
    "Dr":         "Officer",
    "Rev":        "Officer",
    "the Countess":"Royalty",
    "Dona":       "Royalty",
    "Mme":        "Mrs",
    "Mlle":       "Miss",
    "Ms":         "Mrs",
    "Mr" :        "Mr",
    "Mrs" :       "Mrs",
    "Miss" :      "Miss",
    "Master" :    "Master",
    "Lady" :      "Royalty"
}
df["Title"] = df["Title"].map(normalized_titles)

df_test["Title"] = df_test["Title"].map(normalized_titles)

In [120]:
df[df["Age"].isnull()].groupby(["Title"]).count()["PassengerId"]

Title
Master       4
Miss        36
Mr         119
Mrs         17
Officer      1
Name: PassengerId, dtype: int64

In [121]:
adults_age = np.round(df[df["Title"].isin(["Mr","Mrs","Officer","Royalty"])]["Age"].mean())
miss_age = np.round(df[df["Title"].isin(["Miss"])]["Age"].mean())
master_age = np.round(df[df["Title"].isin(["Master"])]["Age"].mean())
print(adults_age,miss_age,master_age)

34.0 22.0 5.0


In [122]:
df.loc[df["Title"].isin(["Mr","Mrs","Officer","Royalty"]) & df["Age"].isnull(),"Age"] = adults_age
df.loc[df["Title"].isin(["Miss"]) & df["Age"].isnull(),"Age"]= miss_age
df.loc[df["Title"].isin(["Master"]) & df["Age"].isnull(),"Age"] = master_age

df_test.loc[df_test["Title"].isin(["Mr","Mrs","Officer","Royalty"]) & df_test["Age"].isnull(),"Age"] = adults_age
df_test.loc[df_test["Title"].isin(["Miss"]) & df_test["Age"].isnull(),"Age"]= miss_age
df_test.loc[df_test["Title"].isin(["Master"]) & df_test["Age"].isnull(),"Age"] = master_age

In [123]:
df.drop(columns=["Name","PassengerId","Ticket"],inplace=True)

df_test.drop(columns=["Name","PassengerId","Ticket"],inplace=True)

In [124]:
df['Cabin'].fillna('Missing',inplace=True)
df['Cabin'] = df['Cabin'].str[0]

df_test['Cabin'].fillna('Missing',inplace=True)
df_test['Cabin'] = df_test['Cabin'].str[0]

In [125]:
df["FamSize"] = df["SibSp"] + df["Parch"]

df_test["FamSize"] = df_test["SibSp"] + df_test["Parch"]

In [126]:
df["Embarked"].fillna('S',inplace=True)

df_test["Embarked"].fillna('S',inplace=True)

In [127]:
df['Pclass'] = df['Pclass'].astype('category')
df['Sex'] = df['Sex'].astype('category')
df['Embarked'] = df['Embarked'].astype('category')
df['Title'] = df['Title'].astype('category')
df['Cabin'] = df['Cabin'].astype('category')


df_test['Pclass'] = df_test['Pclass'].astype('category')
df_test['Sex'] = df_test['Sex'].astype('category')
df_test['Embarked'] = df_test['Embarked'].astype('category')
df_test['Title'] = df_test['Title'].astype('category')
df_test['Cabin'] = df_test['Cabin'].astype('category')



In [128]:
from sklearn.preprocessing import LabelEncoder

labelencoder = LabelEncoder()

df['Pclass'] = labelencoder.fit_transform(df['Pclass'])
df['Sex'] = labelencoder.fit_transform(df['Sex'])
df['Title'] = labelencoder.fit_transform(df['Title'])
df['Embarked'] = labelencoder.fit_transform(df['Embarked'])
df['Cabin'] = labelencoder.fit_transform(df['Cabin'])

df_test['Pclass'] = labelencoder.fit_transform(df_test['Pclass'])
df_test['Sex'] = labelencoder.fit_transform(df_test['Sex'])
df_test['Title'] = labelencoder.fit_transform(df_test['Title'])
df_test['Embarked'] = labelencoder.fit_transform(df_test['Embarked'])
df_test['Cabin'] = labelencoder.fit_transform(df_test['Cabin'])

In [129]:
def minmax_norm(column):
    return (column - column.min()) / (column.max() - column.min())

df['Fare'] = minmax_norm(df['Fare'])
df_test['Fare'] = minmax_norm(df_test['Fare'])

In [130]:
df_test.head(20)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Title,FamSize
0,2,1,34.5,0,0,0.015282,7,1,2,0
1,2,0,47.0,1,0,0.013663,7,2,3,1
2,1,1,62.0,0,0,0.018909,7,1,2,0
3,2,1,27.0,0,0,0.016908,7,2,2,0
4,2,0,22.0,1,1,0.023984,7,2,3,2
5,2,1,14.0,0,0,0.018006,7,2,2,0
6,2,0,30.0,0,0,0.014891,7,1,1,0
7,1,1,26.0,1,1,0.056604,7,2,2,2
8,2,0,18.0,0,0,0.01411,7,0,3,0
9,2,1,21.0,2,0,0.047138,7,2,2,2


In [131]:
df.head(20)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Title,FamSize
0,0,2,1,22.0,1,0,0.014151,7,2,2,1
1,1,0,0,38.0,1,0,0.139136,2,0,3,1
2,1,2,0,26.0,0,0,0.015469,7,2,1,0
3,1,0,0,35.0,1,0,0.103644,2,2,3,1
4,0,2,1,35.0,0,0,0.015713,7,2,2,0
5,0,2,1,34.0,0,0,0.01651,7,1,2,0
6,0,0,1,54.0,0,0,0.101229,4,2,2,0
7,0,2,1,2.0,3,1,0.041136,7,2,0,4
8,1,2,0,27.0,0,2,0.021731,7,2,3,2
9,1,1,0,14.0,1,0,0.058694,7,0,3,1


In [132]:
X = df[["Pclass","Sex","Age","SibSp","Parch","Fare","Cabin","Embarked","Title","FamSize"]]
Y = df["Survived"]

In [133]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.25,random_state=0)

In [134]:
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)

y_pred = logreg.predict(X_test)

logreg.score(X_test, Y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8026905829596412

In [135]:
df_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Title,FamSize
0,2,1,34.5,0,0,0.015282,7,1,2,0
1,2,0,47.0,1,0,0.013663,7,2,3,1
2,1,1,62.0,0,0,0.018909,7,1,2,0
3,2,1,27.0,0,0,0.016908,7,2,2,0
4,2,0,22.0,1,1,0.023984,7,2,3,2


In [136]:
df_test["Fare"].fillna(df_test["Fare"].mean(),inplace=True)

In [137]:
Resultado = logreg.predict(df_test)

In [138]:
len(Resultado)

418

In [151]:
d = {'PassengerId': passengers, 'Survived': Resultado}
tabfinal = pd.DataFrame(data=d)
tabfinal.set_index("PassengerId",inplace=True)

In [152]:
tabfinal.to_csv("ResultadoKaggle.csv")