# PCA Preprocessing - Logistic Regression - Titanic Project
[Marcos Cavalhieri](mailto:cavalhieri@alumni.usp.br?subject=[Kaggle]%20Titanic%20MLn%20Project)
***

In [1]:
#analise dos dados e modelos de ML
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA

#manipulacao dos dados
import pandas as pd
import numpy as np

#bibliotecas para geracao de graficos
import seaborn as sns
import matplotlib.pyplot as plt

In [29]:
#function that fits a PCA and returns the projected data as a DataFrame
def create_pca(data_in, n_components):
    """Fit a PCA on ``data_in`` and return the projection plus variance ratios.

    Parameters
    ----------
    data_in : DataFrame or array-like
        Input features; the caller is expected to pass already-standardized
        data (no scaling is done here).
    n_components : int, float, str or None
        Forwarded unchanged to ``PCA(n_components)``.

    Returns
    -------
    tuple
        ``(DataFrame, ndarray)``: the transformed data with columns named
        ``'pc1'``, ``'pc2'``, ... and ``pca.explained_variance_ratio_``.
    """
    pca = PCA(n_components)

    # fit on (and project) the data exactly as supplied
    pca_results = pca.fit_transform(data_in)

    # Name the columns after the number of components actually produced.
    # ``n_components`` is not necessarily an int (PCA also accepts a float
    # variance ratio, None or 'mle'), so range(n_components) could raise.
    pc_names = ['pc' + str(i + 1) for i in range(pca_results.shape[1])]
    return (pd.DataFrame(data = pca_results, columns = pc_names),
            pca.explained_variance_ratio_)

In [18]:
# Load the train and test CSV files into DataFrames
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)

In [19]:
# All manipulation of the model's input data happens on data_X.
# Work on a copy: a plain assignment would only alias train_df, so later
# in-place edits of data_X would silently modify the raw training frame too.
data_X = train_df.copy()

In [20]:
# Represent the 'Sex' column numerically as 'Gender_Code'.
# A new frame is derived from train_df instead of mutating in place, so the
# cell can be re-run safely (the in-place drop made a second run fail with a
# KeyError on 'Sex') and the raw training frame stays untouched.
data_X = (
    train_df
    .assign(Gender_Code=LabelEncoder().fit_transform(train_df['Sex']))
    .drop(columns=['Sex'])
)
data_X

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Gender_Code
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.2500,,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,C,0
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.9250,,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1000,C123,S,0
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.0500,,S,1
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",27.0,0,0,211536,13.0000,,S,1
887,888,1,1,"Graham, Miss. Margaret Edith",19.0,0,0,112053,30.0000,B42,S,0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",,1,2,W./C. 6607,23.4500,,S,0
889,890,1,1,"Behr, Mr. Karl Howell",26.0,0,0,111369,30.0000,C148,C,1


In [21]:
# Target vector: the 'Survived' column of the training data
data_y = data_X['Survived']
data_y

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [22]:
# Keep only the columns used as model inputs
features = ['Pclass', 'Gender_Code', 'Age', 'SibSp', 'Parch']
data_X = data_X.loc[:, features]
data_X

Unnamed: 0,Pclass,Gender_Code,Age,SibSp,Parch
0,3,1,22.0,1,0
1,1,0,38.0,1,0
2,3,0,26.0,0,0
3,1,0,35.0,1,0
4,3,1,35.0,0,0
...,...,...,...,...,...
886,2,1,27.0,0,0
887,1,0,19.0,0,0
888,3,0,,1,2
889,1,1,26.0,0,0


In [23]:
# Count missing values per column
data_X.isna().sum()

Pclass           0
Gender_Code      0
Age            177
SibSp            0
Parch            0
dtype: int64

In [24]:
# Manual imputation: replace missing 'Age' values with the column mean
media = data_X['Age'].mean()
media_idade = {'Age': media}
data_X = data_X.fillna(value=media_idade)
# confirm that no missing values remain
data_X.isna().sum()

Pclass         0
Gender_Code    0
Age            0
SibSp          0
Parch          0
dtype: int64

In [27]:
# Standardize the feature values, keeping the original column names
scaler = StandardScaler()
scaler.fit(data_X)
std_X = pd.DataFrame(scaler.transform(data_X), columns=data_X.columns)
std_X

Unnamed: 0,Pclass,Gender_Code,Age,SibSp,Parch
0,0.827377,0.737695,-0.592481,0.432793,-0.473674
1,-1.566107,-1.355574,0.638789,0.432793,-0.473674
2,0.827377,-1.355574,-0.284663,-0.474545,-0.473674
3,-1.566107,-1.355574,0.407926,0.432793,-0.473674
4,0.827377,0.737695,0.407926,-0.474545,-0.473674
...,...,...,...,...,...
886,-0.369365,0.737695,-0.207709,-0.474545,-0.473674
887,-1.566107,-1.355574,-0.823344,-0.474545,-0.473674
888,0.827377,-1.355574,0.000000,0.432793,2.008933
889,-1.566107,0.737695,-0.284663,-0.474545,-0.473674


In [33]:
# Reduce the standardized 'SibSp' and 'Parch' columns to `total_pc` component(s)
# and show how much variance the projection explains
total_pc = 1
pc_train, ratio_train = create_pca(
    std_X[['SibSp', 'Parch']],
    total_pc,
)
print(f'PCA Train: {ratio_train}')

PCA Train: [0.70741885]


In [71]:
# Fit a logistic regression on the training features and score it on the
# same data.
# NOTE(review): the model is trained on data_X, not on std_X / pc_train
# computed in the cells above -- confirm this is intended.
clf = LogisticRegression()
clf.fit(data_X, data_y)
clf.score(data_X, data_y)

0.7901234567901234

In [72]:
# Classification report for the predictions on the training data
y_pred = clf.predict(data_X)
print(classification_report(y_true=data_y, y_pred=y_pred))

              precision    recall  f1-score   support

         0.0       0.82      0.84      0.83       549
         1.0       0.74      0.71      0.72       342

    accuracy                           0.79       891
   macro avg       0.78      0.77      0.78       891
weighted avg       0.79      0.79      0.79       891



In [73]:
# Switch the input data to the test set and represent 'Sex' numerically.
# A new frame is derived from test_df instead of aliasing and mutating it in
# place, so the cell can be re-run safely (the in-place drop made a second run
# fail with a KeyError on 'Sex') and test_df keeps its original columns.
# NOTE(review): a fresh LabelEncoder is fitted on the test data here; the
# codes match the training ones only if both files contain the same set of
# 'Sex' values -- confirm.
data_X = (
    test_df
    .assign(Gender_Code=LabelEncoder().fit_transform(test_df['Sex']))
    .drop(columns=['Sex'])
)
data_X

Unnamed: 0,PassengerId,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Gender_Code
0,892,3,"Kelly, Mr. James",34.5,0,0,330911,7.8292,,Q,1
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",47.0,1,0,363272,7.0000,,S,0
2,894,2,"Myles, Mr. Thomas Francis",62.0,0,0,240276,9.6875,,Q,1
3,895,3,"Wirz, Mr. Albert",27.0,0,0,315154,8.6625,,S,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",22.0,1,1,3101298,12.2875,,S,0
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",,0,0,A.5. 3236,8.0500,,S,1
414,1306,1,"Oliva y Ocana, Dona. Fermina",39.0,0,0,PC 17758,108.9000,C105,C,0
415,1307,3,"Saether, Mr. Simon Sivertsen",38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S,1
416,1308,3,"Ware, Mr. Frederick",,0,0,359309,8.0500,,S,1


In [8]:
# Keep only the columns used as model inputs.
features = ['Pclass', 'Gender_Code', 'Age', 'SibSp', 'Parch']
# The test set has missing 'Age' values, and the fitted classifier cannot
# predict on NaN inputs. Fill them with `media`, the mean age computed on the
# training data in the imputation cell, so train and test are treated alike.
data_X = data_X[features].fillna({'Age': media})
data_X

Unnamed: 0,Pclass,Gender_Code,Age,SibSp,Parch
0,3,1,22.000000,1,0
1,1,0,38.000000,1,0
2,3,0,26.000000,0,0
3,1,0,35.000000,1,0
4,3,1,35.000000,0,0
...,...,...,...,...,...
886,2,1,27.000000,0,0
887,1,0,19.000000,0,0
888,3,0,29.699118,1,2
889,1,1,26.000000,0,0


In [11]:
# Print a summary (columns, non-null counts, dtypes) of the current data_X
data_X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Pclass       891 non-null    int64  
 1   Gender_Code  891 non-null    int64  
 2   Age          891 non-null    float64
 3   SibSp        891 non-null    int64  
 4   Parch        891 non-null    int64  
dtypes: float64(1), int64(4)
memory usage: 34.9 KB


In [90]:
# Predict on the current data_X and write the submission file with the
# passenger ids taken from test_df
y_pred = clf.predict(data_X)
output = test_df[['PassengerId']].assign(Survived=y_pred)
output.to_csv('my_submission.csv', index=False)
print("All Right!")

All Right!


In [91]:
# Sanity check: number of predictions produced for the current data_X
y_pred = clf.predict(data_X)
y_pred.shape[0]

418