In [27]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import precision_recall_fscore_support
import seaborn as sns

data = pd.read_csv("titanic2.tsv", sep='\t')
data.head(10)

Unnamed: 0,Survived,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,CabinDeck,Title,TravelCompanion
0,0,530,2,0,23.0,2,1,11.5,0,0,0,0
1,0,466,3,0,38.0,0,0,7.05,0,0,0,0
2,0,753,3,0,33.0,0,0,9.5,0,0,0,0
3,0,855,2,1,44.0,1,0,26.0,0,0,2,0
4,0,333,1,0,38.0,0,1,153.4625,0,1,0,1
5,0,39,3,1,18.0,2,0,18.0,0,0,1,1
6,0,236,3,1,29.36,0,0,7.55,0,0,1,0
7,0,303,3,0,19.0,0,0,0.0,0,0,0,1
8,1,18,2,0,29.36,0,0,13.0,0,0,0,0
9,1,505,1,1,16.0,0,0,86.5,0,2,1,0


In [28]:
data_train, data_test = train_test_split(data, test_size=0.2,random_state=27)

In [29]:
data_train

Unnamed: 0,Survived,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,CabinDeck,Title,TravelCompanion
29,1,751,2,1,4.00,1,1,23.0000,0,0,1,0
227,0,419,2,0,30.00,0,0,13.0000,0,0,0,0
83,1,196,1,1,58.00,0,0,146.5208,1,2,1,0
567,0,17,3,0,2.00,4,1,29.1250,2,0,3,4
66,0,229,2,0,18.00,0,0,13.0000,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
537,0,111,1,0,47.00,0,0,52.0000,0,1,0,0
24,1,534,3,1,29.36,0,2,22.3583,1,0,2,1
293,1,129,3,1,29.36,1,1,22.3583,1,6,1,1
543,0,715,2,0,52.00,0,0,13.0000,0,0,0,1


### Przygotowanie danych do uczenia, odstające dane 

In [30]:
data_train.isnull().sum()

Survived           0
PassengerId        0
Pclass             0
Sex                0
Age                0
SibSp              0
Parch              0
Fare               0
Embarked           0
CabinDeck          0
Title              0
TravelCompanion    0
dtype: int64

In [31]:
data_train.agg(["std"])

Unnamed: 0,Survived,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,CabinDeck,Title,TravelCompanion
std,0.48814,256.856243,0.828319,0.480318,13.226157,1.065426,0.766523,38.673031,0.637136,1.527899,1.053988,1.041325


In [32]:
data_train.agg(['kurtosis', 'skew'])

Unnamed: 0,Survived,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,CabinDeck,Title,TravelCompanion
kurtosis,-1.800838,-1.200561,-1.192739,-1.661358,0.723043,13.318185,7.98517,12.916561,1.028985,5.372263,1.124941,5.173054
skew,0.454325,0.053354,-0.692562,0.587648,0.479507,3.208947,2.544209,3.234868,1.514314,2.441386,1.361482,2.318677


In [88]:
quantile1 = data_train.quantile(0.05) 
quantile3 = data_train.quantile(0.95)
interquartile_range = quantile3 - quantile1 

In [89]:
interquartile_range

Survived             1.0000
PassengerId        794.6000
Pclass               2.0000
Sex                  1.0000
Age                 49.1500
SibSp                3.0000
Parch                2.0000
Fare                88.5808
Embarked             2.0000
CabinDeck            4.0000
Title                3.0000
TravelCompanion      3.0000
dtype: float64

In [90]:
low_boundary = (quantile1  - 1.5 * interquartile_range)
upp_boundary = (quantile3 + 1.5 * interquartile_range)
outliers_L = (data_train[interquartile_range.index] < low_boundary).sum()
outliers_U = (data_train[interquartile_range.index] > upp_boundary).sum()

Procent odstających danych w każdej z kategorii:

In [91]:
outliers_percentage = ((outliers_U + outliers_L) / len(data_train)) * 100
for i, col in enumerate(data_train.columns):
    if outliers_percentage.iloc[i] == 0.0:
        print("{}: \033[92m{:.2f}%\033[0m".format(col, outliers_percentage.iloc[i]))
    elif outliers_percentage.iloc[i] > 10.0:
        print("{}: \033[91m{:.2f}%\033[0m".format(col, outliers_percentage.iloc[i])) 
    else:
        print("{}: \033[33m{:.2f}%\033[0m".format(col, outliers_percentage.iloc[i]))

Survived: [92m0.00%[0m
PassengerId: [92m0.00%[0m
Pclass: [92m0.00%[0m
Sex: [92m0.00%[0m
Age: [92m0.00%[0m
SibSp: [33m0.40%[0m
Parch: [92m0.00%[0m
Fare: [33m0.60%[0m
Embarked: [92m0.00%[0m
CabinDeck: [92m0.00%[0m
Title: [92m0.00%[0m
TravelCompanion: [92m0.00%[0m


In [92]:
data_out = data_train.loc[(data_train["TravelCompanion"] < low_boundary["TravelCompanion"]) | (data_train["TravelCompanion"] > upp_boundary["TravelCompanion"])]                            
len(data_out)

0

In [93]:
def remove_outliers(data, column):
    data_out = data.loc[(data[column] > low_boundary[column]) & (data[column] < upp_boundary[column])]
    return data_out

In [94]:
data_train_out = remove_outliers(remove_outliers(remove_outliers(remove_outliers(remove_outliers(data_train, "SibSp"),"Parch"),"Fare"),"CabinDeck"),"TravelCompanion")

In [95]:
print('Rozmiar zbioru z obserwacjami odstającymi:', len(data_train), "\nRozmiar zbioru po usunięciu obserwacji odstających:", len(data_train_out))

Rozmiar zbioru z obserwacjami odstającymi: 498 
Rozmiar zbioru po usunięciu obserwacji odstających: 491


### Stworzenie modelu

In [96]:
y_train = data_train_out["Survived"]
X_train = data_train_out.iloc[:, 1:]
model = LogisticRegression(random_state=0)
model.fit(X_train, y_train.ravel())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [97]:
#sns.regplot(x=data.iloc[:,3], y=data.iloc[:,0])

In [98]:
y_expected = data_test["Survived"]
x_test = data_test.iloc[:, 1:]
y_predicted = model.predict(x_test)

In [99]:
print(y_predicted[:30])

[0 0 0 1 0 1 1 0 1 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0 0 1 0 1 1 0]


#### Ewaluacja regresji

In [100]:
error = mean_squared_error(y_expected, y_predicted)
print(f"Błąd średniokwadratowy wynosi {error}")

Błąd średniokwadratowy wynosi 0.168


In [101]:
print(model.score(x_test, y_expected))

0.832


#### Ewaluacja klasyfikacji

In [102]:
precision, recall, fscore, support = precision_recall_fscore_support(y_expected, y_predicted, average="weighted")

In [103]:
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F-score: {fscore:.3f}")

Precision: 0.830
Recall: 0.832
F-score: 0.829
