In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

from matplotlib.colors import ListedColormap

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split

import warnings
warnings.filterwarnings("ignore")

In [2]:
dataset = pd.read_csv("input/train.csv")

dataset.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Survived
count,850.0,850.0,686.0,850.0,850.0,849.0,850.0
mean,656.848235,2.295294,29.850583,0.470588,0.368235,33.507121,0.382353
std,371.7016,0.832837,14.141324,0.954664,0.86534,53.553686,0.486248
min,1.0,1.0,0.33,0.0,0.0,0.0,0.0
25%,340.75,2.0,21.0,0.0,0.0,7.8958,0.0
50%,657.0,3.0,28.0,0.0,0.0,14.4,0.0
75%,969.75,3.0,38.0,1.0,0.0,30.5,1.0
max,1308.0,3.0,80.0,8.0,9.0,512.3292,1.0


In [3]:
print(dataset.info())
dataset.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 850 entries, 0 to 849
Data columns (total 12 columns):
PassengerId    850 non-null int64
Pclass         850 non-null int64
Name           850 non-null object
Sex            850 non-null object
Age            686 non-null float64
SibSp          850 non-null int64
Parch          850 non-null int64
Ticket         850 non-null object
Fare           849 non-null float64
Cabin          182 non-null object
Embarked       850 non-null object
Survived       850 non-null int64
dtypes: float64(2), int64(5), object(5)
memory usage: 79.8+ KB
None


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,888,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S,1
1,1249,3,"Lockyer, Mr. Edward",male,,0,0,1222,7.8792,,S,0
2,1240,2,"Giles, Mr. Ralph",male,24.0,0,0,248726,13.5,,S,0
3,221,3,"Sunderland, Mr. Victor Francis",male,16.0,0,0,SOTON/OQ 392089,8.05,,S,1
4,1105,2,"Howard, Mrs. Benjamin (Ellen Truelove Arman)",female,60.0,1,0,24065,26.0,,S,0


In [4]:
#cabin = dataset[['Cabin', 'Survived']]
#cabin.Survived[cabin.Cabin.notnull()].mean() / cabin.Survived.mean()

In [5]:
print("Уникальных элементов:", dataset.Sex.nunique())
print("Элементы:", dataset.Sex.unique())

dev_sum = dataset.Survived[dataset.Sex == 'female'].sum() / dataset.Survived.sum()
print(r'Процент женщин среди выживших:{}%'.format(dev_sum * 100))

Уникальных элементов: 2
Элементы: ['female' 'male']
Процент женщин среди выживших:68.0%


In [6]:
dataset['Man'] = 1
logic_mask = (dataset.Sex == 'female')
dataset.Man[logic_mask] = 0
dataset.head(5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived,Man
0,888,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S,1,0
1,1249,3,"Lockyer, Mr. Edward",male,,0,0,1222,7.8792,,S,0,1
2,1240,2,"Giles, Mr. Ralph",male,24.0,0,0,248726,13.5,,S,0,1
3,221,3,"Sunderland, Mr. Victor Francis",male,16.0,0,0,SOTON/OQ 392089,8.05,,S,1,1
4,1105,2,"Howard, Mrs. Benjamin (Ellen Truelove Arman)",female,60.0,1,0,24065,26.0,,S,0,0


In [7]:
# print("Уникальных элементов:", dataset.Embarked.nunique())
# print("Элементы:", dataset.Embarked.unique())
# embarked = np.ones((dataset.shape[0], 1))
# embarked[dataset.Embarked == 'C'] = 2
# embarked[dataset.Embarked == 'Q'] = 3

In [8]:
Y_train = dataset['Survived'].to_numpy().reshape(dataset.shape[0], 1)

dataset = dataset.drop(['PassengerId', 'Name', 'Embarked', 'Ticket', 'Cabin', 'Sex', 'Survived'], axis = 1)

dataset.head(10)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Man
0,1,19.0,0,0,30.0,0
1,3,,0,0,7.8792,1
2,2,24.0,0,0,13.5,1
3,3,16.0,0,0,8.05,1
4,2,60.0,1,0,26.0,0
5,3,9.0,4,2,31.3875,1
6,3,18.0,0,0,8.05,0
7,1,41.0,0,0,134.5,0
8,3,16.0,1,3,34.375,1
9,2,40.0,0,0,13.0,0


In [9]:
dataset.Age[dataset.Age.isnull()] = dataset.Age[dataset.Age.notnull()].mean()
dataset.Fare[dataset.Fare.isnull()] = dataset.Fare[dataset.Fare.notnull()].mean()

print(dataset.info())
dataset.head(15)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 850 entries, 0 to 849
Data columns (total 6 columns):
Pclass    850 non-null int64
Age       850 non-null float64
SibSp     850 non-null int64
Parch     850 non-null int64
Fare      850 non-null float64
Man       850 non-null int64
dtypes: float64(2), int64(4)
memory usage: 40.0 KB
None


Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Man
0,1,19.0,0,0,30.0,0
1,3,29.850583,0,0,7.8792,1
2,2,24.0,0,0,13.5,1
3,3,16.0,0,0,8.05,1
4,2,60.0,1,0,26.0,0
5,3,9.0,4,2,31.3875,1
6,3,18.0,0,0,8.05,0
7,1,41.0,0,0,134.5,0
8,3,16.0,1,3,34.375,1
9,2,40.0,0,0,13.0,0


In [10]:
X_train = dataset.to_numpy().reshape(dataset.shape)
print(X_train.shape)

# print("Not normilize:")
# print(X_train)

X_train = (X_train - X_train.min(axis = 0)) / (X_train.max(axis = 0) - X_train.min(axis = 0))

type(X_train)
# print("Normilize:")
# print(X_train)

(850, 6)


numpy.ndarray

In [11]:
logreg_model = LogisticRegression()
logreg_model.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [12]:
w = np.append(logreg_model.intercept_, logreg_model.coef_)
w = w.reshape(X_train.shape[1] + 1, 1)
print("Weight coefs:\n", w)

Weight coefs: [[ 2.57458496]
 [-1.64358095]
 [-1.46535065]
 [-0.89038366]
 [-0.13756374]
 [ 0.89570347]
 [-2.45340744]]


In [13]:
testset = pd.read_csv("input/test.csv")

testset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 459 entries, 0 to 458
Data columns (total 11 columns):
PassengerId    459 non-null int64
Pclass         459 non-null int64
Name           459 non-null object
Sex            459 non-null object
Age            360 non-null float64
SibSp          459 non-null int64
Parch          459 non-null int64
Ticket         459 non-null object
Fare           459 non-null float64
Cabin          113 non-null object
Embarked       457 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 39.6+ KB


In [14]:
testset['Man'] = 1
logic_mask = (testset.Sex == 'female')
testset.Man[logic_mask] = 0
testset.head(5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Man
0,1167,2,"Bryhl, Miss. Dagmar Jenny Ingeborg",female,20.0,1,0,236853,26.0,,S,0
1,1215,1,"Rowe, Mr. Alfred G",male,33.0,0,0,113790,26.55,,S,1
2,823,1,"Reuchlin, Jonkheer. John George",male,38.0,0,0,19972,0.0,,S,1
3,864,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.55,,S,0
4,11,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7,G6,S,0


In [15]:
Pass_Id = testset['PassengerId']

testset = testset.drop(['PassengerId', 'Name', 'Embarked', 'Ticket', 'Cabin', 'Sex'], axis = 1)

testset.head(10)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Man
0,2,20.0,1,0,26.0,0
1,1,33.0,0,0,26.55,1
2,1,38.0,0,0,0.0,1
3,3,,8,2,69.55,0
4,3,4.0,1,1,16.7,0
5,1,13.0,2,2,262.375,1
6,3,16.0,1,1,8.5167,0
7,2,18.0,0,0,11.5,1
8,1,24.0,0,0,69.3,0
9,3,,1,0,15.5,0


In [16]:
testset.Age[testset.Age.isnull()] = testset.Age[testset.Age.notnull()].mean()
testset.Fare[testset.Fare.isnull()] = testset.Fare[testset.Fare.notnull()].mean()

print(testset.info())
testset.head(15)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 459 entries, 0 to 458
Data columns (total 6 columns):
Pclass    459 non-null int64
Age       459 non-null float64
SibSp     459 non-null int64
Parch     459 non-null int64
Fare      459 non-null float64
Man       459 non-null int64
dtypes: float64(2), int64(4)
memory usage: 21.6 KB
None


Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Man
0,2,20.0,1,0,26.0,0
1,1,33.0,0,0,26.55,1
2,1,38.0,0,0,0.0,1
3,3,29.939361,8,2,69.55,0
4,3,4.0,1,1,16.7,0
5,1,13.0,2,2,262.375,1
6,3,16.0,1,1,8.5167,0
7,2,18.0,0,0,11.5,1
8,1,24.0,0,0,69.3,0
9,3,29.939361,1,0,15.5,0


In [17]:
X_test = testset.to_numpy().reshape(testset.shape)
print(X_test.shape)

X_test = (X_test - X_test.min(axis = 0)) / (X_test.max(axis = 0) - X_test.min(axis = 0))
#X_test = np.insert(X_test, 0, 1, axis = 1)
#print(X_test.shape)

(459, 6)


In [20]:
Y_test = logreg_model.predict(X_test)
Y_test = Y_test.reshape(X_test.shape[0], 1)
Y_test
Ans = pd.DataFrame(Pass_Id)

array([[1],
       [0],
       [0],
       [0],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
    