In [1]:
# Turn on IPython's "greedy" tab-completion option for this kernel session.
# NOTE(review): newer IPython releases appear to deprecate this option — confirm
# against the installed IPython version.
%config IPCompleter.greedy=True

# Logistic regression using the iris dataset.

## Prepare modules and data.

In [2]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [3]:
# Load the iris dataset; later cells read its data, target, feature_names and DESCR fields.
iris = load_iris()

In [4]:
# Print the dataset's built-in description text.
print(iris.DESCR)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [5]:
# Put the four measurements and the class label into one table for inspection.
df = pd.DataFrame(data=iris.data, columns=iris.feature_names).assign(target=iris.target)
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [6]:
# Reduce to a binary problem: skip the first 50 samples (class 0) and keep a
# single feature, column 2 (petal length), as an (n_samples, 1) matrix.
X = iris.data[50:, 2:3]
# Shift the remaining labels from {1, 2} down to {0, 1}.
y = iris.target[50:] - 1

In [7]:
# Peek at the first five rows to confirm X is a single-column 2-D array.
X[:5]

array([[4.7],
       [4.5],
       [4.9],
       [4. ],
       [4.6]])

In [8]:
# Show the re-based binary labels (0 and 1).
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [9]:
# Standardize the single feature: learn the scaling statistics from X, then apply them.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [10]:
# Peek at the first five standardized values.
X_scaled[:5]

array([[-0.25077906],
       [-0.49425387],
       [-0.00730424],
       [-1.10294091],
       [-0.37251647]])

In [11]:
# Split into train and test subsets; random_state=0 makes the split reproducible.
# NOTE(review): X_scaled was standardized with statistics computed from ALL
# samples before this split, so test-set information leaks into preprocessing.
# Consider splitting first and fitting the scaler on the training part only.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    random_state=0,
)

## Logistic regression.

In [12]:
# Fit a logistic-regression classifier (constructor defaults) on the training split.
log_reg = LogisticRegression().fit(X_train, y_train)

In [13]:
# Inspect the fitted parameters: the intercept and the weight of the single feature.
log_reg.intercept_, log_reg.coef_

(array([0.29946432]), array([[3.16390488]]))

In [14]:
# Print the model's score on the training split first, then on the held-out test split.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(log_reg.score(features, labels))

0.9466666666666667
0.88


# Logistic regression using the Titanic dataset.

## Prepare modules and data.

In [15]:
import pandas as pd
from pandas import DataFrame
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [16]:
# Load the Titanic training and test CSV files.
# The paths are relative, so they resolve against the kernel's working directory.
train_df = pd.read_csv('data/titanic_train.csv')
test_df = pd.read_csv('data/titanic_test.csv')

In [17]:
# Preview the first five rows of the training data.
train_df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [18]:
# Preview the first five rows of the test data (it has no Survived column).
test_df.head(5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


## Remove unnecessary columns and fill in missing values.

In [19]:
# Drop the columns that are not used in the analysis below.
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# keeps this cell re-runnable: a second execution no longer raises KeyError
# for columns that have already been removed.
train_df = train_df.drop(
    columns=['PassengerId', 'Name', 'Ticket', 'Cabin', 'Embarked'],
    errors='ignore',
)

train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,male,22.0,1,0,7.25
1,1,1,female,38.0,1,0,71.2833
2,1,3,female,26.0,0,0,7.925
3,1,1,female,35.0,1,0,53.1
4,0,3,male,35.0,0,0,8.05


In [20]:
# Display the first rows that contain at least one missing value.
# `axis` is passed by keyword: the positional form `.any(1)` was deprecated in
# earlier pandas releases and is rejected with a TypeError by pandas 2.0+.
train_df[train_df.isnull().any(axis=1)].head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
5,0,3,male,,0,0,8.4583
17,1,2,male,,0,0,13.0
19,1,3,female,,0,0,7.225
26,0,3,male,,0,0,7.225
28,1,3,female,,0,0,7.8792
29,0,3,male,,0,0,7.8958
31,1,1,female,,1,0,146.5208
32,1,3,female,,0,0,7.75
36,1,3,male,,0,0,7.2292
42,0,3,male,,0,0,7.8958


In [21]:
# Fill the missing values in the Age column with the column mean.
train_df['Age'] = train_df['Age'].fillna(train_df['Age'].mean())

# Preview the first rows. None of these had a missing Age (the rows with
# missing values listed earlier start at index 5), so this preview does not by
# itself show the imputed values.
train_df.head(5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,male,22.0,1,0,7.25
1,1,1,female,38.0,1,0,71.2833
2,1,3,female,26.0,0,0,7.925
3,1,1,female,35.0,1,0,53.1
4,0,3,male,35.0,0,0,8.05


In [22]:
# Encode the Sex column as a number: male -> 0, female -> 1.
sex_mapping = {'male': 0, 'female': 1}
# Look each value up with a fallback to itself, so values that are already
# encoded pass through unchanged. A plain .map(sex_mapping) would turn the
# whole column into NaN if this cell were executed a second time.
# Values not present in the mapping are likewise left as they are.
train_df['Sex'] = train_df['Sex'].map(lambda label: sex_mapping.get(label, label))
# Check the data.
train_df.head(5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,0,22.0,1,0,7.25
1,1,1,1,38.0,1,0,71.2833
2,1,3,1,26.0,0,0,7.925
3,1,1,1,35.0,1,0,53.1
4,0,3,0,35.0,0,0,8.05


## Logistic regression.
Predict whether a passenger survived based on the ticket fare.

## Fare-only analysis

In [23]:
# Feature matrix: a one-column DataFrame holding only the fare.
X_fare_only = train_df.loc[:, ["Fare"]]
# Target vector: the Survived column.
# NOTE(review): this rebinds `y_train`, which held the iris training labels in
# the first half of the notebook; re-running the iris cells after this point
# would pick up the Titanic labels instead.
y_train = train_df.loc[:, "Survived"]

In [24]:
# Create an unfitted logistic-regression estimator with the constructor defaults.
model=LogisticRegression()

In [25]:
# Fit the model on the fare-only feature matrix.
model.fit(X_fare_only, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [26]:
# Predict the Survived class for a passenger who paid a fare of 61.
# The model was fitted on a DataFrame, so the query is passed as a DataFrame
# with the same column name; a bare nested list makes newer scikit-learn emit
# an "X does not have valid feature names" UserWarning.
model.predict(pd.DataFrame({"Fare": [61]}))

array([0])

In [27]:
# Class probabilities for a fare of 62 (one column per class, in model.classes_ order).
# Passed as a DataFrame with the training column name so that newer
# scikit-learn does not warn about missing feature names.
model.predict_proba(pd.DataFrame({"Fare": [62]}))

array([[0.49978123, 0.50021877]])

## Would a 30-year-old male passenger survive?

In [28]:
# Feature matrix with two columns, in this order: encoded Sex, then Age.
X_sex_and_age = train_df.loc[:, ["Sex", "Age"]]

In [29]:
# Fit on Sex and Age. This calls fit() on the same `model` object that was
# fitted on Fare above, so from here on `model` expects the two
# columns [Sex, Age] as input.
model.fit(X_sex_and_age, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [30]:
# Predict the Survived class for a 30-year-old male (Sex is encoded male -> 0).
# Naming the columns makes the feature order explicit and avoids the
# feature-name UserWarning that newer scikit-learn raises for a bare nested list.
model.predict(pd.DataFrame({"Sex": [0], "Age": [30]}))

array([0])