Do the imports. The packages and functions to import are:
> numpy  
> pandas  
> OneHotEncoder  
> LogisticRegression  
> make_column_transformer  
> make_pipeline  
> CountVectorizer

In [24]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer

Make the list of columns we want in the model: 'Parch' (the number of parents or children each passenger had on the trip), 'Fare' (the amount they paid to get on the boat), 'Embarked' (the port where they got on the boat), and 'Sex' (their gender).

In [2]:
# Feature columns used by the first model.
cols = [
    'Parch',
    'Fare',
    'Embarked',
    'Sex',
]

Read the data into a pandas DataFrame from a shortened URL that points to the Kaggle training set (only the first 10 rows are loaded).
Then assign the feature columns and the target variable to X and y, respectively.

In [3]:
# Shortened link to the training CSV.
url = 'http://bit.ly/kaggletrain'
# Load only the first 10 rows.
df = pd.read_csv(filepath_or_buffer=url, nrows=10)

In [4]:
# Feature frame (the columns listed in `cols`) and the 'Survived' target.
X, y = df[cols], df['Survived']

Download and assign the testing set to the X_new variable.

In [5]:
# Shortened link to the testing CSV.
url_test = 'http://bit.ly/kaggletest'
# Load only the first 10 rows, then keep the same feature columns as X.
df_new = pd.read_csv(filepath_or_buffer=url_test, nrows=10)
X_new = df_new[cols]

Instantiate the OneHotEncoder.

In [6]:
# Create the encoder and fit it on the two categorical columns in one step
# (`fit` returns the encoder itself).
ohe = OneHotEncoder().fit(df[['Embarked', 'Sex']])
# Show the categories learned for each column.
ohe.categories_

[array(['C', 'Q', 'S'], dtype=object), array(['female', 'male'], dtype=object)]

Construct and instantiate the column transformer. 

In [7]:
# One-hot encode the categorical columns; every other column of the input
# is passed through unchanged.
ct = make_column_transformer(
    (ohe, ['Embarked', 'Sex']),
    remainder='passthrough',
)

Run the column transformer to test.

In [8]:
# Fit the transformer on X and display the transformed matrix: the one-hot
# columns come first, followed by the passed-through columns.
ct.fit_transform(X)

array([[ 0.    ,  0.    ,  1.    ,  0.    ,  1.    ,  0.    ,  7.25  ],
       [ 1.    ,  0.    ,  0.    ,  1.    ,  0.    ,  0.    , 71.2833],
       [ 0.    ,  0.    ,  1.    ,  1.    ,  0.    ,  0.    ,  7.925 ],
       [ 0.    ,  0.    ,  1.    ,  1.    ,  0.    ,  0.    , 53.1   ],
       [ 0.    ,  0.    ,  1.    ,  0.    ,  1.    ,  0.    ,  8.05  ],
       [ 0.    ,  1.    ,  0.    ,  0.    ,  1.    ,  0.    ,  8.4583],
       [ 0.    ,  0.    ,  1.    ,  0.    ,  1.    ,  0.    , 51.8625],
       [ 0.    ,  0.    ,  1.    ,  0.    ,  1.    ,  1.    , 21.075 ],
       [ 0.    ,  0.    ,  1.    ,  1.    ,  0.    ,  2.    , 11.1333],
       [ 1.    ,  0.    ,  0.    ,  1.    ,  0.    ,  0.    , 30.0708]])

Create the logistic regression model and choose a solver suited to small data sets. 

In [9]:
# Fixed random_state keeps the fit reproducible between runs.
logreg = LogisticRegression(random_state=1, solver='liblinear')

Make a pipeline, then fit the new model and make predictions.

In [10]:
# Chain the column transformer and the classifier so preprocessing and
# modelling run as a single estimator.
pipe = make_pipeline(ct, logreg)

In [11]:
# Fit every step of the pipeline on the training features and target.
pipe.fit(X, y)

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(categorical_features=None,
                                                                categories=None,
                                                                drop=None,
                                                                dtype=<class 'numpy.float64'>,
                                                                handle_unknown='error',
                                                                n_values=None,
                                                                sparse=True),
                                                  ['Embarked', 'Sex'])],
                      

Examine the coefficients.

In [12]:
# Look up the fitted classifier by its step name and show its coefficients.
pipe.named_steps['logisticregression'].coef_

array([[ 0.26491287, -0.19848033, -0.22907928,  1.0075062 , -1.17015293,
         0.20056557,  0.01597307]])

In [13]:
# Run the new rows through the fitted pipeline and display the predictions.
pipe.predict(X_new)

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0])

Now use text data. First import the text transformer class, instantiate it, and vectorize the variable 'Name' into a document-term matrix. Remember that CountVectorizer expects one-dimensional input.

In [14]:
# Same import as in the first cell, repeated here.
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()
# Pass a single column (a 1-D Series), not a one-column DataFrame.
dtm = vect.fit_transform(df['Name'])

Examine the feature names.

In [15]:
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of `get_feature_names_out`. Use the new method when it exists and
# fall back to the old one so the cell runs on either release line.
_get_vect_names = getattr(vect, 'get_feature_names_out', None) or vect.get_feature_names
# Convert to plain `str` so the printed list looks the same whether the
# vectorizer returned a list or a NumPy array.
print([str(name) for name in _get_vect_names()])

['achem', 'adele', 'allen', 'berg', 'bradley', 'braund', 'briggs', 'cumings', 'elisabeth', 'florence', 'futrelle', 'gosta', 'harris', 'heath', 'heikkinen', 'henry', 'jacques', 'james', 'john', 'johnson', 'laina', 'leonard', 'lily', 'master', 'may', 'mccarthy', 'miss', 'moran', 'mr', 'mrs', 'nasser', 'nicholas', 'oscar', 'owen', 'palsson', 'peel', 'thayer', 'timothy', 'vilhelmina', 'william']


Put the data into a DataFrame. 

In [16]:
# `get_feature_names` was removed in scikit-learn 1.2; prefer
# `get_feature_names_out` and fall back to the old method on older releases.
_get_dtm_names = getattr(vect, 'get_feature_names_out', None) or vect.get_feature_names
# Densify the sparse document-term matrix and label each column with its token.
pd.DataFrame(data=dtm.toarray(), columns=_get_dtm_names())

Unnamed: 0,achem,adele,allen,berg,bradley,braund,briggs,cumings,elisabeth,florence,...,nasser,nicholas,oscar,owen,palsson,peel,thayer,timothy,vilhelmina,william
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,1,0,1,1,0,1,...,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
8,0,0,0,1,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,1,0
9,1,1,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0


Update X to include 'Name'

In [17]:
# Same features as before, plus the free-text 'Name' column.
cols = [
    'Parch',
    'Fare',
    'Embarked',
    'Sex',
    'Name',
]
X = df[cols]

Rebuild the ColumnTransformer without the passthrough so that its output can be examined in a DataFrame. 

In [25]:
# 'Name' is given as a plain string (not a list) so the vectorizer receives
# one-dimensional input. No `remainder` argument is given here.
ct = make_column_transformer(
    (ohe, ['Embarked', 'Sex']),
    (vect, 'Name'),
)

In [26]:
# `ColumnTransformer.get_feature_names` was removed in scikit-learn 1.2;
# prefer `get_feature_names_out` and fall back to the old method on older
# releases. The lookup only needs the method to exist; it is called after the
# transformer has been fitted below.
_get_ct_names = getattr(ct, 'get_feature_names_out', None) or ct.get_feature_names
# Fit first (the names are only available on a fitted transformer), then
# densify the sparse result for display.
_ct_matrix = ct.fit_transform(X).toarray()
pd.DataFrame(data=_ct_matrix, columns=_get_ct_names())

Unnamed: 0,onehotencoder__x0_C,onehotencoder__x0_Q,onehotencoder__x0_S,onehotencoder__x1_female,onehotencoder__x1_male,countvectorizer__achem,countvectorizer__adele,countvectorizer__allen,countvectorizer__berg,countvectorizer__bradley,...,countvectorizer__nasser,countvectorizer__nicholas,countvectorizer__oscar,countvectorizer__owen,countvectorizer__palsson,countvectorizer__peel,countvectorizer__thayer,countvectorizer__timothy,countvectorizer__vilhelmina,countvectorizer__william
0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
7,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
9,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Make column transformer and add in the *passthrough* statement.

In [27]:
# One-hot encode the categoricals, vectorize 'Name', and pass the remaining
# columns through unchanged.
ct = make_column_transformer(
    (ohe, ['Embarked', 'Sex']),
    (vect, 'Name'),
    remainder='passthrough',
)

Run the ColumnTransformer

In [19]:
# Fit and transform X; the combined result is displayed as a sparse matrix.
ct.fit_transform(X)

<10x47 sparse matrix of type '<class 'numpy.float64'>'
	with 78 stored elements in Compressed Sparse Row format>

Now, update the pipe.

In [20]:
# Rebuild the pipeline around the updated column transformer.
pipe = make_pipeline(ct, logreg)

Fit the pipeline and examine the steps. 

In [21]:
# The trailing semicolon suppresses the notebook's echo of the fitted pipeline.
pipe.fit(X, y);
# Uncomment to list the fitted steps by name:
# pipe.named_steps

Update the testing data frame to include the new column. 

In [22]:
# Re-select the test features now that `cols` includes 'Name'.
X_new = df_new[cols]
# Bare expression so the notebook displays the frame.
X_new

Unnamed: 0,Parch,Fare,Embarked,Sex,Name
0,0,7.8292,Q,male,"Kelly, Mr. James"
1,0,7.0,S,female,"Wilkes, Mrs. James (Ellen Needs)"
2,0,9.6875,Q,male,"Myles, Mr. Thomas Francis"
3,0,8.6625,S,male,"Wirz, Mr. Albert"
4,1,12.2875,S,female,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)"
5,0,9.225,S,male,"Svensson, Mr. Johan Cervin"
6,0,7.6292,Q,female,"Connolly, Miss. Kate"
7,1,29.0,S,male,"Caldwell, Mr. Albert Francis"
8,0,7.2292,C,female,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)"
9,0,24.15,S,male,"Davies, Mr. John Samuel"


Make predictions.

In [23]:
# Predict on the updated test features with the refitted pipeline.
pipe.predict(X_new)

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0])