# Example notebook with custom transformers

In [5]:
# Show the current working directory before it is changed in the next cell.
%pwd

'/Users/marshall.carter/Documents/my_repos/transformers'

In [6]:
# Move up one directory; presumably this makes the local `transformers`
# package importable by the import cell below (TODO confirm).
# NOTE(review): this cell is not idempotent. Re-running it moves up one more
# level each time, so run it exactly once per kernel session.
%cd ..

/Users/marshall.carter/Documents/my_repos


In [8]:
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import FeatureUnion, make_pipeline
from sklearn.ensemble import RandomForestClassifier

# Import custom libraries
from transformers.general import GetDummies
from transformers.strings import StringCleaner, StringSimilarity

### An example dataset of personal information
Let's build a simple model to predict which record pairs are matches.

In [9]:
# Toy dataset of record pairs. Each row holds the fields of two records
# (suffixes `_1` and `_2`) and a `match` label (1 = the pair is a match).
# `phone_m` is presumably a phone-match flag; verify against the data source.
# The irregular spacing, casing and typos in the strings are intentional.
df_names = pd.DataFrame(
    data=[
        ("Marsh  all", "Marshall", "CRTER", "carter",
         "121  Wingra Ave", "121, Winga Ave", "N", 1),
        ("Da vid", "Dave", "JEFFERSON", "jefferson",
         "2000 American Parkway ", "200 American Pkwy", "Y", 1),
        ("John", "Johnathan", "WILLMS", "WILL iams",
         "1 South,  Ave", "1 S Avenue", "Y", 1),
        ("Bill", "Andy", "anderson", "blake",
         "123 Main St.", "2 Kailua Ave.", "N", 0),
        ("Jane", "Robert", "folwer", "franklin",
         "1321 Maynard St.", "4 Wilson Blvd", "N", 0),
    ],
    columns=[
        "name_first_1", "name_first_2",
        "name_last_1", "name_last_2",
        "address_1", "address_2",
        "phone_m", "match",
    ],
)

df_names.head()

Unnamed: 0,name_first_1,name_first_2,name_last_1,name_last_2,address_1,address_2,phone_m,match
0,Marsh all,Marshall,CRTER,carter,121 Wingra Ave,"121, Winga Ave",N,1
1,Da vid,Dave,JEFFERSON,jefferson,2000 American Parkway,200 American Pkwy,Y,1
2,John,Johnathan,WILLMS,WILL iams,"1 South, Ave",1 S Avenue,Y,1
3,Bill,Andy,anderson,blake,123 Main St.,2 Kailua Ave.,N,0
4,Jane,Robert,folwer,franklin,1321 Maynard St.,4 Wilson Blvd,N,0


In [11]:
# Split the frame into the target column and the predictor columns.
label_col = "match"
# Iterating a DataFrame yields its column labels in order.
feature_cols = [name for name in df_names if name != label_col]

df_label = df_names[label_col]
df_features = df_names[feature_cols]

### Construct the feature pipeline

In [12]:
# Fixed seed for the random forest so that the cross-validation score and
# predictions shown below are reproducible after a kernel restart.
RANDOM_SEED = 42

# One sub-pipeline per column group. Each string pipeline runs the custom
# StringCleaner and then StringSimilarity with the listed metrics on a pair
# of columns.
name_first_pipe = make_pipeline(StringCleaner("name"),
                                StringSimilarity(["jaro_winkler"]))


name_last_pipe = make_pipeline(StringCleaner("name"),
                               StringSimilarity(["jaro_winkler"]))


# NOTE(review): the address columns are cleaned with StringCleaner("name"),
# the same argument used for the name columns. Confirm this is intended and
# that no address-specific cleaning mode should be used instead.
address_pipe = make_pipeline(StringCleaner("name"),
                             StringSimilarity(["jaro_winkler", "token_set"]))


# The phone flag is passed through the custom GetDummies transformer.
phone_pipe = make_pipeline(GetDummies())


# Route each column group to its sub-pipeline.
preprocessor = ColumnTransformer(transformers=[
                                   ('first_names',   name_first_pipe, ["name_first_1", "name_first_2"]),
                                   ('last_names',    name_last_pipe, ["name_last_1", "name_last_2"]),
                                   ('addresses',     address_pipe, ["address_1", "address_2"]),
                                   ('phone_numbers', phone_pipe, ["phone_m"])])


clf = RandomForestClassifier(n_estimators=50, n_jobs=-1, random_state=RANDOM_SEED)

# Full model: feature engineering followed by the classifier.
model_pipeline = make_pipeline(preprocessor, clf)

#### Model cross validation score

In [13]:
# Only two folds are used here. A real-world dataset would be much bigger,
# and a larger number of folds would be used for cross validation.
model_score = cross_val_score(
    estimator=model_pipeline,
    X=df_features,
    y=df_label,
    scoring="f1",
    cv=2,
)
model_score.mean()

1.0

#### The model feature vector

In [14]:
# Run only the feature-engineering step to inspect the matrix that the
# classifier step of the pipeline receives.
preprocessor.fit_transform(X=df_features)

array([[ 98,  95,  98,  96,   0],
       [ 80, 100,  92,  89,   1],
       [ 89,  93,  76,  67,   1],
       [  0,  55,  62,  35,   0],
       [ 47,  53,  44,  29,   0]])

#### Using the model for prediction
A great feature of scikit-learn pipelines is that all the feature engineering and prediction are performed by a single method call. This greatly simplifies model deployment.

In [15]:
# Fit the whole pipeline on all rows, then predict on the same rows. This is
# a demonstration on the training data, not an estimate of generalisation.
fitted_model = model_pipeline.fit(X=df_features, y=df_label)

fitted_model.predict(X=df_features)

array([1, 1, 1, 0, 0])