# ML Pipeline Preparation
### 1. Import libraries and load data from database.

In [1]:
# import libraries
import nltk
nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger','stopwords'])
import time
import pickle
import re
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.metrics import confusion_matrix, classification_report, precision_recall_fscore_support
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm



[nltk_data] Downloading package punkt to
[nltk_data]     /Users/kamil.wiktorski/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kamil.wiktorski/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/kamil.wiktorski/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kamil.wiktorski/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
#load data from database

df = pd.read_sql_table('Messages_Categories', 'sqlite:///ETLData.db')
drop_var = ['id','message','original','genre']

y_var = list(set(df.columns) - set(drop_var) -set('message'))
X = df['message'].values
Y = df[y_var].values

#remove Y values where only more than 1 classes
y_unique = df[y_var].nunique()

Y

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

### 2. Tokenization

In [3]:
text_only = r'[^A-Za-z]'
url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

def tokenize(text):   
    detected_urls = re.findall(url_regex,text)
    for url in detected_urls:
        text = text.replace(url,"urlplaceholder")
    
    stopw = stopwords.words('english') + ['urlplaceholder']
    text = re.sub(text_only," ",text)
    
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    
    tokens_wo_stopwords = [i for i in tokens if i not in stopw]
    
    clean_tokens = []
    for tok in tokens_wo_stopwords:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens

### 3. MLpipeline

In [4]:
vect = CountVectorizer(tokenizer=tokenize)
vect.fit_transform(X)

<26216x29753 sparse matrix of type '<class 'numpy.int64'>'
	with 346519 stored elements in Compressed Sparse Row format>

In [5]:
[(k,v) for k,v in vect.vocabulary_.items() if 'weather' in k][:20]

[('weather', 28834),
 ('tweathergeek', 27407),
 ('birthdayweatherprobs', 2840),
 ('weathered', 28835),
 ('weatherproof', 28837),
 ('weatherman', 28836),
 ('weatherproofing', 28838)]

In [6]:
pipeline = Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize, lowercase=True)),
                ('tfidf', TfidfTransformer()),
                ('clf', MultiOutputClassifier(RandomForestClassifier(class_weight="balanced")))
            ])

### 4. Train pipeline
- Splitting data into train and test sets
- Training pipeline

In [7]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=654, test_size=0.25)
model = pipeline
model.fit(X_train, Y_train)

Pipeline(steps=[('vect',
                 CountVectorizer(tokenizer=<function tokenize at 0x7f8be8932700>)),
                ('tfidf', TfidfTransformer()),
                ('clf',
                 MultiOutputClassifier(estimator=RandomForestClassifier(class_weight='balanced')))])

### 5. Testing model

In [8]:
Y_pred = model.predict(X_test)
len(Y_pred)

6554

In [9]:
for column, test, pred in zip(y_var, Y_test.T, Y_pred.T):
    print(column, classification_report(test, pred))

  _warn_prf(average, modifier, msg_start, len(result))


shops               precision    recall  f1-score   support

           0       1.00      1.00      1.00      6525
           1       0.00      0.00      0.00        29

    accuracy                           1.00      6554
   macro avg       0.50      0.50      0.50      6554
weighted avg       0.99      1.00      0.99      6554

request               precision    recall  f1-score   support

           0       0.91      0.97      0.94      5379
           1       0.82      0.54      0.65      1175

    accuracy                           0.90      6554
   macro avg       0.86      0.76      0.80      6554
weighted avg       0.89      0.90      0.89      6554

electricity               precision    recall  f1-score   support

           0       0.98      1.00      0.99      6433
           1       0.80      0.03      0.06       121

    accuracy                           0.98      6554
   macro avg       0.89      0.52      0.53      6554
weighted avg       0.98      0.98      0.97     

Classification_report shows several importan metrics that would be helpful in multioutpit classifier.

Why accuracy score is not good for this model?

In [10]:
(Y_pred == Y_test).mean()

0.9470128505069

Accuracy score for predicted model seems very high at **94.7%**

In [11]:
(np.zeros(np.shape(Y_test)) == Y_test).mean()

0.9107415318889228

However  simulating all datapoints to be 0 (negative predcition) we receive **91.0%** score. This is due to imbalanced number of positive cases and the fact we proceed with multiple label classifier. 

The more proper score evalution will be F1-score.

In [12]:
def test_model(Y_test, Y_pred, average_type=None):
    results = dict()
    # classification_report is only printing a string which is unredible, using precision_recall_fscore_support instead
    for col in zip(Y_test.T, Y_pred.T, y_var):  
        y_test, y_pred, var_name = col[0], col[1], col[2]
        class_report=precision_recall_fscore_support(y_test, y_pred, average=average_type)
        d = {   
            'precision': class_report[0],
            'recall': class_report[1],
            'f1-score': class_report[2],
            'support': class_report[3]
            }
        results[var_name] = d
        
    return pd.DataFrame.from_dict(results, orient='index').drop('support', axis=1)#[0].apply(lambda x: pd.Series(x))
        

def test_model_mean(Y_test, Y_pred):
    return test_model(Y_test, Y_pred).mean()
    
from sklearn.metrics import make_scorer

def get_model_mean(Y_test, Y_pred):
    return test_model(Y_test, Y_pred, 'macro').mean()['f1-score']

custom_loss = make_scorer(get_model_mean, greater_is_better=True)


    

In [13]:
test_model(Y_test, Y_pred, 'macro')

Unnamed: 0,precision,recall,f1-score
shops,0.497788,0.5,0.498891
request,0.864186,0.756627,0.795037
electricity,0.891067,0.516451,0.527201
child_alone,1.0,1.0,1.0
water,0.917869,0.652459,0.715302
offer,0.497787,0.499847,0.498815
missing_people,0.494355,0.5,0.497161
tools,0.497406,0.5,0.4987
search_and_rescue,0.485878,0.499686,0.492685
cold,0.990304,0.518939,0.531601


In [14]:
test_model(Y_test, Y_pred, 'macro').describe()


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,precision,recall,f1-score
count,36.0,36.0,36.0
mean,0.753478,0.593247,0.60634
std,0.173968,0.129018,0.141211
min,0.485878,0.499686,0.484388
25%,0.588239,0.501318,0.498315
50%,0.803714,0.522596,0.533803
75%,0.891637,0.661022,0.719054
max,1.0,1.0,1.0


Average F1-score among all classes with using the first model is **60.6%** and each class scores span from 48.7% to 75.2%

In [15]:
test_model(Y_test, np.zeros(Y_test.shape), 'macro').describe()#.boxplot()


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,precision,recall,f1-score
count,36.0,36.0,36.0
mean,0.46818,0.509259,0.484769
std,0.119397,0.088591,0.109348
min,0.077713,0.333333,0.126041
25%,0.45497,0.5,0.476423
50%,0.476274,0.5,0.487849
75%,0.491189,0.5,0.495555
max,1.0,1.0,1.0


Had we calculated score with placing only zero class as predicted the F1-score is around **48.5%** with the span of 10.9% to 49.6%



### 6. Improving model with GridSearch


In [17]:
start = time.time()

# those paraemters did not bring improvement
# parameters = {
#     'vect__stop_words': ['english'],
#     'vect__ngram_range': [(1,2), (1,3), (2,3)] 
# } 

parameters = {
        'vect__ngram_range': [(1,1), (1,2)],
        'clf__estimator__class_weight': [None,'balanced']
    } 

cv = GridSearchCV(pipeline, param_grid=parameters, scoring=custom_loss, verbose=True)
cv.fit(X_train, Y_train)
end = time.time()

print('time taken = {}s'.format(end - start))
print(cv.best_params_)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_star

time taken = 8193.915993213654s
{'clf__estimator__class_weight': None, 'vect__ngram_range': (1, 1)}


### 7. Testing improved model

In [19]:
Y_pred_CV = cv.predict(X_test)
cv.best_params_


{'clf__estimator__class_weight': None, 'vect__ngram_range': (1, 1)}

In [20]:
test_model(Y_test, Y_pred_CV, 'macro')

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,precision,recall,f1-score
shops,0.497788,0.5,0.498891
request,0.871084,0.739988,0.782916
electricity,0.990844,0.504132,0.503576
child_alone,1.0,1.0,1.0
water,0.940675,0.680958,0.749439
offer,0.497788,0.5,0.498891
missing_people,0.494355,0.5,0.497161
tools,0.497406,0.5,0.4987
search_and_rescue,0.843396,0.513357,0.518996
cold,0.955188,0.549165,0.584412


In [21]:
test_model(Y_test, Y_pred_CV, 'macro').describe()

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,precision,recall,f1-score
count,36.0,36.0,36.0
mean,0.775108,0.604352,0.620596
std,0.164429,0.13361,0.144462
min,0.490382,0.499689,0.486675
25%,0.707725,0.503873,0.498891
50%,0.825997,0.542459,0.565342
75%,0.882662,0.682823,0.745302
max,1.0,1.0,1.0


In [22]:
cv.cv_results_

{'mean_fit_time': array([199.282586  , 447.72332702, 232.25124688, 659.60427208]),
 'std_fit_time': array([ 1.3025753 , 10.26836782,  2.58325328, 52.39909901]),
 'mean_score_time': array([ 9.90824642, 15.22063718,  7.63748269, 10.88952551]),
 'std_score_time': array([0.18034635, 0.30755399, 0.06285847, 0.51306744]),
 'param_clf__estimator__class_weight': masked_array(data=[None, None, 'balanced', 'balanced'],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_vect__ngram_range': masked_array(data=[(1, 1), (1, 2), (1, 1), (1, 2)],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'clf__estimator__class_weight': None,
   'vect__ngram_range': (1, 1)},
  {'clf__estimator__class_weight': None, 'vect__ngram_range': (1, 2)},
  {'clf__estimator__class_weight': 'balanced', 'vect__ngram_range': (1, 1)},
  {'clf__estimator__class_weight': 'balanced', 'vect__ngram_range': (1, 2

Improved model reaches f1-score = **62.0%** with particular scores ranging from 48.7% to 100.0%.

### 8. Other algorithms
- Testing Support Vector Machine and Naive Bayes instead of Random Forest


#### 8.1 Other ML algorithms

##### SVM

-- long training time, no particular gain

In [None]:
# pipeline_SVM = Pipeline([
#                 ('vect', CountVectorizer(tokenizer=tokenize)),
#                 ('tfidf', TfidfTransformer()),
#                 ('scaler',StandardScaler(with_mean=False)),
#                 ('svm', MultiOutputClassifier(svm.SVC()))
#             ])

In [None]:
# pipeline_SVM.fit(X_train, Y_train)


In [None]:
# Y_pred_SVM = pipeline_SVM.predict(X_test)


In [None]:
# test_model(Y_test, Y_pred_SVM, 'macro').describe()

#### Naive Bayes

In [24]:
pipeline_NB = Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize)),
                ('tfidf', TfidfTransformer()),
                ('NB', MultiOutputClassifier(MultinomialNB()))
            ])
pipeline_NB.fit(X_train, Y_train)

Pipeline(steps=[('vect',
                 CountVectorizer(tokenizer=<function tokenize at 0x7f8be8932700>)),
                ('tfidf', TfidfTransformer()),
                ('NB', MultiOutputClassifier(estimator=MultinomialNB()))])

In [25]:
Y_pred_NB = pipeline_NB.predict(X_test)


In [26]:
test_model(Y_test, Y_pred_NB, 'macro')

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,precision,recall,f1-score
shops,0.497788,0.5,0.498891
request,0.8481,0.606935,0.634925
electricity,0.490769,0.5,0.495341
child_alone,1.0,1.0,1.0
water,0.466046,0.499918,0.482388
offer,0.497788,0.5,0.498891
missing_people,0.494355,0.5,0.497161
tools,0.497406,0.5,0.4987
search_and_rescue,0.485886,0.5,0.492842
cold,0.48993,0.5,0.494914


In [27]:
test_model(Y_test, Y_pred_NB, 'macro').describe()

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,precision,recall,f1-score
count,36.0,36.0,36.0
mean,0.573214,0.529401,0.523179
std,0.156085,0.10001,0.106355
min,0.432824,0.364349,0.349645
25%,0.477723,0.5,0.486504
50%,0.493172,0.5,0.494368
75%,0.645777,0.500959,0.498748
max,1.0,1.0,1.0


### Best model

Best results achieved with tuned initial pipeline i.e. using
- CountVectorizer with custom tokenizer an 1-grams
- TfIDF Transformer
- Random Forest classifier with equally weighted classes

1-grams in Vectorizer and not weighted class in estimator.

In [29]:
# Saving Best Model
pickle.dump(cv, open('Classifier_Jupyter', 'wb'))