# Model Performance Analyzer

We compare the performance of several models on a simple classification problem: 
Given a message sent to a participant, what is the probability of getting back a response? 
The following models were tested in this notebook; their accuracy and ROC AUC scores are recorded and plotted for comparison.
    1. Logistic regression using edd, language, char and words
    2. Gradient Boosting classifier using edd, language, char and words
    3. Logistic regression using bow only
    4. Gradient Boosting classifier using bow only
    5. Logistic regression using bow, edd, & language
    6. Gradient Boosting classifier using bow, edd, & language
    7. Logistic regression using bi-grams only
    8. Gradient Boosting classifier using bi-grams only
    9. Logistic regression using bi-grams, edd, & language
    10. Gradient Boosting classifier using bi-grams, edd, & language
    11. Logistic regression using word2vec only
    12. Gradient boosting using word2vec only
    13. Logistic regression using word2vec, edd, & language only
    14. Gradient boosting using word2vec, edd, & language only
    15. Logistic regression using doc2vec only
    16. Gradient boosting using doc2vec only
    17. Logistic regression using doc2vec, edd, & language only
    18. Gradient boosting using doc2vec, edd, & language only
 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn.linear_model
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RepeatedKFold
import sklearn.ensemble
import statistics
# Limit DataFrame/Series reprs to 4 rows so displayed frames stay short.
pd.options.display.max_rows = 4

In [2]:
# Raw SMS export workbook (absolute local path; adjust when running elsewhere).
MESSAGES_XLSX = "C:/Users/user/UW/keshet - mwach_interaction/SMSdata/mwachNeo/allSMSdatabase/messages_export_2019-03-05.xlsx"

messages_df = pd.read_excel(MESSAGES_XLSX)
messages_df.shape

(59506, 19)

In [3]:
# Keep every row whose sent_by is not 'nurse'.
# .copy() makes the result an independent frame: later cells add columns to
# messages_df, and assigning into a boolean-mask slice can make pandas emit
# SettingWithCopyWarning.
messages_df = messages_df[messages_df['sent_by'] != 'nurse'].copy()
messages_df.shape

(45971, 19)

In [4]:
# Preview the messages frame; as the last expression of the cell it is rendered
# with the notebook's rich display.
messages_df

Unnamed: 0,mid,pid,day,timestamp,auto,external,delta_human,delta,delta_last,study_wk,edd_wk,chars,words,topic,related,sent_by,language,translated,original
0,808,15,Tue,2017-12-05 06:36:59.614,signup.two-way.normal.0,Success,,,,0.000000,-9.000000,250,42,,,system,swahili,Welcome to Mobile WACh NEO! Good job coming in...,Karibu kwa Mobile WACh NEO. Kazi nzuri kwa kuj...
1,859,15,Tue,2017-12-05 06:55:45.982,,,18m,1126.367329,,0.000000,-9.000000,30,4,validation,1.0,participant,english,,Validation Code Correct: 13514
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59503,56257,800,Wed,2018-07-11 17:05:27.065,edd.two-way.normal.10,Failed,,,,8.142857,1.428571,168,24,41.5 weeks,,system,swahili,"{name}, this is {nurse} from {clinic}. Have yo...","Linet, huyu ni Mercy kutoka Rachuonyo. Umejifu..."
59504,56843,800,Sun,2018-07-15 17:05:23.401,edd.two-way.normal.14,Failed,,,,8.714286,2.000000,252,41,42 weeks,,system,swahili,"{name}, this is {nurse} from {clinic} Bad swel...","Linet, huyu ni Mercy kutoka Rachuonyo. kufura ..."


In [5]:
# Boolean flag: True for rows whose sender is the automated system.
messages_df['system_msg'] = messages_df['sent_by'].eq('system')
print("Number of System Messages: ", messages_df['system_msg'].sum())

Number of System Messages:  26021


In [37]:
# Group every message under the system message that precedes it: the running sum
# of the boolean system_msg flag goes up by one at each system message, so a system
# message and the non-system rows that follow it share one msg_group id.
# The size of a group is then 1 + the number of rows following its system message.
messages_df['msg_group'] = messages_df['system_msg'].cumsum()
messages_df['msg_group'].value_counts()

7115     114
24446     23
        ... 
9385       1
2047       1
Name: msg_group, Length: 26021, dtype: int64

In [38]:
# A group counts as "responded" when it holds more than one row, i.e. at least one
# non-system row follows the system message before the next system message.
# NOTE(review): groups follow row order across the whole frame, not per participant —
# confirm the export is ordered so replies sit directly after the message they answer.
response = messages_df['msg_group'].value_counts().gt(1)

In [40]:
# Keep only the system-sent rows and index them by their msg_group id, so the
# `response` Series (indexed by msg_group) aligns row-for-row on assignment.
systemMessagesDF = (
    messages_df
    .loc[messages_df['sent_by'] == "system"]
    .set_index('msg_group')
)
systemMessagesDF['response'] = response
systemMessagesDF

Unnamed: 0_level_0,mid,pid,day,timestamp,auto,external,delta_human,delta,delta_last,study_wk,...,chars,words,topic,related,sent_by,language,translated,original,system_msg,response
msg_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,808,15,Tue,2017-12-05 06:36:59.614,signup.two-way.normal.0,Success,,,,0.000000,...,250,42,,,system,swahili,Welcome to Mobile WACh NEO! Good job coming in...,Karibu kwa Mobile WACh NEO. Kazi nzuri kwa kuj...,True,True
2,1926,15,Tue,2017-12-12 17:05:02.999,edd.two-way.normal.-56,Success,,,,1.000000,...,428,65,32 weeks,,system,swahili,"{name}, this is {nurse} from {clinic}. Sometim...","Mama Lucas, huyu ni Brenda kutoka Mathare. Wak...",True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26020,56257,800,Wed,2018-07-11 17:05:27.065,edd.two-way.normal.10,Failed,,,,8.142857,...,168,24,41.5 weeks,,system,swahili,"{name}, this is {nurse} from {clinic}. Have yo...","Linet, huyu ni Mercy kutoka Rachuonyo. Umejifu...",True,False
26021,56843,800,Sun,2018-07-15 17:05:23.401,edd.two-way.normal.14,Failed,,,,8.714286,...,252,41,42 weeks,,system,swahili,"{name}, this is {nurse} from {clinic} Bad swel...","Linet, huyu ni Mercy kutoka Rachuonyo. kufura ...",True,False


In [67]:
%%timeit
#calculate past response rate of system message (did this participant respond to the last message?)
#let loop through the messages and check current participant has a previous system message with a response
#for mid, pid, has_response from systemMesaagesDF:
def past_response(mid2, pid2):
    has_prev_response = 0
    #get prev system message to this participant
    # last system message from this participant before the current one
    last_msg = systemMessagesDF[(systemMessagesDF.pid == pid2) & (systemMesaagesDF.mid < mid2) ].tail(1) 
    
    if(len(last_msg) > 0):
        return last_msg.iloc[0]['response']
    else:
        return False
    
    
#past_response(808, 15)
systemMessagesDF['past_response'] = systemMessagesDF.apply(lambda row: past_response(row.mid, row['pid']), axis=1)

1min ± 902 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [76]:
# Cap frame reprs at 4 rows, then show the system-message frame.
pd.options.display.max_rows = 4
systemMessagesDF

Unnamed: 0_level_0,mid,pid,day,timestamp,auto,external,delta_human,delta,delta_last,study_wk,...,related,sent_by,language,translated,original,system_msg,response,distance,past_response,native_langauge
msg_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,808,15,Tue,2017-12-05 06:36:59.614,signup.two-way.normal.0,Success,,,,0.000000,...,,system,swahili,Welcome to Mobile WACh NEO! Good job coming in...,Karibu kwa Mobile WACh NEO. Kazi nzuri kwa kuj...,True,True,False,False,True
2,1926,15,Tue,2017-12-12 17:05:02.999,edd.two-way.normal.-56,Success,,,,1.000000,...,,system,swahili,"{name}, this is {nurse} from {clinic}. Sometim...","Mama Lucas, huyu ni Brenda kutoka Mathare. Wak...",True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26020,56257,800,Wed,2018-07-11 17:05:27.065,edd.two-way.normal.10,Failed,,,,8.142857,...,,system,swahili,"{name}, this is {nurse} from {clinic}. Have yo...","Linet, huyu ni Mercy kutoka Rachuonyo. Umejifu...",True,False,False,False,True
26021,56843,800,Sun,2018-07-15 17:05:23.401,edd.two-way.normal.14,Failed,,,,8.714286,...,,system,swahili,"{name}, this is {nurse} from {clinic} Bad swel...","Linet, huyu ni Mercy kutoka Rachuonyo. kufura ...",True,False,False,False,True


In [77]:
# Flag messages whose language is anything other than English.
systemMessagesDF['native_langauge'] = systemMessagesDF['language'].ne('english')
systemMessagesDF[['language', 'native_langauge']]

Unnamed: 0_level_0,language,native_langauge
msg_group,Unnamed: 1_level_1,Unnamed: 2_level_1
1,swahili,True
2,swahili,True
...,...,...
26020,swahili,True
26021,swahili,True


In [88]:
# Feature columns: one boolean indicator per message language, plus a flag for
# messages whose `external` status equals 'Success'.
for language_name in ('swahili', 'english', 'luo'):
    systemMessagesDF[language_name] = systemMessagesDF['language'] == language_name
systemMessagesDF['delivery_status'] = systemMessagesDF['external'] == 'Success'

In [89]:
# Build the design matrix X from the selected feature columns and the target y
# from the response flag, both cast to float.
feature_columns = [
    'edd_wk',
    'study_wk',
    'past_response',
    'native_langauge',
    'swahili',
    'english',
    'luo',
    'delivery_status',
]
X = systemMessagesDF.filter(items=feature_columns).astype(float)
y = systemMessagesDF['response'].astype(float)

In [90]:
# Scale X with scikit-learn's normalize: axis=0 applies the L2 norm per feature
# column rather than per sample row.
from sklearn.preprocessing import normalize

X = normalize(
    X,
    norm='l2',
    axis=0,
    copy=False,
    return_norm=False,
)
X

array([[-0.00886271,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.00650462],
       [-0.00787796,  0.00051463,  0.00966872, ...,  0.        ,
         0.        ,  0.00650462],
       [-0.00689322,  0.00102925,  0.00966872, ...,  0.        ,
         0.        ,  0.00650462],
       ...,
       [ 0.00098475,  0.00396997,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.00140678,  0.00419053,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.00196949,  0.0044846 ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])