# Deploying machine learning models 

Trained two machine learning algorithms, Logistic Regression and Naive Bayes, and compared them on the basis of accuracy and F1-score.

In [7]:
# importing libraries

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

In [8]:
# Load the preprocessed training and test sets from their CSV files.

train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('train_preprocessed.csv', 'test_preprocessed.csv')
)

In [9]:
# Build the model inputs: the comment text (x) and the six binary label columns (y).

# astype('U') coerces every entry to a unicode string; note that a missing
# comment (NaN) becomes the literal string 'nan' rather than being dropped.
x = train['comment_text'].values.astype('U')

# Select all target columns in one step instead of growing an empty frame
# column by column; .copy() keeps y independent of `train`.
y = train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].copy()

y

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0,0,0,0,0,0
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,1,0,1,0,1,0
4,1,1,1,0,1,0
...,...,...,...,...,...,...
159566,1,1,1,0,1,1
159567,1,0,0,0,0,0
159568,1,0,0,0,1,1
159569,1,0,0,0,1,0


### Vectorizing the text data using TF-IDF vectorizer

Machine learning algorithms cannot work on raw text directly, so feature extraction is used to convert the text into a numeric feature matrix. 
I have used TF-IDF (term frequency-inverse document frequency) for this on the text data.

In [10]:
# TF-IDF vectorizer: English stop words are removed and the vocabulary is
# capped at 5000 features.
vec = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
)
vec

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=5000,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [11]:
# Hold out 20% of the labelled data for validation (fixed seed so the split is reproducible).

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=233)

# Fit the vectorizer on the training split only, then reuse the fitted
# vectorizer to transform the validation split.
x_vec = vec.fit_transform(X_train)
x_vec_test = vec.transform(X_test)

# Vectorize the final test data with the same fitted vectorizer.
# Later cells expect `test` to be the TF-IDF matrix, so the name is rebound here.
# The raw text is kept in `test_text` so that re-running this cell does not try
# to index the already-transformed sparse matrix by column name.
if isinstance(test, pd.DataFrame):
    test_text = test['comment_text'].values.astype('U')
test = vec.transform(test_text)


In [63]:
# the label names, taken from y's columns (a pandas Index, in column order)
labels = y.columns
labels

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

### Defining Logistic Regression model
Tried different values of C (1.0, 3.0, 5.0, 12.0) and kept the one that gave the best accuracy and F1 score.

In [31]:
def logisticreg(X_train, X_test, y_train, y_test, test, C=3.0, max_iter=1000):
    """Fit a logistic regression for one label and score it on the validation split.

    Parameters
    ----------
    X_train, X_test
        Feature matrices for the training and validation splits.
    y_train, y_test
        Targets for a single label, aligned with X_train / X_test.
    test
        Feature matrix of the final test data to predict probabilities for.
    C
        Value passed to ``LogisticRegression(C=...)``; defaults to 3.0.
    max_iter
        Value passed to ``LogisticRegression(max_iter=...)``; defaults to 1000.

    Returns
    -------
    tuple
        ``(accuracy, f1)`` computed on the validation split.

    Side effects
    ------------
    Appends the second column of ``predict_proba(test)`` (the positive class
    for 0/1 targets) to the module-level list ``model_logreg``, which must
    exist before this function is called.
    """
    logreg = LogisticRegression(C=C, max_iter=max_iter)
    logreg.fit(X_train, y_train)

    y_pred = logreg.predict(X_test)

    # predicted probability for the final test data, collected per label
    model_logreg.append(logreg.predict_proba(test)[:, 1])

    return accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)

In [29]:
# Per-label probabilities for the final test data, appended by logisticreg()
# in the same order as `labels`. Reset here so re-running the cell starts clean.
model_logreg = []

# Iterate over the labels themselves rather than a hard-coded range(0, 6),
# so the loop stays correct if the set of label columns changes.
for label in labels:

    print("For label:", label)

    acc, f1 = logisticreg(x_vec, x_vec_test, y_train[label], y_test[label], test)

    print("Accuracy-", acc)
    print("F1-score-", f1)
    print("\n")

For label: toxic
Accuracy- 0.9574808083973053
F1-score- 0.7464972912385578


For label: severe_toxic
Accuracy- 0.9899733667554441
F1-score- 0.3725490196078432


For label: obscene
Accuracy- 0.9783487388375373
F1-score- 0.7709645343056016


For label: threat
Accuracy- 0.9971486761710794
F1-score- 0.2352941176470588


For label: insult
Accuracy- 0.9712047626507911
F1-score- 0.6639853747714808


For label: identity_hate
Accuracy- 0.9922920256932477
F1-score- 0.3910891089108911




In [44]:
# predicted probabilities from the logistic regression models: one array per label, one entry per test comment

model_logreg

[array([0.99749107, 0.00379326, 0.01144098, ..., 0.00295491, 0.01399935,
        0.99228101]),
 array([0.13117531, 0.00175925, 0.00112061, ..., 0.00088052, 0.00223239,
        0.00172593]),
 array([0.99797392, 0.0014692 , 0.00174237, ..., 0.00438719, 0.01677793,
        0.94202003]),
 array([0.06389543, 0.00046256, 0.00012203, ..., 0.00058336, 0.00222849,
        0.0034816 ]),
 array([0.95462112, 0.00871706, 0.0022137 , ..., 0.00211739, 0.02075944,
        0.55966147]),
 array([0.25381916, 0.00226104, 0.00033727, ..., 0.00025751, 0.0271829 ,
        0.00769689])]

In [50]:
# load the sample submission file; its label columns are filled with the
# logistic regression probabilities in the next cell

submission_logreg = pd.read_csv('sample_submission.csv')
submission_logreg.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.5,0.5,0.5,0.5,0.5,0.5
1,0000247867823ef7,0.5,0.5,0.5,0.5,0.5,0.5
2,00013b17ad220c46,0.5,0.5,0.5,0.5,0.5,0.5
3,00017563c3f7919a,0.5,0.5,0.5,0.5,0.5,0.5
4,00017695ad8997eb,0.5,0.5,0.5,0.5,0.5,0.5


In [61]:
# Fill each label column with the matching per-label probability array.
# model_logreg was filled in `labels` order, so the two are zipped together.
assert len(model_logreg) == len(labels), "expected one probability array per label"
for label, probs in zip(labels, model_logreg):
    submission_logreg[label] = probs

# index=False keeps the pandas row index out of the file, so the CSV contains
# only the columns of the submission frame (id plus the label columns).
submission_logreg.to_csv('submission_logreg.csv', index=False)  # saving the submissions
submission_logreg.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.997491,0.131175,0.997974,0.063895,0.954621,0.253819
1,0000247867823ef7,0.003793,0.001759,0.001469,0.000463,0.008717,0.002261
2,00013b17ad220c46,0.011441,0.001121,0.001742,0.000122,0.002214,0.000337
3,00017563c3f7919a,0.002199,0.00121,0.001913,0.000207,0.002106,0.000229
4,00017695ad8997eb,0.012713,0.000351,0.002811,0.000617,0.003881,0.000298


### Defining Naive Bayes model

In [54]:
def nb_classifier(X_train, X_test, y_train, y_test, test, alpha=1.0):
    """Fit a multinomial Naive Bayes model for one label and score it on the validation split.

    Parameters
    ----------
    X_train, X_test
        Feature matrices for the training and validation splits.
    y_train, y_test
        Targets for a single label, aligned with X_train / X_test.
    test
        Feature matrix of the final test data to predict probabilities for.
    alpha
        Value passed to ``MultinomialNB(alpha=...)``; defaults to 1.0.

    Returns
    -------
    tuple
        ``(accuracy, f1)`` computed on the validation split.

    Side effects
    ------------
    Appends the second column of ``predict_proba(test)`` (the positive class
    for 0/1 targets) to the module-level list ``model_nb``, which must exist
    before this function is called.
    """
    nb = MultinomialNB(alpha=alpha)
    nb.fit(X_train, y_train)

    y_pred = nb.predict(X_test)

    # predicted probability for the final test data, collected per label
    model_nb.append(nb.predict_proba(test)[:, 1])

    return accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)

In [55]:
# Per-label probabilities for the final test data (not the fitted models),
# appended by nb_classifier() in the same order as `labels`.
# Reset here so re-running the cell starts clean.
model_nb = []

# Iterate over the labels themselves rather than a hard-coded range(0, 6),
# so the loop stays correct if the set of label columns changes.
for label in labels:
    print("For label-", label)

    acc, f1 = nb_classifier(x_vec, x_vec_test, y_train[label], y_test[label], test)

    print("Accuracy-", acc)
    print("F1-score-", f1)
    print("\n")

For label- toxic
Accuracy- 0.9493968353438822
F1-score- 0.657476139978791


For label- severe_toxic
Accuracy- 0.9896287012376626
F1-score- 0.19854721549636806


For label- obscene
Accuracy- 0.9723014256619145
F1-score- 0.6733185513673318


For label- threat
Accuracy- 0.996960676797744
F1-score- 0.0


For label- insult
Accuracy- 0.9675387748707505
F1-score- 0.5785191212367778


For label- identity_hate
Accuracy- 0.991540028199906
F1-score- 0.1176470588235294




In [57]:
# predicted probabilities from the naive bayes models: one array per label, one entry per test comment
model_nb

[array([0.98545498, 0.01653995, 0.03079137, ..., 0.0067458 , 0.03861701,
        0.64189787]),
 array([0.15070855, 0.00145118, 0.00095912, ..., 0.00146327, 0.00307092,
        0.00696372]),
 array([0.96109403, 0.00778984, 0.01273684, ..., 0.00339215, 0.01858582,
        0.3296341 ]),
 array([0.0059854 , 0.00040092, 0.00014569, ..., 0.00087884, 0.0024659 ,
        0.00318041]),
 array([0.92439506, 0.00963679, 0.01034718, ..., 0.00312236, 0.02029692,
        0.25974141]),
 array([0.1367684 , 0.00169943, 0.00080837, ..., 0.00143969, 0.02067246,
        0.01107733])]

In [58]:
# load the sample submission file; its label columns are filled with the
# naive bayes probabilities in the next cell

submission_nb = pd.read_csv('sample_submission.csv')
submission_nb.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.5,0.5,0.5,0.5,0.5,0.5
1,0000247867823ef7,0.5,0.5,0.5,0.5,0.5,0.5
2,00013b17ad220c46,0.5,0.5,0.5,0.5,0.5,0.5
3,00017563c3f7919a,0.5,0.5,0.5,0.5,0.5,0.5
4,00017695ad8997eb,0.5,0.5,0.5,0.5,0.5,0.5


In [66]:
# Fill each label column with the matching per-label probability array.
# model_nb was filled in `labels` order, so the two are zipped together.
assert len(model_nb) == len(labels), "expected one probability array per label"
for label, probs in zip(labels, model_nb):
    submission_nb[label] = probs

# index=False keeps the pandas row index out of the file, so the CSV contains
# only the columns of the submission frame (id plus the label columns).
submission_nb.to_csv('submission_nb.csv', index=False)    # saving the submissions
submission_nb.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.985455,0.150709,0.961094,0.005985,0.924395,0.136768
1,0000247867823ef7,0.01654,0.001451,0.00779,0.000401,0.009637,0.001699
2,00013b17ad220c46,0.030791,0.000959,0.012737,0.000146,0.010347,0.000808
3,00017563c3f7919a,0.00682,0.000271,0.003074,4.5e-05,0.002883,0.00018
4,00017695ad8997eb,0.039287,0.000608,0.016786,0.000129,0.013979,0.00055


## Comparison

On comparing the Logistic Regression and Naive Bayes models on the basis of accuracy and F1-score, we observe that Logistic Regression outperformed Naive Bayes, with better accuracy as well as a better F1-score for every label. 

Although the labels severe_toxic, threat and identity_hate have high accuracy, they have low F1-scores (< 0.5), which indicates that the models for these labels do not handle false negatives or false positives well.