In [1]:
import mlflow.sklearn
import pickle
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn import tree
import nltk.data
nltk.download('stopwords')
from nltk.corpus import stopwords

### Load, split, and preprocess the data

In [2]:
# Read the labelled tweets, pull out the raw text and the label column as
# arrays, and hold out 20% of the rows as a test split (seeded split).
train_val = pd.read_csv('train_val_tweets.csv')
texts, target = train_val['tweet'].values, train_val['label'].values
texts_train, texts_test, y_train, y_test = train_test_split(
    texts,
    target,
    test_size=0.2,
    random_state=42,
)


def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower()) + \
           ' '.join(emoticons).replace('-', '')
    tokenizer = TweetTokenizer()
    text = ' '.join(tokenizer.tokenize(text.lower()))
    return text


# Clean every tweet in both splits with `preprocessor`; the results replace
# the raw-text arrays with plain Python lists of cleaned strings.
texts_train = list(map(preprocessor, texts_train))
texts_test = list(map(preprocessor, texts_test))


### Initialize all models

In [3]:
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn import tree
import nltk.data

nltk.download('stopwords')
from nltk.corpus import stopwords


# TF-IDF vectorizer with its own accent stripping, lowercasing and
# preprocessing hook switched off.
tfidf = TfidfVectorizer(
    strip_accents=None,
    lowercase=False,
    preprocessor=None,
)
tokenizer = TweetTokenizer()
stop = stopwords.words('english')

# Candidate classifiers, all created with the same random seed.
lr_clf = LogisticRegression(random_state=42)
sgd_clf = SGDClassifier(loss='log_loss', random_state=42)
rf_clf = RandomForestClassifier(n_estimators=20, random_state=42)
xgb_clf = XGBClassifier(random_state=42)
tree_clf = tree.DecisionTreeClassifier(random_state=42)

# Grid for the 'vect' pipeline step: unigrams only, tried with the English
# stop-word list and with no stop-word removal.
param_grid = [
    {
        'vect__ngram_range': [(1, 1)],
        'vect__stop_words': [stop, None],
    },
]


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/bharani/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Find best model

In [4]:
# Short name -> unfitted classifier; the name is used as the key of both
# result dictionaries below and as the MLflow run name.
all_classifiers = {'lr': lr_clf,
                   'sgd': sgd_clf,
                   'rf': rf_clf,
                   'xgb': xgb_clf,
                   'tree': tree_clf,
                   }

best_models = {}    # classifier name -> best fitted TF-IDF + classifier pipeline
accuracy_dict = {}  # classifier name -> accuracy on the held-out test split
for clf_name, clf in all_classifiers.items():
    # One MLflow run per classifier so the candidates can be compared later.
    with mlflow.start_run(run_name=clf_name):
        print(f"{clf_name} created successfully".upper())
        tfidf_clf_pipe = Pipeline([('vect', tfidf), ('clf', clf)])
        # 3-fold grid search over the vectorizer settings in `param_grid`.
        tfidf_clf_pipe_gs = GridSearchCV(tfidf_clf_pipe,
                                         param_grid,
                                         scoring='accuracy',
                                         verbose=0,
                                         cv=3,
                                         n_jobs=-1)
        tfidf_clf_pipe_gs.fit(texts_train, y_train)
        best_model = tfidf_clf_pipe_gs.best_estimator_
        # Keyed by name (not by the estimator object) so it lines up with
        # `accuracy_dict` and can be looked up with the same key.
        best_models[clf_name] = best_model
        y_pred = best_model.predict(texts_test)
        acc = accuracy_score(y_test, y_pred)
        accuracy_dict[clf_name] = acc
        # Record the outcome in the run; without this the run is empty.
        mlflow.log_param('classifier', clf_name)
        mlflow.log_metric('cv_accuracy', tfidf_clf_pipe_gs.best_score_)
        mlflow.log_metric('test_accuracy', acc)
        print('classifier algorithm = %s' % clf_name)
        print("Number of mislabeled points out of a total %d points : %d" % (
            len(texts_test), (y_test != y_pred).sum()))
        print('Test Accuracy: %.3f' % acc)

LR CREATED SUCCESSFULLY
classifier algorithm = lr
Number of mislabeled points out of a total 6393 points : 333
Test Accuracy: 0.948
SGD CREATED SUCCESSFULLY
classifier algorithm = sgd
Number of mislabeled points out of a total 6393 points : 395
Test Accuracy: 0.938
RF CREATED SUCCESSFULLY
classifier algorithm = rf
Number of mislabeled points out of a total 6393 points : 277
Test Accuracy: 0.957
XGB CREATED SUCCESSFULLY
classifier algorithm = xgb
Number of mislabeled points out of a total 6393 points : 318
Test Accuracy: 0.950
TREE CREATED SUCCESSFULLY
classifier algorithm = tree
Number of mislabeled points out of a total 6393 points : 355
Test Accuracy: 0.944


### Load the best model

In [6]:
# Load the pickled classifier and the pickled vectorizer from the working
# directory. Context managers close the file handles as soon as loading ends.
# WARNING: pickle.load can execute arbitrary code; only load files you trust.
# NOTE(review): no visible cell writes 'best_pipe.pkl' or 'tfidf.pkl' (the
# execution counter jumps from 4 to 6) - confirm how these files are produced.
with open('best_pipe.pkl', 'rb') as model_file:
    final_model = pickle.load(model_file)
with open('tfidf.pkl', 'rb') as transformer_file:
    transformer = pickle.load(transformer_file)

### Predict

In [11]:
def classify(tweet):
    """Label a single raw tweet and report the model's confidence.

    The tweet is cleaned with ``preprocessor``, vectorized with the
    module-level ``transformer`` and scored with the module-level
    ``final_model``.

    Args:
        tweet: The raw tweet text.

    Returns:
        A ``(label, probability)`` tuple: ``label`` is ``'NON TOXIC'`` for
        predicted class 0 and ``'TOXIC'`` for predicted class 1;
        ``probability`` is the largest value returned by ``predict_proba``.
    """
    class_names = {0: 'NON TOXIC', 1: 'TOXIC'}
    # The transformer expects a collection of documents, so wrap the single
    # cleaned tweet in a one-element list.
    features = transformer.transform([preprocessor(tweet)])
    predicted_class = final_model.predict(features)[0]
    confidence = np.max(final_model.predict_proba(features))
    return class_names[predicted_class], confidence

In [12]:
# Example call on a single raw tweet (the recorded output below labels it TOXIC).
classify("@user #cnn calls #michigan middle school 'build the wall' chant '' #tcot")

('TOXIC', 0.95)

In [13]:
# Example call on a single raw tweet (the recorded output below labels it NON TOXIC).
# NOTE(review): the 'ð' in the text looks like a mis-decoded emoji - confirm
# against the original data; it is left untouched because it is model input.
classify("its #friday! ð smiles all around via ig user: @user #cookies make people")

('NON TOXIC', 1.0)