Packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb

from nltk.corpus import stopwords
from nltk import word_tokenize, regexp_tokenize, FreqDist
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud
import string
import re

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import plot_confusion_matrix, recall_score, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline

import warnings
warnings.filterwarnings('ignore')

Text classification function for model creation

In [None]:
def text_classification(model):
    """
    Fit and evaluate a TF-IDF -> SMOTE -> classifier pipeline.

    The function relies on the notebook-level variables ``X_train``,
    ``y_train``, ``X_test`` and ``y_test``; they must be defined before
    it is called.

    Steps performed:
      1. Build an imbalanced-learn pipeline that vectorizes the text
         with TF-IDF, oversamples with SMOTE and then fits ``model``.
      2. Cross-validate the pipeline on the training data (using the
         ``cross_val_score`` defaults) and print each fold's score and
         their mean.
      3. Refit the pipeline on the full training data, predict on the
         test data, and print the micro-averaged recall and F1 scores.
      4. Draw a confusion matrix for the test data.

    Parameters
    ----------
    model : estimator
        An unfitted scikit-learn compatible classifier, used as the
        final step of the pipeline.

    Returns
    -------
    pipeline
        The pipeline after being fitted on ``X_train`` / ``y_train``.
        The scores are printed, not returned.
    """
    tfidf_vectorizer = TfidfVectorizer()
    # 'not majority' asks SMOTE to resample every class except the largest.
    smote = SMOTE(sampling_strategy='not majority')

    # The imblearn pipeline (not sklearn's) is required here because it
    # accepts a sampler as an intermediate step.
    pipeline = make_pipeline(tfidf_vectorizer, smote, model)

    scores = cross_val_score(pipeline, X_train, y_train)
    print('Cross-validated scores:', scores.round(3))
    print('Average CV score:', np.mean(scores).round(3))

    pipeline.fit(X_train, y_train)
    y_preds = pipeline.predict(X_test)

    # Built-in round() works whether the metric comes back as a NumPy
    # scalar or as a plain Python float (which has no .round() method).
    recall = recall_score(y_test, y_preds, average='micro')
    f1 = f1_score(y_test, y_preds, average='micro')
    print('Recall score:', round(recall, 3))
    print('F1 score:', round(f1, 3))

    # plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed
    # in 1.2; ConfusionMatrixDisplay.from_estimator is its replacement.
    # Prefer the new API and fall back to the legacy helper only when the
    # installed scikit-learn predates from_estimator.
    # NOTE(review): the notebook's top-level import of plot_confusion_matrix
    # will itself fail on scikit-learn >= 1.2 and should be updated too.
    from sklearn.metrics import ConfusionMatrixDisplay
    if hasattr(ConfusionMatrixDisplay, 'from_estimator'):
        ConfusionMatrixDisplay.from_estimator(pipeline, X_test, y_test)
    else:
        plot_confusion_matrix(pipeline, X_test, y_test)

    return pipeline