In [1]:
import pandas as pd
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import (
    cross_validate,
    train_test_split,
)
from sklearn.metrics import f1_score
from sklearn.naive_bayes import MultinomialNB

from joblib import dump


# ML model for predicting sentiment of tweets

The data (the Sentiment140 tweet dataset) can be downloaded from [Kaggle](https://www.kaggle.com/kazanova/sentiment140).

In [2]:
# Read in data
# The path is relative, so the notebook must be run from a working directory
# that contains the `data/` folder.
data = pd.read_csv('data/twitter_sentiment.csv')

In [3]:
# Preview the first rows to check which columns were loaded.
data.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [4]:
# Clean up data
# Drop the metadata columns, coerce the label column to a numeric dtype,
# and recode label 4 as 1. The result is a new frame; `data` is left untouched.
tweet_df = (
    data
    .drop(columns=["id", "date", "flag", "user"])
    .assign(target=lambda frame: pd.to_numeric(frame["target"]))
    .replace({'target': {4: 1}})
)

tweet_df

Unnamed: 0,target,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."
...,...,...
1599995,1,Just woke up. Having no school is the best fee...
1599996,1,TheWDB.com - Very cool to hear old Walt interv...
1599997,1,Are you ready for your MoJo Makeover? Ask me f...
1599998,1,Happy 38th Birthday to my boo of alll time!!! ...


In [5]:
# Split data
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible across runs.
train, test = train_test_split(
    tweet_df,
    test_size=0.2,
    random_state=2021,
)
(train.shape, test.shape)

((1280000, 2), (320000, 2))

In [6]:
# Features are the raw tweet text; labels are the recoded `target` column.
X_train, X_test = train["text"], test["text"]
y_train, y_test = train["target"], test["target"]

In [7]:
# Collects one cross-validation summary per model, keyed by model name.
results_dict = dict()

I will compare two models (logistic regression and Naive Bayes) and use the better of the two in the workflow for this application.

In [8]:
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Run cross-validation and summarise every returned metric as
    a "mean (+/- std)" string.

    Parameters
    ----------
    model :
        scikit-learn model
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data
    **kwargs :
        Extra keyword arguments forwarded unchanged to ``cross_validate``
        (for example ``return_train_score=True``).

    Returns
    ----------
        pandas Series of strings, indexed by the keys of the
        ``cross_validate`` result; each entry is the mean and standard
        deviation across folds, formatted to three decimals.
    """

    # Build the per-fold table once and derive both statistics from it.
    scores = pd.DataFrame(cross_validate(model, X_train, y_train, **kwargs))

    mean_scores = scores.mean()
    std_scores = scores.std()

    # Both Series share the same (string) index order, so pair their values by
    # iteration. Looking them up with an integer key (`mean_scores[i]`) relies
    # on positional fallback for a label-indexed Series, which pandas has
    # deprecated.
    out_col = [
        "%0.3f (+/- %0.3f)" % (mean, std)
        for mean, std in zip(mean_scores, std_scores)
    ]

    return pd.Series(data=out_col, index=mean_scores.index)

In [9]:
# Naive Bayes pipeline
# Raw token counts (default CountVectorizer settings) feeding a multinomial
# Naive Bayes classifier.
pipe_nb = make_pipeline(
    CountVectorizer(),
    MultinomialNB(),
)

# Cross-validate on the training split only and store the formatted summary.
results_dict["Naive Bayes"] = mean_std_cross_val_scores(
    model=pipe_nb,
    X_train=X_train,
    y_train=y_train,
    return_train_score=True,
)

In [10]:
# Logistic Regression pipeline
# TF-IDF features capped at 1000 vocabulary terms, with English stop words
# removed, feeding a logistic regression classifier with default settings.
pipe_lr = make_pipeline(
    TfidfVectorizer(
        lowercase=True,
        max_features=1000,
        # "english" selects scikit-learn's built-in list (the same words as
        # ENGLISH_STOP_WORDS). Passing the frozenset object itself is rejected
        # by parameter validation in recent scikit-learn releases, which
        # accept only "english", a list, or None here.
        stop_words="english",
    ),
    LogisticRegression(),
)

# Cross-validate on the training split only and store the formatted summary.
results_dict["Logistic Regression"] = mean_std_cross_val_scores(
    pipe_lr, X_train, y_train, return_train_score=True
)

In [11]:
# One column per model, one row per cross-validation metric ("mean (+/- std)").
pd.DataFrame(results_dict)

Unnamed: 0,Naive Bayes,Logistic Regression
fit_time,10.458 (+/- 0.177),16.289 (+/- 0.869)
score_time,2.330 (+/- 0.010),2.124 (+/- 0.029)
test_score,0.780 (+/- 0.000),0.741 (+/- 0.001)
train_score,0.835 (+/- 0.000),0.742 (+/- 0.000)


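We see that the Naive Bayes model performs better than the logistic regression: its mean cross-validation test score is 0.780 versus 0.741. Note that the two pipelines also use different text features (unrestricted word counts versus TF-IDF limited to 1,000 features with stop words removed), so this compares the pipelines as a whole rather than the classifiers alone.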

In [12]:
# Refit the selected pipeline on the full training split.
pipe_nb.fit(X_train, y_train)

# Sanity check: classify two hand-picked tweets that are not part of the dataset.
test_tweets = [
    "The hard truth about the United States is that the money other countries spend on health and infrastructure, we spend on war.",
    "frank ocean is wearing rainbow colored shoes rn and a fan said they were cool and he replied 'thanks! i found them somewhere in the closet'",
]
pipe_nb.predict(test_tweets)

array([0, 1])

Save the fitted pipeline object with the joblib library so it can be reloaded later without retraining.

In [13]:
# Serialise the fitted pipeline to a file in the current working directory.
# NOTE(review): joblib files are pickle-based; only load them from trusted sources.
dump(pipe_nb, filename="tweet_classification.joblib")

['tweet_classification.joblib']