This procect's purpose is to build a machine learning model predicting sentiment of a tweet ragarding COVID-19 pandemic, using both "classical" machine learning (like logistic regression ect.) and deep learning methods.

The dataset used in this notebook comes from here: https://www.kaggle.com/datatattle/covid-19-nlp-text-classification
<br>It was collected and manually tagged by a Kaggle user named Aman Miglani.  

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('seaborn')
import seaborn as sns
import re
from string import punctuation
import nltk
from nltk.corpus import stopwords, words
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

In [2]:
STOPWORDS = set(stopwords.words('english'))
ENGLISH_WORDS = set(words.words())
df_train = pd.read_csv(r"data\Corona_NLP_train.csv", encoding='latin1')
df_test = pd.read_csv(r"data\Corona_NLP_test.csv", encoding='latin1')

In [3]:
df_train.head(10)

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative
5,3804,48756,"ÃT: 36.319708,-82.363649",16-03-2020,As news of the regionÂs first confirmed COVID...,Positive
6,3805,48757,"35.926541,-78.753267",16-03-2020,Cashier at grocery store was sharing his insig...,Positive
7,3806,48758,Austria,16-03-2020,Was at the supermarket today. Didn't buy toile...,Neutral
8,3807,48759,"Atlanta, GA USA",16-03-2020,Due to COVID-19 our retail store and classroom...,Positive
9,3808,48760,"BHAVNAGAR,GUJRAT",16-03-2020,"For corona prevention,we should stop to buy th...",Negative


In [4]:
print("Size of the train dataset: {}".format(df_train.shape))
print("Size of the test dataset: {}".format(df_test.shape))

Size of the train dataset: (41157, 6)
Size of the test dataset: (3798, 6)


Usually three unique sentiment values are just enough, so I will recode the target variable to such shape.

In [5]:
def recode_sentiment(y):

    if y in ['Extremely Positive', 'Positive']:
        return 'Positive'
    elif y in ['Extremely Negative', 'Negative']:
        return 'Negative'
    else:
        return 'Neutral'

In [6]:
df_train['Sentiment'] = df_train['Sentiment'].apply(lambda x: recode_sentiment(x))
df_test['Sentiment'] = df_test['Sentiment'].apply(lambda x: recode_sentiment(x))

In [7]:
df_train.head(10)

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Negative
5,3804,48756,"ÃT: 36.319708,-82.363649",16-03-2020,As news of the regionÂs first confirmed COVID...,Positive
6,3805,48757,"35.926541,-78.753267",16-03-2020,Cashier at grocery store was sharing his insig...,Positive
7,3806,48758,Austria,16-03-2020,Was at the supermarket today. Didn't buy toile...,Neutral
8,3807,48759,"Atlanta, GA USA",16-03-2020,Due to COVID-19 our retail store and classroom...,Positive
9,3808,48760,"BHAVNAGAR,GUJRAT",16-03-2020,"For corona prevention,we should stop to buy th...",Negative


In [8]:
def remove_url(string):
    return re.sub(r'https?://\S+|www\.\S+', '', string)

def remove_html(string):
    return re.sub(r'<.*?>', '', string)

def remove_numbers(string):
    return re.sub(r'\d+', '', string)

def remove_mentions(string):
    return re.sub(r'@\w+', '', string)

def remove_hashtags(string):
    return re.sub(r'#\w+', '', string)

def clean_data(tweet, return_tokenized=True):
    
    # Tokenization
    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(tweet)
    
    cleaned_tweet = []
    
    for token, tag in pos_tag(tokens):
        
        # Cleaning tokens with regular expressions
        token = remove_url(token)
        token = remove_html(token)
        token = remove_numbers(token)
        token = remove_mentions(token)
        token = remove_hashtags(token)
        
        # Lemmatizing tokens with part of speech recognition
        
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        
        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)
        
        token = token.lower()
        
        if token not in punctuation and token not in STOPWORDS and token in ENGLISH_WORDS:
            cleaned_tweet.append(token)
    #TfidfVectorizer accepts strings instead of lists of tokens
    if not return_tokenized:
        cleaned_tweet = ' '.join([token for token in cleaned_tweet])

    return cleaned_tweet

In [9]:
df_train['OriginalTweet'] = df_train['OriginalTweet'].apply(lambda x: clean_data(x, return_tokenized=False))
df_test['OriginalTweet'] = df_test['OriginalTweet'].apply(lambda x: clean_data(x, return_tokenized=False))

In [10]:
df_train.head(10)

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,,Neutral
1,3800,48752,UK,16-03-2020,advice talk family exchange phone number creat...,Positive
2,3801,48753,Vagabonds,16-03-2020,give elderly disable dedicate shopping hour am...,Positive
3,3802,48754,,16-03-2020,food stock one empty please panic enough food ...,Positive
4,3803,48755,,16-03-2020,ready go supermarket outbreak paranoid food st...,Negative
5,3804,48756,"ÃT: 36.319708,-82.363649",16-03-2020,news first confirm covid case come county last...,Positive
6,3805,48757,"35.926541,-78.753267",16-03-2020,cashier grocery store share insight prove cred...,Positive
7,3806,48758,Austria,16-03-2020,supermarket today buy toilet paper,Neutral
8,3807,48759,"Atlanta, GA USA",16-03-2020,due covid retail store classroom open business...,Positive
9,3808,48760,"BHAVNAGAR,GUJRAT",16-03-2020,corona prevention stop buy thing cash use paym...,Negative


In [11]:
df_train['NumberOfWords'] = df_train['OriginalTweet'].apply(lambda x: len(x.split()))
df_test['NumberOfWords'] = df_test['OriginalTweet'].apply(lambda x: len(x.split()))

In [12]:
df_train.head(10)

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment,NumberOfWords
0,3799,48751,London,16-03-2020,,Neutral,0
1,3800,48752,UK,16-03-2020,advice talk family exchange phone number creat...,Positive,22
2,3801,48753,Vagabonds,16-03-2020,give elderly disable dedicate shopping hour am...,Positive,9
3,3802,48754,,16-03-2020,food stock one empty please panic enough food ...,Positive,15
4,3803,48755,,16-03-2020,ready go supermarket outbreak paranoid food st...,Negative,14
5,3804,48756,"ÃT: 36.319708,-82.363649",16-03-2020,news first confirm covid case come county last...,Positive,22
6,3805,48757,"35.926541,-78.753267",16-03-2020,cashier grocery store share insight prove cred...,Positive,12
7,3806,48758,Austria,16-03-2020,supermarket today buy toilet paper,Neutral,5
8,3807,48759,"Atlanta, GA USA",16-03-2020,due covid retail store classroom open business...,Positive,20
9,3808,48760,"BHAVNAGAR,GUJRAT",16-03-2020,corona prevention stop buy thing cash use paym...,Negative,19


In [13]:
df_train = df_train.loc[df_train['NumberOfWords'] > 0,]
df_test = df_test.loc[df_test['NumberOfWords'] > 0,]

In [14]:
df_train.head(10)

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment,NumberOfWords
1,3800,48752,UK,16-03-2020,advice talk family exchange phone number creat...,Positive,22
2,3801,48753,Vagabonds,16-03-2020,give elderly disable dedicate shopping hour am...,Positive,9
3,3802,48754,,16-03-2020,food stock one empty please panic enough food ...,Positive,15
4,3803,48755,,16-03-2020,ready go supermarket outbreak paranoid food st...,Negative,14
5,3804,48756,"ÃT: 36.319708,-82.363649",16-03-2020,news first confirm covid case come county last...,Positive,22
6,3805,48757,"35.926541,-78.753267",16-03-2020,cashier grocery store share insight prove cred...,Positive,12
7,3806,48758,Austria,16-03-2020,supermarket today buy toilet paper,Neutral,5
8,3807,48759,"Atlanta, GA USA",16-03-2020,due covid retail store classroom open business...,Positive,20
9,3808,48760,"BHAVNAGAR,GUJRAT",16-03-2020,corona prevention stop buy thing cash use paym...,Negative,19
10,3809,48761,"Makati, Manila",16-03-2020,month crowd supermarket restaurant however red...,Neutral,16


In [15]:
print("Size of the train dataset: {}".format(df_train.shape))
print("Size of the test dataset: {}".format(df_test.shape))

Size of the train dataset: (41052, 7)
Size of the test dataset: (3792, 7)


In [16]:
df_train.drop('NumberOfWords', axis=1, inplace=True)
df_test.drop('NumberOfWords', axis=1, inplace=True)

In [17]:
y_mapping = {'Negative':0, 'Neutral':1, 'Positive':2}
df_train['Sentiment'] = df_train['Sentiment'].map(y_mapping).astype('category')
df_test['Sentiment'] = df_test['Sentiment'].map(y_mapping).astype('category')

In [18]:
df_train.head(10)

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
1,3800,48752,UK,16-03-2020,advice talk family exchange phone number creat...,2
2,3801,48753,Vagabonds,16-03-2020,give elderly disable dedicate shopping hour am...,2
3,3802,48754,,16-03-2020,food stock one empty please panic enough food ...,2
4,3803,48755,,16-03-2020,ready go supermarket outbreak paranoid food st...,0
5,3804,48756,"ÃT: 36.319708,-82.363649",16-03-2020,news first confirm covid case come county last...,2
6,3805,48757,"35.926541,-78.753267",16-03-2020,cashier grocery store share insight prove cred...,2
7,3806,48758,Austria,16-03-2020,supermarket today buy toilet paper,1
8,3807,48759,"Atlanta, GA USA",16-03-2020,due covid retail store classroom open business...,2
9,3808,48760,"BHAVNAGAR,GUJRAT",16-03-2020,corona prevention stop buy thing cash use paym...,0
10,3809,48761,"Makati, Manila",16-03-2020,month crowd supermarket restaurant however red...,1


In [19]:
y_train, y_test = df_train['Sentiment'].copy(), df_test['Sentiment'].copy()

In [20]:
vectorizer = TfidfVectorizer(encoding='latin1', stop_words='english', min_df=5, max_features=200)
X_train_tfidf = vectorizer.fit_transform(df_train['OriginalTweet'])
X_test_tfidf = vectorizer.transform(df_test['OriginalTweet'])
X_train_tfidf = pd.DataFrame(X_train_tfidf.toarray(), columns=vectorizer.get_feature_names())
X_test_tfidf = pd.DataFrame(X_test_tfidf.toarray(), columns=vectorizer.get_feature_names())

In [21]:
def evaluate_model(model, X_tr, X_val, y_tr=y_train, y_val=y_test):
    
    train_accuracy = accuracy_score( y_tr, model.predict(X_tr) )
    valid_accuracy = accuracy_score( y_val, model.predict(X_val) )
    
    return {'Train accuracy': train_accuracy, 'Validation accuracy': valid_accuracy}

In [22]:
logistic = LogisticRegression()
naive_bayes = MultinomialNB()
forest = RandomForestClassifier()

In [23]:
for model in (logistic, naive_bayes, forest):
    model.fit(X_train_tfidf, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [24]:
for model in (logistic, naive_bayes, forest):
    print('-'*40)
    print(model.__class__.__name__)
    print(evaluate_model(model=model, X_tr=X_train_tfidf, X_val=X_test_tfidf))
    print()

----------------------------------------
LogisticRegression
{'Train accuracy': 0.6339764201500536, 'Validation accuracy': 0.6215717299578059}

----------------------------------------
MultinomialNB
{'Train accuracy': 0.586987235701062, 'Validation accuracy': 0.5851793248945147}

----------------------------------------
RandomForestClassifier
{'Train accuracy': 0.9425850141284224, 'Validation accuracy': 0.5970464135021097}



In [25]:
cv = StratifiedKFold(n_splits=5)
random_forest_grid = {
    'n_estimators':[100, 200, 300],
    'max_depth':np.arange(5, 10)
}

In [33]:
forest2 = RandomForestClassifier()
forest_tune = GridSearchCV(forest2, cv=cv, param_grid=random_forest_grid,
scoring='accuracy', verbose=5)

forest_tune.fit(X_train_tfidf, y_train)

Fitting 5 folds for each of 15 candidates, totalling 75 fits
[CV 1/5] END ..................max_depth=5, n_estimators=100; total time=   2.3s
[CV 2/5] END ..................max_depth=5, n_estimators=100; total time=   2.5s
[CV 3/5] END ..................max_depth=5, n_estimators=100; total time=   2.6s
[CV 4/5] END ..................max_depth=5, n_estimators=100; total time=   2.4s
[CV 5/5] END ..................max_depth=5, n_estimators=100; total time=   2.4s
[CV 1/5] END ..................max_depth=5, n_estimators=200; total time=   4.8s
[CV 2/5] END ..................max_depth=5, n_estimators=200; total time=   4.7s
[CV 3/5] END ..................max_depth=5, n_estimators=200; total time=   4.7s
[CV 4/5] END ..................max_depth=5, n_estimators=200; total time=   4.7s
[CV 5/5] END ..................max_depth=5, n_estimators=200; total time=   4.8s
[CV 1/5] END ..................max_depth=5, n_estimators=300; total time=   7.1s
[CV 2/5] END ..................max_depth=5, n_es

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=RandomForestClassifier(),
             param_grid={'max_depth': array([5, 6, 7, 8, 9]),
                         'n_estimators': [100, 200, 300]},
             scoring='accuracy', verbose=5)

In [34]:
print(evaluate_model(model=forest_tune, X_tr=X_train_tfidf, X_val=X_test_tfidf))

{'Train accuracy': 0.5709587839812921, 'Validation accuracy': 0.5485232067510548}


The results aren't satisfying. Trying diffrent data preprocessing and vectorizing strategies is necessary.