# Part 3: Mining Text Data

This part uses the Coronavirus Tweets NLP data set from Kaggle (https://www.kaggle.com/datatattle/covid-19-nlp-text-classification) to predict the sentiment of Tweets relevant to Covid. The data set (Corona NLP test.csv file) contains 6 attributes:

    UserName: Anonymized attribute
    ScreenName: Anonymized attribute
    Location: Location of the person having made the tweet
    TweetAt: Date
    OriginalTweet: Textual content of the tweet
    Sentiment: Emotion of the tweet

Because this is quite a big data set, existing vectorized (pandas) functions have been particularly useful for performing the various tasks efficiently on a typical personal computer. This way, you will be able to run the code in a few seconds. Otherwise, running the code might require a significant amount of time, e.g. when for-loops are used to access every element of the data set.

You need to install the nltk package to run the following code. You can do so by running the following command in your terminal:

    pip install nltk

In [1]:
import numpy as np
import pandas as pd
# import re
import urllib
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import sklearn.naive_bayes as nb
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter

# Load the tweets data set into a pandas dataframe and return it.
# data_file is the path (or file-like object) of the CSV to read,
# e.g. 'data/coronavirus_tweets.csv'; it is decoded as 'latin-1'.
def read_csv_3(data_file):
    return pd.read_csv(data_file, encoding='latin-1')

# Return the distinct values of the 'Sentiment' column, i.e. the possible
# sentiments a tweet might have. The result is the array produced by pandas
# (one entry per distinct label), not a plain Python list.
def get_sentiments(df):
    sentiment_column = df['Sentiment']
    return pd.unique(sentiment_column)

# Return the sentiment label (string) with the second highest number of tweets.
def second_most_popular_sentiment(df):
    # value_counts() orders the labels by descending frequency,
    # so position 1 of its index is the runner-up.
    ranked_labels = df['Sentiment'].value_counts().index
    return ranked_labels[1]

# Return the date (as the string stored in the 'TweetAt' column) on which the
# largest number of 'Extremely Positive' tweets were posted.
def date_most_popular_tweets(df):
    is_extremely_positive = df['Sentiment'] == 'Extremely Positive'
    dates = df.loc[is_extremely_positive, 'TweetAt']
    # mode() returns the most frequent value(s); take the first one.
    top_date = dates.mode().iloc[0]
    return str(top_date)

# Convert every tweet in the 'OriginalTweet' column to lower case.
# Each value is passed through str() first, so non-string entries are
# turned into their string form. df is modified in place and returned.
def lower_case(df):
    tweets_as_text = map(str, df['OriginalTweet'])
    df['OriginalTweet'] = list(map(str.lower, tweets_as_text))
    return df

# Replace, in every tweet, each character that is not an ASCII letter
# (digits, punctuation, and also whitespace such as tabs or newlines)
# with a single space. df is modified in place and returned.
def remove_non_alphabetic_chars(df):
    tweets = df["OriginalTweet"]
    letters_only = tweets.str.replace(pat='[^a-zA-Z]', repl=' ', regex=True)
    df["OriginalTweet"] = letters_only
    return df

# Collapse every run of one or more consecutive space characters in each
# tweet into a single space. Only the space character is matched.
# df is modified in place and returned.
def remove_multiple_consecutive_whitespaces(df):
    tweets = df["OriginalTweet"]
    single_spaced = tweets.str.replace(pat=' +', repl=' ', regex=True)
    df["OriginalTweet"] = single_spaced
    return df

# Given a dataframe where each tweet is one string with words separated by whitespace,
# tokenize every tweet by converting it into a list of words (strings).
# A tweet that is already a list of tokens is left unchanged, so calling this
# function again on a tokenized dataframe is harmless (previously the list was
# turned into its string representation and split, producing tokens like "['a',").
# Any other non-string value is converted with str() before splitting.
# df is modified in place and returned.
def tokenize(df):
    def _to_tokens(tweet):
        if isinstance(tweet, list):
            return tweet  # already tokenized: keep as is
        return str(tweet).split()

    df['OriginalTweet'] = [_to_tokens(tweet) for tweet in df['OriginalTweet']]
    return df

# Given dataframe tdf with the tweets tokenized, return the total number of
# tokens over all tweets (every occurrence of a word is counted).
def count_words_with_repetitions(tdf):
    return sum(map(len, tdf['OriginalTweet']))

# Given dataframe tdf with the tweets tokenized, return how many different
# words occur across all tweets (each word is counted once).
def count_words_without_repetitions(tdf):
    vocabulary = {word for tokens in tdf['OriginalTweet'] for word in tokens}
    return len(vocabulary)

# Given dataframe tdf with the tweets tokenized, return a list with the k
# distinct words that occur most often in the tweets, most frequent first.
def frequent_words(tdf,k):
    tally = Counter()
    for tokens in tdf['OriginalTweet']:
        tally.update(tokens)
    return [word for word, _count in tally.most_common(k)]

# Given dataframe tdf with the tweets tokenized, remove stop words and words with <=2 characters from each tweet.
# By default the list of stop words is downloaded from:
# https://raw.githubusercontent.com/fozziethebeat/S-Space/master/data/english-stop-words-large.txt
# Optionally, stop_words can be given as an iterable of words, in which case
# nothing is downloaded. tdf is modified in place and returned.
_STOP_WORDS_URL = 'https://raw.githubusercontent.com/fozziethebeat/S-Space/master/data/english-stop-words-large.txt'

# Download the stop-word file (one word per line) and return the words as a set.
def _download_stop_words(url=_STOP_WORDS_URL):
    # A bare `import urllib` does not guarantee that the `request` submodule is loaded.
    import urllib.request
    # The context manager closes the HTTP response once the file has been read.
    with urllib.request.urlopen(url) as response:
        # strip() drops the line terminator ('\n' or '\r\n') and stray spaces.
        return {raw_line.decode("utf-8").strip() for raw_line in response}

def remove_stop_words(tdf, stop_words=None):
    if stop_words is None:
        stop_words = _download_stop_words()
    # A set gives constant-time membership tests; the word list is large and
    # is consulted once per token.
    stop_words = set(stop_words)

    tdf['OriginalTweet'] = tdf['OriginalTweet'].apply(
        lambda tokens: [word for word in tokens if len(word) > 2 and word not in stop_words])
    return tdf

# Given dataframe tdf with the tweets tokenized, reduce each word in every tweet to its stem
# using NLTK's Porter stemmer. tdf is modified in place and returned.
def stemming(tdf):
    # One stemmer instance is reused for all words instead of building a new
    # PorterStemmer object for every single token.
    stemmer = PorterStemmer()
    tdf['OriginalTweet'] = tdf['OriginalTweet'].apply(
        lambda tokens: [stemmer.stem(word) for word in tokens])
    return tdf

# Given a pandas dataframe df with the original coronavirus_tweets.csv data set,
# build a Multinomial Naive Bayes classifier.
# Return predicted sentiments (e.g. 'Neutral', 'Positive') for the training set
# as a 1d array (numpy.ndarray).
# The tweets are tokenized and stemmed on a private copy, so the caller's
# dataframe is left untouched and can be passed in again or reused afterwards.
def mnb_predict(df):
    labels = df['Sentiment']

    # Preprocess a copy: tokenize() and stemming() modify the frame they receive.
    tweets = df[['OriginalTweet']].copy()
    tweets = stemming(tokenize(tweets))
    # CountVectorizer expects one string per document, so join the tokens back.
    documents = tweets['OriginalTweet'].apply(lambda tokens: " ".join(tokens))

    # Features are counts of word n-grams with n between 4 and 6.
    vectorizer = CountVectorizer(analyzer='word', ngram_range=(4,6))
    features = vectorizer.fit_transform(documents)

    classifier = nb.MultinomialNB()
    classifier.fit(features, labels)
    # Predictions are made on the same data the model was fitted on.
    return classifier.predict(features)

# Given a 1d array (numpy.ndarray) y_pred with predicted labels (e.g. 'Neutral', 'Positive')
# by a classifier and another 1d array y_true with the true labels,
# return the classification accuracy rounded in the 3rd decimal digit.
# Labels are compared position by position, so y_true/y_pred may be lists,
# numpy arrays or pandas Series (whatever their index looks like).
# Raises ValueError if the two inputs differ in length or are empty.
def mnb_accuracy(y_pred,y_true):
    # Materialise both inputs so the comparison is purely positional; indexing a
    # pandas Series with y_true[i] would look up the *label* i instead.
    predicted = list(y_pred)
    actual = list(y_true)
    if len(predicted) != len(actual):
        raise ValueError("y_pred and y_true must have the same length")
    if not actual:
        raise ValueError("cannot compute the accuracy of empty label arrays")

    hits = sum(1 for guess, truth in zip(predicted, actual) if guess == truth)
    return round(hits / len(actual), 3)

In [2]:
# Running all functions
# feel free to comment out lines you don't want to run

DATA_FILE = 'data/coronavirus_tweets.csv'

df = read_csv_3(DATA_FILE)
# print(df)
print("second_most_popular_sentiment:", second_most_popular_sentiment(df))
print("get_sentiments:", get_sentiments(df))
print("date_most_popular_tweets:", date_most_popular_tweets(df))
# print("lower_case:", lower_case(df))
# print("remove_non_alphabetic_chars:", remove_non_alphabetic_chars(df))
# print("remove_multiple_consecutive_whitespaces:", remove_multiple_consecutive_whitespaces(df))
# NOTE: tokenize() works in place, so from here on df and tdf are the same
# (tokenized) dataframe.
tdf=tokenize(df)
print("count_words_with_repetitions:", count_words_with_repetitions(tdf))
print("count_words_without_repetitions:", count_words_without_repetitions(tdf))
print("frequent_words:", frequent_words(tdf,10))
print("remove_stop_words:", remove_stop_words(tdf))

print("stemming:", stemming(tdf))
# mnb_predict() expects the original (untokenized) data set and does its own
# tokenizing and stemming. df has already been tokenized, filtered and stemmed
# in place above, so a fresh copy of the data is loaded for the classifier.
raw_df = read_csv_3(DATA_FILE)
y_pred = mnb_predict(raw_df)
print("y_pred:", y_pred)
print("mnb_accuracy:", mnb_accuracy(y_pred, raw_df['Sentiment'].tolist()))

second_most_popular_sentiment: Negative
get_sentiments: ['Neutral' 'Positive' 'Extremely Negative' 'Negative' 'Extremely Positive']
date_most_popular_tweets: 25-03-2020
count_words_with_repetitions: 1255301
count_words_without_repetitions: 136386
frequent_words: ['the', 'to', 'and', 'of', 'a', 'in', 'for', '#coronavirus', 'is', 'are']
remove_stop_words:        UserName  ScreenName                      Location     TweetAt  \
0          3799       48751                        London  16-03-2020   
1          3800       48752                            UK  16-03-2020   
2          3801       48753                     Vagabonds  16-03-2020   
3          3802       48754                           NaN  16-03-2020   
4          3803       48755                           NaN  16-03-2020   
...         ...         ...                           ...         ...   
41152     44951       89903  Wellington City, New Zealand  14-04-2020   
41153     44952       89904                           NaN  1