In [1]:
import re
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from wordcloud import WordCloud
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, RegexpTokenizer
nltk.download('stopwords')
nltk.download('vader_lexicon')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/xuruoxin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/xuruoxin/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [83]:
# General Imports 
import pandas as pd
import numpy as np
import re
import random
import matplotlib.pyplot as plt
from typing import List
import time

import warnings
warnings.filterwarnings('ignore')

In [81]:
# NLTK Imports 
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk import SnowballStemmer
nltk.download('stopwords')
nltk.download('vader_lexicon')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/xuruoxin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/xuruoxin/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [12]:
import ipywidgets as widgets

import matplotlib.pyplot as plt
import numpy as np


In [114]:
# Read the tweet dataset CSV (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='covid2020.csv')

In [115]:
# Basic cleaning, expressed as one pipeline:
#   1. keep a single row per distinct value of `text`
#   2. discard rows whose `user_followers` is exactly 0
#   3. discard rows that contain any missing value
#   4. renumber the index from 0
df = (
    df.drop_duplicates(subset='text')
    .loc[lambda frame: frame['user_followers'] != 0]
    .dropna()
    .reset_index(drop=True)
)


In [116]:
# preprocess the text column and do text cleaning(most common methods are covered)

# Contraction rewrites, applied in this order: the two irregular forms first,
# then the generic suffixes.
_CONTRACTION_RULES = (
    (r"can\'t", "can not"),
    (r"won\'t", "will not"),
    (r"n\'t", " not"),
    (r"\'re", " are"),
    (r"\'s", " is"),
    (r"\'d", " would"),
    (r"\'ll", " will"),
    (r"\'t", " not"),
    (r"\'ve", " have"),
    (r"\'m", " am"),
)

# Negation words are exempt from stop-word removal. Before the step order was
# fixed, contracted negations survived in the output (as "cant", "dont", ...);
# now that they are expanded to "... not", dropping "not" would silently erase
# that signal from the text handed to the sentiment scorer.
_NEGATION_WORDS = frozenset({"no", "nor", "not"})

# Filled on first use so the NLTK corpus is read once, not once per row.
_STOP_WORDS_CACHE = None


def _stop_words_to_remove():
    """Return the cached English stop-word set, minus the negation words."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words("english")) - _NEGATION_WORDS
    return _STOP_WORDS_CACHE


def preprocess_text(text):
    """Normalise one tweet string for downstream text analysis.

    Steps, in order:
      1. lowercase
      2. remove URLs (anything starting with ``http``)
      3. expand common English contractions
      4. remove ``:shortcode:`` style tokens
      5. remove non-ASCII characters (emojis, accented letters, ...)
      6. remove digits
      7. remove remaining punctuation
      8. tokenise and drop English stop words (negation words are kept)

    Steps 3 and 4 must run before punctuation removal: they match on
    apostrophes and colons, which step 7 deletes. Previously they ran after
    it and therefore never matched anything.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ("" if nothing is left).
    """
    text = text.lower()

    # remove URLs
    text = re.sub(r"http\S+", "", text)

    # Treat the typographic apostrophe like the ASCII one so that "can’t"
    # is expanded as well (it would otherwise be deleted as non-ASCII).
    text = text.replace("\u2019", "'")

    # expand contractions
    for pattern, replacement in _CONTRACTION_RULES:
        text = re.sub(pattern, replacement, text)

    # remove :shortcode: tokens
    text = re.sub(r':\w+:', '', text)

    # remove emojis / any other non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)

    # remove numbers
    text = re.sub(r"\d", "", text)

    # remove punctuation
    text = re.sub(r"[^\w\s]", "", text)

    # remove stopwords; joining with a single space also collapses whitespace
    removable = _stop_words_to_remove()
    kept_tokens = [token for token in word_tokenize(text) if token not in removable]
    return " ".join(kept_tokens)


In [117]:
df["text"] = df["text"].apply(preprocess_text)

In [118]:
def remove_emojis(column):
    """Delete every run of non-ASCII characters from a string.

    Despite its name, ``column`` is a single string value; the function is
    meant to be applied element-wise. Any character outside the ASCII range
    is removed, not only emojis.
    """
    non_ascii_run = re.compile(r'[^\x00-\x7F]+')
    return non_ascii_run.sub('', column)

# Strip non-ASCII characters from each of the user_* text columns in turn.
for user_text_col in ("user_name", "user_location", "user_description"):
    df[user_text_col] = df[user_text_col].apply(remove_emojis)


In [119]:
# Clean the user_location column
# Strip punctuation, i.e. every character that is neither a word character
# nor whitespace. regex=True must be explicit: since pandas 2.0
# Series.str.replace treats the pattern as a literal string by default, which
# turned this line into a no-op. The raw string avoids invalid-escape warnings.
df['user_location'] = df['user_location'].str.replace(r'[^\w\s]', '', regex=True)
df = df[df['user_location'].notna()]
# All remaining locations joined into one space-separated string.
text = " ".join(df['user_location'].values)

Background
Problem statement: Retail businesses, such as suppliers of epidemic-prevention products, need to consider the future impact of COVID-19 in order to develop appropriate marketing strategies and risk management plans.

Real-world impact: Retail businesses need to be prepared to adjust their business models and sales strategies quickly. COVID-19 has forced consumers to rely more heavily on online shopping, which means retailers need to further strengthen their website capabilities, such as virtual fitting rooms and online payment options. In addition, retailers need to strengthen their supply chains to ensure timely delivery and adequate inventory.
By using our semantic clustering model for COVID-19, retailers can make preliminary predictions about consumers’ buying habits and spending patterns.



In [120]:
# interactive widget
def unique_sorted_values(array):
    """Return the distinct values of ``array`` as a list in ascending order.

    ``array`` must provide ``.unique()`` whose result has ``.tolist()``
    (for example a pandas Series).
    """
    return sorted(array.unique().tolist())

In [121]:
# Country names used as selectable locations (a plain list, not a DataFrame).
df_location = [
    'United States',
    'United Kingdom',
    'India',
    'Canada',
    'Switzerland',
    'China',
]

### Sentiment Analysis

In [123]:
dropdown_country = widgets.Dropdown(options = unique_sorted_values(pd.Series(df_location)))
output_country = widgets.Output()


def _sentiment_label(score):
    """Map a VADER compound score to 'Positive', 'Negative' or 'Neutral'."""
    if score > 0:
        return 'Positive'
    if score < 0:
        return 'Negative'
    return 'Neutral'


def dropdown_country_eventhandler(change):
    """Redraw the sentiment count plot for the newly selected country.

    Parameters
    ----------
    change : ipywidgets change object
        ``change.new`` holds the country picked in ``dropdown_country``.
        Rows are selected by exact equality with ``df.user_location``.
    """
    output_country.clear_output()
    with output_country:
        # .copy() gives an independent frame, so the two columns added below
        # are not written onto a slice of `df` (SettingWithCopyWarning).
        country_df = df.loc[df.user_location == change.new].copy()

        # Nothing to score or plot when no tweet matches the selection.
        if country_df.empty:
            print(f"No tweets found for location '{change.new}'.")
            return

        # sentiment analysis using the VADER lexicon
        sia = SentimentIntensityAnalyzer()
        country_df['sentiment_score'] = country_df['text'].apply(
            lambda tweet: sia.polarity_scores(tweet)['compound']
        )
        country_df['sentiment'] = country_df['sentiment_score'].apply(_sentiment_label)

        # Plot the count of each sentiment. The column is passed via data=/x=
        # keywords: seaborn >= 0.12 interprets the first positional argument
        # as `data`, so the previous positional Series was not used as x.
        fig, ax = plt.subplots(figsize=(6, 4))
        sns.countplot(
            data=country_df,
            x='sentiment',
            order=['Positive', 'Neutral', 'Negative'],
            ax=ax,
        )
        ax.set_xlabel("Sentiment")
        ax.set_ylabel("Count")
        ax.set_title("Count of Sentiments in the Tweet dataset")
        plt.show()


# The handler fires only when the dropdown value changes.
dropdown_country.observe(dropdown_country_eventhandler, names = 'value')
display(dropdown_country)

display(output_country)

Dropdown(options=('Canada', 'China', 'India', 'Switzerland', 'United Kingdom', 'United States'), value='Canada…

Output()

Retailers can explore the sentiment analysis for different countries. By selecting a specific country, retailers can see whether that country's overall attitude towards COVID-19 is positive, negative, or neutral. This helps them decide which countries may have high demand for their products and which could serve as raw-material processing locations, so that they can seize business opportunities and develop their import and export trade.

### Wordclouds

In [124]:
dropdown_country1 = widgets.Dropdown(options = unique_sorted_values(pd.Series(df_location)))
output_country1 = widgets.Output()

def dropdown_country_eventhandler1(change):
    """Redraw the word cloud for the newly selected country.

    Parameters
    ----------
    change : ipywidgets change object
        ``change.new`` holds the country picked in ``dropdown_country1``.
        Rows are selected by exact equality with ``df.user_location``.
    """
    output_country1.clear_output()
    with output_country1:
        # Remove stop words and calculate the frequency of the words
        country_df = df.loc[df.user_location== change.new]
        # A set gives constant-time membership tests in the filter below
        # (the list returned by NLTK is scanned linearly for every token).
        stop_words = set(nltk.corpus.stopwords.words('english'))
        all_words = ' '.join(country_df['text'].tolist())
        lowered_tokens = (token.lower() for token in nltk.word_tokenize(all_words) if token.isalpha())
        words = [token for token in lowered_tokens if token not in stop_words]
        fdist = nltk.FreqDist(words)

        # WordCloud.generate_from_frequencies raises on an empty table, so
        # report the situation instead of surfacing a traceback in the widget.
        if not fdist:
            print(f"No words to display for location '{change.new}'.")
            return

        # Plot the word cloud of the most common words
        wordcloud = WordCloud(width=800, height=400).generate_from_frequencies(fdist)
        plt.figure(figsize=(10, 5))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis("off")
        plt.show()

# The handler fires only when the dropdown value changes.
dropdown_country1.observe(dropdown_country_eventhandler1, names = 'value')
display(dropdown_country1)

display(output_country1)


Dropdown(options=('Canada', 'China', 'India', 'Switzerland', 'United Kingdom', 'United States'), value='Canada…

Output()

By selecting different countries, retailers can obtain a word cloud of the most frequent words in that country's COVID-19 tweets. From the keywords our model surfaces, retailers can infer aspects of consumers’ circumstances, such as their financial situation, emotional needs, and demand for epidemic-prevention supplies. This helps them adjust product prices, advertising content, and inventory accordingly.
