# The analysis of Twitter users' reception of Summer Game Fest

## Downloading tweets with snscrape

In [1]:
# Scraping the number of comments, retweets and favourites requires the developer version of snscrape.
import snscrape.modules.twitter as sntwitter
import pandas as pd

Downloaded tweets need to: include the hashtag "SummerGameFest", be in the English language and have some form of reaction from the users. I've decided to only use tweets coming from non-verified users (the `-filter:verified` part of the query excludes verified accounts) - that'll help me avoid comments from the press or the organizers of the event. This tutorial on GitHub has helped me with the syntax of Twitter filters: https://github.com/igorbrigadir/twitter-advanced-search.

In [None]:
# Rows collected from the scraper, one list per tweet.
tweets = []

# Upper bound on the number of tweets to download.
max_tweets = 20000

# Twitter advanced-search query: event hashtag, English only, fixed date window,
# at least one engagement, and verified accounts excluded.
search_query = '#SummerGameFest lang:en since:2022-06-09 until:2022-06-14 filter:has_engagement -filter:verified'

for count, tweet in enumerate(sntwitter.TwitterSearchScraper(search_query).get_items()):
    # `count` is zero-based, so stop as soon as max_tweets rows have been stored
    # (the previous `>` comparison collected max_tweets + 1 tweets).
    if count >= max_tweets:
        break
    tweets.append([tweet.date, tweet.user.username, tweet.content, tweet.replyCount, tweet.retweetCount, tweet.likeCount])

twitter_df = pd.DataFrame(tweets, columns = ["Date", "User", "Text", "Replies", "Retweets", "Likes"])

In [None]:
# (rows, columns) of the scraped frame - a quick check of how many tweets were downloaded.
twitter_df.shape

In [None]:
# Preview the first rows of the scraped frame.
twitter_df.head()

In [None]:
# Preview the last rows of the scraped frame.
twitter_df.tail()

In [None]:
#twitter_df.to_csv("game_fest.csv")

## roBERTa model for sentiment analysis of tweets

For sentiment analysis, I employed the model from: https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from scipy.special import softmax

# Class names indexed by the model's output position; get_sentiment() uses the
# index of the top score to pick an entry from this list.
# NOTE(review): the order presumably follows the model card's mapping
# (0 = negative, 1 = neutral, 2 = positive) - confirm against the model card.
labels = ["negative", "neutral", "positive"]

def preprocess(text):
    """
    Replace user mentions and links in a tweet with generic placeholders.

    The text is split on single spaces; every piece that starts with '@'
    (and is longer than one character) becomes '@user', every piece that
    starts with 'http' becomes 'http', and all other pieces are kept as-is.
    The pieces are then joined back together with single spaces.
    """
    def _mask(piece):
        # A bare '@' is not treated as a mention.
        if piece.startswith('@') and len(piece) > 1:
            return '@user'
        if piece.startswith('http'):
            return 'http'
        return piece

    return " ".join(_mask(piece) for piece in text.split(" "))


# Hugging Face model identifier, shared by the model and the tokenizer.
roberta = "cardiffnlp/twitter-roberta-base-sentiment"

# Load the pretrained sequence-classification model for that identifier.
model = AutoModelForSequenceClassification.from_pretrained(roberta)

In [None]:
# Tokenizer matching the pretrained model; used by get_sentiment() to encode tweets.
auto_tokenizer = AutoTokenizer.from_pretrained(roberta)

import numpy as np

def get_sentiment(tweet):
    """
    Classify a single tweet as "negative", "neutral" or "positive".

    Input: text of one tweet (str)
    Usage: the tweet is normalised with preprocess(), encoded with the
           module-level `auto_tokenizer`, scored by the module-level `model`,
           and the highest-scoring class index is looked up in `labels`
    Output: the label (str) of the highest-scoring class
    """
    # Local import keeps this cell self-contained; the 'pt' tensors requested
    # below already require PyTorch to be installed.
    import torch

    # Truncate explicitly so an unusually long input cannot exceed the model's
    # position limit. NOTE(review): 512 assumes the usual RoBERTa-base limit -
    # confirm against model.config.max_position_embeddings.
    encoded_tweet = auto_tokenizer(
        preprocess(tweet),
        return_tensors='pt',
        truncation=True,
        max_length=512,
    )

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        output = model(**encoded_tweet)

    # output[0] holds the logits; [0] selects the only item in the batch.
    scores = output[0][0].detach().numpy()

    # Softmax is monotonic, so the arg-max of the raw logits is the same class
    # that the softmax-then-sort ranking would put first.
    return labels[int(np.argmax(scores))]

## Cleaning and processing data

In [None]:
import pandas as pd

In [None]:
#game_fest = pd.read_csv("game_fest.csv")

Creating a column with the sentiment of tweets. This process takes some time to finish.

In [None]:
#game_fest["Sentiment"] = [get_sentiment(tweet) for tweet in game_fest["Text"]]
#game_fest.to_csv("game_fest.csv")

In order to keep the research family friendly, I've censored profanities in the text (it is optional, though). This process can also take quite some time.

In [None]:
#from better_profanity import profanity

#game_fest["Text"] = game_fest["Text"].map(lambda x: profanity.censor(x))

#game_fest.to_csv("game_fest_censor.csv")

In [None]:
# Load the dataset from disk; this is the file name written by the
# (commented-out) profanity-censoring cell.
game_fest = pd.read_csv("game_fest_censor.csv")

After deleting duplicates, I am left with 14683 tweets.

In [None]:
# Keep the first occurrence of every distinct tweet text.
# .copy() makes the result an independent frame, so the column assignments in
# the following cells do not raise SettingWithCopyWarning.
game_fest = game_fest.drop_duplicates(subset=["Text"]).copy()
game_fest.shape

For tokenization I've chosen the TweetTokenizer from nltk.tokenize module, as it is the best suited tool for tokenizing tweets.

In [None]:
from nltk.tokenize import TweetTokenizer
# strip_handles=True drops @mentions while tokenizing.
tokenizer = TweetTokenizer(strip_handles=True)

# Tokenize every tweet and lower-case each resulting token in a single pass.
game_fest["Text"] = [
    [piece.lower() for piece in tokenizer.tokenize(raw_text)]
    for raw_text in game_fest["Text"]
]

All the links have been deleted with regular expressions.

In [None]:
import re
# Blank out everything from "http" up to the next whitespace inside each token.
# Tokens that were pure links become empty strings.
game_fest["Text"] = game_fest["Text"].map(
    lambda tokens: [re.sub(r'http\S+', '', word) for word in tokens]
)

Deleting stopwords.

In [None]:
from nltk.corpus import stopwords
# A set gives constant-time membership checks for the English stopword list.
stop_words = set(stopwords.words('english'))

# Keep only the tokens that are not stopwords.
game_fest["Text"] = game_fest["Text"].map(
    lambda tokens: [word for word in tokens if word not in stop_words]
)

Lemmatization of the tokens.

In [None]:
from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Replace every token with its WordNet lemma (default part of speech).
game_fest["Text"] = game_fest["Text"].map(
    lambda tokens: [lemmatizer.lemmatize(word) for word in tokens]
)

Getting rid of punctuation and numbers.

In [None]:
import string

# Three per-token steps fused into one pass:
#   1. drop tokens that occur inside string.punctuation (this is a substring
#      test, so empty strings and runs such as "()" are dropped as well),
#   2. strip commas, periods, exclamation and question marks from what is left,
#   3. drop tokens that are purely numeric after that stripping.
game_fest["Text"] = [
    [
        stripped
        for stripped in (
            re.sub('[,\\.!?]', '', word)
            for word in tokens
            if word not in string.punctuation
        )
        if not stripped.isnumeric()
    ]
    for tokens in game_fest["Text"]
]

Deleting tokens shorter than 2 characters (i.e. single-character tokens) will help avoid emoticons, which have no purpose at this stage of the analysis.

In [None]:
# Keep only tokens made of at least two characters.
game_fest["Text"] = game_fest["Text"].map(
    lambda tokens: [word for word in tokens if len(word) >= 2]
)

Deleting words common in hashtags related to the event.

In [None]:
# Event-related hashtag words; any token that contains one of them as a
# substring is removed.
delete_words = ['summergamefest', 'pcgamingshow', 'xboxbethesda', 'dayofthedevs']
game_fest["Text"] = game_fest["Text"].map(
    lambda tokens: [
        word for word in tokens
        if all(banned not in word for banned in delete_words)
    ]
)

N-grams will help in interpretation of topics from LDA.

In [None]:
from gensim.models import Phrases
from gensim.models import phrases

# Learn bigram collocations from the token lists, then learn a second model on
# the bigram-transformed corpus so that it can form trigrams.
bigram = Phrases(game_fest["Text"], min_count=5, threshold=100)
trigram = Phrases(bigram[game_fest["Text"]], threshold=100)

# Frozen versions of both models, used only for applying them.
bigram_mod = phrases.Phraser(bigram)
trigram_mod = phrases.Phraser(trigram)

# Apply the bigram model first and the trigram model on its output.
game_fest["Text"] = [
    trigram_mod[bigram_mod[tokens]] for tokens in game_fest["Text"]
]

I'll decide the minimum length of a tweet based on the countplot of tweets' lengths.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

# Number of tokens left in each tweet after preprocessing.
len_tweets = [len(tweet) for tweet in game_fest["Text"]]

# Silence DeprecationWarning messages from here on.
warnings.filterwarnings("ignore", category=DeprecationWarning) 

plt.figure(figsize = (10,5))
sns.set_theme(style="darkgrid")
# One bar per distinct tweet length (counted in tokens).
sns.countplot(x = len_tweets)
# Axis labels are in Polish: "Liczba słów" = "Number of words",
# "Liczba tweetów" = "Number of tweets".
plt.xlabel('Liczba słów', fontsize = 16)
plt.ylabel('Liczba tweetów', fontsize = 16)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)

plt.tight_layout()
#plt.savefig('tweets_lengths.png')
plt.show()

Tweets with at least 4 words will be left in the dataset. This gives me 11855 tweets to analyze.

In [None]:
# Keep tweets with more than 3 tokens (i.e. at least 4).
# .copy() detaches the filtered rows from the original frame so that the new
# columns added in later cells do not raise SettingWithCopyWarning.
game_fest = game_fest[game_fest['Text'].map(len)>3].copy()
game_fest.shape

## Timeline tweets' sentiment

In [None]:
# Parse the publication timestamps once and reuse them for both helper columns.
published = pd.to_datetime(game_fest['Date'])

# 'Dates' holds the calendar day as text; 'Hour' holds the timestamp rounded
# down to the full hour, trimmed to its first five characters ("HH:MM").
game_fest['Dates'] = published.dt.date.map(str)
game_fest['Hour'] = published.dt.floor('H').dt.time.map(str).str[:5]

# Overwrite 'Date' with the hourly bucket label "<day> <HH:MM>".
game_fest['Date'] = game_fest['Dates'] + ' ' + game_fest['Hour']

# Tweets per hourly bucket and sentiment: one row per bucket, one column per
# sentiment label; combinations that never occur are filled with 0.
ts_df = (
    game_fest.groupby(by=["Date"])["Sentiment"]
    .value_counts()
    .unstack()
    .fillna(0)
)

Publication time is in the UTC time zone.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

# Silence DeprecationWarning messages from here on.
warnings.filterwarnings("ignore", category=DeprecationWarning)

plt.figure(figsize = (10,6))
sns.set_theme(style="darkgrid")

# (ts_df column, line colour, legend text). Legend texts are in Polish:
# pozytywny = positive, neutralny = neutral, negatywny = negative.
# Every line carries its own label, so the legend no longer depends on the
# order in which artists (lines and any error bands) were added to the axes.
sentiment_lines = [
    ("positive", "blue", "pozytywny"),
    ("neutral", "green", "neutralny"),
    ("negative", "red", "negatywny"),
]
for sentiment_column, line_color, legend_text in sentiment_lines:
    ts_plot = sns.lineplot(x='Date', y=sentiment_column, color=line_color,
                           label=legend_text, data = ts_df)

ts_plot.set_xlabel("")
# "Liczba tweetów" = "Number of tweets".
ts_plot.set_ylabel("Liczba tweetów", fontsize = 16)
plt.xticks(fontsize=14, rotation='vertical')
plt.yticks(fontsize=14)
# Log scale with the lower limit at 1 tweet.
ts_plot.set(yscale='log')
ts_plot.set(ylim=1)

# Dashed vertical line at the first hourly bucket of each day.
for day_start in ("2022-06-09 00:00", "2022-06-10 00:00", "2022-06-11 00:00",
                  "2022-06-12 00:00", "2022-06-13 00:00"):
    plt.axvline(day_start, color='black', linestyle="--")
plt.legend(fontsize = 16)

# Show only every 24th tick label to keep the x axis readable.
for ind, label in enumerate(ts_plot.get_xticklabels()):
    label.set_visible(ind % 24 == 0)

plt.tight_layout()
#plt.savefig('tweets_time.png')
plt.show()

The reception of the event was generally positive. The most popular days were the 9th and 12th of June.

## Wordcloud

In [None]:
# Separate frames for the tweets labelled positive and negative.
game_fest_pos = game_fest[game_fest["Sentiment"].eq("positive")]
game_fest_neg = game_fest[game_fest["Sentiment"].eq("negative")]

In [None]:
# One space-joined string per tweet, for each sentiment.
long_pos = [' '.join(tweet) for tweet in game_fest_pos['Text']]
long_neg = [' '.join(tweet) for tweet in game_fest_neg['Text']]

# Vocabulary = distinct tokens across all tweets of a sentiment.
# (Taking set() of the joined tweets counted distinct tweets, not words.)
vocab_pos = {token for tweet in game_fest_pos['Text'] for token in tweet}
vocab_neg = {token for tweet in game_fest_neg['Text'] for token in tweet}

print("Vocabulary's lenght in postive tweets: ", len(vocab_pos))

print("Vocabulary's lenght in negative tweets:: ", len(vocab_neg))

# Single comma-separated text per sentiment, used as word cloud input.
long_pos = ','.join(long_pos)
long_neg = ','.join(long_neg)

### Positive

In [None]:
from wordcloud import WordCloud
# Build the cloud from the joined positive tweets: white background,
# 1600x800 canvas, at most 500 words.
wordcloud = WordCloud(width=1600, height=800, background_color="white", max_words=500).generate(long_pos)
plt.figure(figsize=(10,5))
# Hide the axes and remove padding so only the cloud image is visible.
plt.axis("off")
plt.tight_layout(pad=0)
plt.imshow(wordcloud)
#plt.savefig('pos_wordcloud.png')

### Negative

In [None]:
# Build the cloud from the joined negative tweets: black background,
# 1600x800 canvas, at most 500 words. Rebinds the `wordcloud` name.
wordcloud = WordCloud(width=1600, height=800, background_color="black", max_words=500).generate(long_neg)
plt.figure(figsize=(10,5))
# Hide the axes and remove padding so only the cloud image is visible.
plt.axis("off")
plt.tight_layout(pad=0)
plt.imshow(wordcloud)
#plt.savefig('neg_wordcloud.png')

## Model LDA

In [None]:
from gensim.models import LdaModel
from gensim.corpora import Dictionary

# Token <-> id mapping built from the preprocessed tweets; `dictionary` and
# `id2word` are two names for the same object.
id2word = Dictionary(game_fest["Text"])
dictionary = id2word

# Bag-of-words representation of every tweet.
corpus = list(map(dictionary.doc2bow, game_fest["Text"]))

### Coherence score

In [None]:
import tqdm
from gensim.models import CoherenceModel
import numpy as np
import warnings

In [None]:
def compute_coherence_values(corpus, dictionary, k, a, b, texts=None):
    """
    Train an LDA model with the given hyperparameters and return its c_v
    coherence score.

    Input:
        corpus: bag-of-words corpus the model is trained on
        dictionary: id-to-word mapping, used both for training and for the
            coherence computation
        k: number of topics
        a: value passed to LdaModel as `alpha`
        b: value passed to LdaModel as `eta`
        texts: tokenized documents used by the c_v measure; when omitted the
            module-level game_fest['Text'] column is used (the previous
            behaviour)
    Output: coherence score of the trained model
    """
    if texts is None:
        texts = game_fest['Text']

    lda_model = LdaModel(corpus=corpus,
                         id2word=dictionary,
                         num_topics=k,
                         random_state=100,
                         chunksize=len(corpus),  # whole corpus in one chunk
                         passes=10,
                         alpha=a,
                         eta=b)

    # Use the dictionary the model was trained with rather than the global
    # `id2word`, so the function honours its own argument.
    coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')

    return coherence_model_lda.get_coherence()

In [None]:
grid = {}
grid['Validation_Set'] = {}

# Candidate topic counts: 4, 6, ..., 20 (the upper bound is exclusive).
min_topics = 4
max_topics = 22
step_size = 2
topics_range = range(min_topics, max_topics, step_size)

warnings.filterwarnings("ignore", category=DeprecationWarning)

# Only the full corpus is evaluated.
num_of_tweets = len(corpus)
corpus_sets = [corpus]
corpus_title = ['100% Corpus']

# Hyperparameter candidates for the document-topic (alpha) and
# topic-word (beta / eta) priors.
alpha=['symmetric', 'asymmetric']
beta=['symmetric']

# One list per result column; converted to a DataFrame once the search is done.
model_results = {
    column: []
    for column in ('Validation_Set', 'Topics', 'Alpha', 'Beta', 'Coherence')
}

pbar = tqdm.tqdm(total=len(corpus_sets)*len(topics_range)*len(alpha)*len(beta))

for i, corpus_set in enumerate(corpus_sets):
    for k in topics_range:
        for a in alpha:
            for b in beta:
                print(i, k, a, b)

                cv = compute_coherence_values(corpus=corpus_set, dictionary=id2word,
                                              k=k, a=a, b=b)

                # Result row for this parameter combination.
                latest = {
                    'Validation_Set': corpus_title[i],
                    'Topics': k,
                    'Alpha': a,
                    'Beta': b,
                    'Coherence': cv,
                }
                for column, value in latest.items():
                    model_results[column].append(value)
                print(latest)

                pbar.update(1)
pbar.close()
model_results=pd.DataFrame(model_results)

### Final model

In [None]:
# Settings of the final model.
num_topics = 20
passes = 10

# Keyword arguments of the final model collected in one place; the whole
# corpus is processed as a single chunk.
final_lda_kwargs = dict(
    corpus=corpus,
    id2word=id2word,
    num_topics=num_topics,
    passes=passes,
    chunksize=len(corpus),
    alpha='asymmetric',
    eta='symmetric',
    random_state=100,
    per_word_topics=True,
)

lda_model = LdaModel(**final_lda_kwargs)

### Topic visualisation

In [None]:
import pyLDAvis.gensim_models
import warnings

# Silence FutureWarning messages from here on.
warnings.simplefilter(action='ignore', category=FutureWarning)

pyLDAvis.enable_notebook()

# Prepare the visualisation once and keep the result. The earlier extra
# prepare() call discarded its return value and only repeated the computation.
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
#pyLDAvis.save_html(vis, 'lda.html')

In [None]:
# Display the prepared pyLDAvis visualisation as the cell output.
vis

In [None]:
from matplotlib import pyplot as plt
from wordcloud import WordCloud
import matplotlib.colors as mcolors

# Hex colours of the Tableau palette, one per plotted topic.
cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]


def _single_color_func(color):
    """Return a WordCloud color_func that paints every word with `color`."""
    return lambda *args, **kwargs: color


# Top 30 (word, weight) pairs of the three selected topics.
# NOTE(review): the model's topic ids are 0, 7 and 8 while the titles read
# "1", "6" and "8" - presumably the titles follow the numbering shown in the
# pyLDAvis view; confirm the mapping.
topics = [lda_model.show_topic(0, 30), lda_model.show_topic(7, 30), lda_model.show_topic(8, 30)]
topic_nums = ["1", "6", "8"]

fig, axes = plt.subplots(1, 3, figsize=(12,16), sharex=True, sharey=True)

for i, (ax, num) in enumerate(zip(axes.flatten(), topic_nums)):
    # The colour is bound when the cloud is created, so it no longer depends
    # on a global loop variable being in scope when the cloud is rendered.
    cloud = WordCloud(background_color='white',
                      width=1200,
                      height=1600,
                      max_words=30,
                      colormap='tab10',
                      color_func=_single_color_func(cols[i % len(cols)]),
                      prefer_horizontal=1.0)
    cloud.generate_from_frequencies(dict(topics[i]), max_font_size=300)

    # Draw directly on the subplot instead of going through plt.gca().
    ax.imshow(cloud)
    ax.set_title('Topic ' + num, fontdict=dict(size=26))
    ax.axis('off')

plt.subplots_adjust(wspace=0, hspace=0)
plt.tight_layout(pad=0)
fig.savefig('topics_lda.png', bbox_inches='tight')