### **Importing the libraries required for this project.**


In [445]:
# The Pandas library will be used for preprocessing and organizing text data into a data frame for further analysis.

import pandas as pd

In [446]:
# Numpy will be used for performing numerical computations on textual data converted into numerical vectors.

import numpy as np

In [447]:
# The string library will be used for string manipulation.

import string

In [448]:
# The re (regular expression) library will be used for pattern matching in strings.

import re

In [None]:
# NLTK provides tools and algorithms for preprocessing, feature extraction, and sentiment classification of text data.

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
nltk.download('punkt')

In [450]:
# The TextBlob library will be used with the re library to preprocess text data and classify the sentiment of the text.

from textblob import TextBlob

In [451]:
# The collections library provides tools to count and store occurrences of words and phrases.

from collections import Counter

In [452]:
# Wordcloud will be used to create word clouds from text data.

from wordcloud import WordCloud

In [453]:
# Matplotlib will be used for data visualisation. 

import matplotlib.pyplot as plt


In [454]:
# Seaborn will be used for data visualisation. 

import seaborn as sns




---



### **Reading in the data**

In [455]:
df = pd.read_csv("/content/drive/MyDrive/Colab/4 Chat GPT Sentiment/dataset/chatgpt_tweets.csv")



---



### **Exploring the data**

In [456]:
# Viewing the first five rows of the dataset for a brief overview of the structure and composition of the DataFrame. 

df.head()

Unnamed: 0,tweet_id,created_at,like_count,quote_count,reply_count,retweet_count,tweet,country,photo_url,city,country_code
0,1598014056790622225,2022-11-30 18:00:15+00:00,2,0,0,0,ChatGPT: Optimizing Language Models for Dialog...,,,,
1,1598014522098208769,2022-11-30 18:02:06+00:00,12179,889,1130,3252,"Try talking with ChatGPT, our new AI system wh...",,,,
2,1598014741527527435,2022-11-30 18:02:58+00:00,2,0,0,1,ChatGPT: Optimizing Language Models for Dialog...,,https://pbs.twimg.com/media/Fi1J8HbWAAMv_yi.jpg,,
3,1598015493666766849,2022-11-30 18:05:58+00:00,561,8,25,66,"THRILLED to share that ChatGPT, our new model ...",,https://pbs.twimg.com/media/Fi1Km3WUYAAfzHS.jpg,,
4,1598015509420994561,2022-11-30 18:06:01+00:00,1,0,0,0,"As of 2 minutes ago, @OpenAI released their ne...",,,,


<p>

In [457]:
# Identifying the types of data in each column. 

df.dtypes

tweet_id          int64
created_at       object
like_count        int64
quote_count       int64
reply_count       int64
retweet_count     int64
tweet            object
country          object
photo_url        object
city             object
country_code     object
dtype: object

<p>

In [458]:
# A summary of the dataframe, including the number of non-null values and the data type of each column.

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 219294 entries, 0 to 219293
Data columns (total 11 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   tweet_id       219294 non-null  int64 
 1   created_at     219294 non-null  object
 2   like_count     219294 non-null  int64 
 3   quote_count    219294 non-null  int64 
 4   reply_count    219294 non-null  int64 
 5   retweet_count  219294 non-null  int64 
 6   tweet          219294 non-null  object
 7   country        3648 non-null    object
 8   photo_url      68446 non-null   object
 9   city           3648 non-null    object
 10  country_code   3645 non-null    object
dtypes: int64(5), object(6)
memory usage: 18.4+ MB


<p>

In [459]:
# Calculating the percentage of missing values in each column, and printing the column name and percentage of missing values in that column.

for col in df.columns:
  pct_missing = np.mean(df[col].isnull())
  print('{} - {}%'.format(col, pct_missing))

tweet_id - 0.0%
created_at - 0.0%
like_count - 0.0%
quote_count - 0.0%
reply_count - 0.0%
retweet_count - 0.0%
tweet - 0.0%
country - 0.9833647979424882%
photo_url - 0.6878801973606209%
city - 0.9833647979424882%
country_code - 0.9833784782073381%


<p>



In [460]:
# Returns the sum of missing values in each column of the dataframe.

df.isnull().sum()


tweet_id              0
created_at            0
like_count            0
quote_count           0
reply_count           0
retweet_count         0
tweet                 0
country          215646
photo_url        150848
city             215646
country_code     215649
dtype: int64



---



### **Data manipulation**

In [461]:
# Making a copy of the dataframe for further exploration and data manipulation. 

df_new=df.copy()

<p>



In [462]:
# Dropping the columns 'country', 'photo_url', 'city', and 'country_code' from the dataframe using the drop() method.

df_new = df.drop(['country', 'photo_url', 'city', 'country_code'], axis=1)


<p>

In [463]:
df_new.head()

Unnamed: 0,tweet_id,created_at,like_count,quote_count,reply_count,retweet_count,tweet
0,1598014056790622225,2022-11-30 18:00:15+00:00,2,0,0,0,ChatGPT: Optimizing Language Models for Dialog...
1,1598014522098208769,2022-11-30 18:02:06+00:00,12179,889,1130,3252,"Try talking with ChatGPT, our new AI system wh..."
2,1598014741527527435,2022-11-30 18:02:58+00:00,2,0,0,1,ChatGPT: Optimizing Language Models for Dialog...
3,1598015493666766849,2022-11-30 18:05:58+00:00,561,8,25,66,"THRILLED to share that ChatGPT, our new model ..."
4,1598015509420994561,2022-11-30 18:06:01+00:00,1,0,0,0,"As of 2 minutes ago, @OpenAI released their ne..."


<p>



In [464]:
# Calculating the number of words in each tweet and adding a new column for 'word count' to the dataframe.

def number_of_words(df_new):
    df_new['word_count'] = df_new['tweet'].apply(lambda x : len(str(x).split(" ")))

<p>



In [465]:
# Calculating the total number of characters in each tweet (excluding spaces, punctuation marks, and other special characters).

def char_count(text):
    charc = 0
    for char in text:
        if char != " ":
            charc += 1
    return charc

<p>



In [466]:
# Adding a new column 'stopwords_count' to the data frame that counts the number of stop words in each row of the 'tweet' column.

stop_words = stopwords.words('english')
def num_of_stopwords(df_new):
    df_new['stopwords_count'] = df_new['tweet'].apply(lambda x: len([x for x in x.split() if x in stop_words]))

<p>



In [467]:
# Adding a new column 'hashtag_count' to the data frame that counts the number of hashtags in each row of the 'tweet' column.

def num_of_hashtags(df_new):
    df_new['hashtag_count'] = df_new['tweet'].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
  

<p>



In [468]:
# Defining a function 'feat_extract' that adds four new columns to the data frame using the functions 'num_of_hashtags', 'number_of_words', 'char_count', and 'num_of_stopwords'.

def feat_extract(df_new):
    num_of_hashtags(df_new)
    number_of_words(df_new)
    df_new['char_count']=df_new['tweet'].apply(char_count)
    num_of_stopwords(df_new)



---



### **Text preprocessing**

In [469]:
# Defining a function called 'remove_emoji' that takes a string as input and uses a regular expression to remove any Unicode characters that match specific ranges. 

def remove_emoji(string):
        emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # Emoticons
                           u"\U0001F300-\U0001F5FF"  # Symbols & Pictographs
                           u"\U0001F680-\U0001F6FF"  # Transport & Map Symbols
                           u"\U0001F1E0-\U0001F1FF"  # Regional Indicator Symbols
                           u"\U00002702-\U000027B0"  # Miscellaneous Symbols and Arrows
                           u"\U000024C2-\U0001F251"  # Enclosed Characters
                           "]+", flags=re.UNICODE)
        return emoji_pattern.sub(r'', string) 

<p>



In [470]:
# Defining a function called 'remove_apostrophe' function which uses a regular expression to remove apostrophes from a string. 

def remove_apostrophe(string):
    apostrophe_pattern = re.compile(r"[’‘]")  # Matches apostrophes
    return apostrophe_pattern.sub('', string)

<p>



In [471]:
# The 'hyperlinks' function removes hyperlinks and various HTML tags from a string.

def hyperlinks(text):
    temp = re.sub("<[a][^>]*>(.+?)</[a]>", 'Link.',text)
    temp = re.sub(r'http\S+', '', temp)
    temp = re.sub('&gt;', "", temp) 
    temp = re.sub('&#x27;', "'", temp) 
    temp = re.sub('&#x2F;', ' ', temp)
    temp = re.sub('<p>', ' ', temp) 
    temp = re.sub('<i>', ' ', temp)
    temp = re.sub('</i>', '', temp) 
    temp = re.sub('&#62;', '', temp)
    temp = re.sub("\n", '', temp)
    return temp

<p>



In [472]:
# This 'clean_tweet' function takes a tweet text as input and performs several cleaning tasks to preprocess the text.

def clean_tweet(tweet):
    temp=tweet.lower() # Converts the tweet text to lowercase.
    temp = re.sub("'", "", temp) # Avoids removing contractions in English.
    temp = re.sub("@[A-Za-z0-9_]+","", temp) # Removes apostrophes and mentions (e.g., "@username").
    temp = re.sub("chatgpt","",temp) # Removes the word "chatgpt".
    temp = re.sub(r'http\S+', '', temp) # Removes hyperlinks.
    temp = re.sub('[()!?]', ' ', temp) # Replaces all parentheses, exclamation marks, and question marks with a space.
    temp = re.sub('\[.*?\]',' ', temp) # Removes square brackets (i.e., removes text that is inside square brackets).
    punc=string.punctuation # Removes punctuation marks. 
    temp=temp.translate(str.maketrans('','',punc)) #Removes all the punctuation marks from the given string 'temp'.
    
    #Removing stopwords.
    new_list=[]
    words=word_tokenize(temp)
    sws=stopwords.words('english')
    for word in words:
        if word not in sws:
            new_list.append(word)
    
    temp=' '.join(new_list)
    return temp

<p>



In [473]:
# Using the TextBlob library to correct any spelling errors in the dataset.

def spell_correct(df2):
    df2['tweet'].apply(lambda x: str(TextBlob(x).correct()))



---



### **Lemmatization**

In [474]:
# This 'lemmatize' function tokenizes the text and lemmatizes each word using the WordNetLemmatizer from the nltk library, and then joins the resulting list of lemmatized words back into a string.

def lemmatize(text):    
    new_list=[]
    lemma=WordNetLemmatizer()
    words=word_tokenize(text)
    for word in words:
        new_list.append(lemma.lemmatize(word))
    
    return ' '.join(new_list)

<p>

In [None]:
#Applying the previously defined functions to the dataframe to remove hyperlinks, emojis, apostrophes, lemmatize the text and perform additional text cleaning.

feat_extract(df_new)

df_new['tweet'] = df_new['tweet'].apply(hyperlinks)
df_new['tweet'] = df_new['tweet'].apply(remove_emoji)
df_new['tweet'] = df_new['tweet'].apply(remove_apostrophe)
df_new['tweet'] = df_new['tweet'].apply(clean_tweet)
df_new['final_tweet'] = df_new['tweet'].apply(lemmatize)

<p>



In [None]:
df_new.head()




---



### **Top 10 most-engaged tweets**

In [None]:
top_tweets_df = df_new.sort_values(by='like_count', ascending=False)
top_tweets_df.head()



---



### **Sentiment Analysis**

In [None]:
# Initialises an instance of the SentimentIntensityAnalyzer class from the NLTK library.

sid=SentimentIntensityAnalyzer()

<p>



In [None]:
#Using the SentimentIntensityAnalyzer from the NLTK library to calculate the compound polarity score, neutrality score, negativity score, and positivity score for each tweet in the 'final_tweet' column.

df_new['sentiment_compound_polarity']=df_new.final_tweet.apply(lambda x:sid.polarity_scores(x)['compound'])
df_new['sentiment_neutral']=df_new.final_tweet.apply(lambda x:sid.polarity_scores(x)['neu'])
df_new['sentiment_negative']=df_new.final_tweet.apply(lambda x:sid.polarity_scores(x)['neg'])
df_new['sentiment_pos']=df_new.final_tweet.apply(lambda x:sid.polarity_scores(x)['pos'])
df_new['sentiment_type']=''

df_new.loc[df_new.sentiment_compound_polarity>0,'sentiment_type']='POSITIVE'
df_new.loc[df_new.sentiment_compound_polarity==0,'sentiment_type']='NEUTRAL'
df_new.loc[df_new.sentiment_compound_polarity<0,'sentiment_type']='NEGATIVE'
df_new.head()

<p>



In [None]:
# Exporting the output to csv.

df_new.to_csv('sentiment.csv', index=False)



---



### **Sentiment: Bar Chart**



In [None]:
colors = ["#90CECC", "#E8D5A1", "#E3BAB3"]

sns.barplot(data=df_new,x=df_new['sentiment_type'].value_counts().index,y=df_new['sentiment_type'].value_counts(), palette=colors)
plt.title('Sentiment breakdown', fontsize=12)




---



### **Sentiment: Pie Chart**

In [None]:
colors = ["#90CECC", "#E8D5A1", "#E3BAB3"]

counts = df_new['sentiment_type'].value_counts()
fig, ax = plt.subplots()
ax.pie(counts.values, labels=counts.index, autopct='%1.1f%%', startangle=90, colors=colors)
ax.set_title('Sentiment breakdown')
plt.show()



---



### **Conversation volume over time**

In [None]:
# Converting the date to a 'datetime' format
df_new['created_at'] = pd.to_datetime(df_new['created_at'])

# Grouping the data by day and calculating the count
df_daily_count = df_new.groupby(df_new['created_at'].dt.date).size()

# Plotting the data on a line chart
plt.figure(figsize=(20, 6))
plt.plot(df_daily_count.index, df_daily_count.values, color='#E8D5A1')
plt.title('Conversation volume over time')
plt.xlabel('Date')
plt.ylabel('Tweet Count')
plt.show()



---



### **Average sentiment over time**

In [None]:
#Converting the date to a 'datetime' format.
df_new['created_at'] = pd.to_datetime(df_new['created_at'])

#Grouping the average sentiment data by day. 
df_sentiment_over_time = df_new.groupby(pd.Grouper(key='created_at', freq='D')).mean(numeric_only=True)

#Plotting the data on a line chart. 
plt.figure(figsize=(20, 6))
plt.plot(df_sentiment_over_time.index, df_sentiment_over_time['sentiment_compound_polarity'], color='#E8D5A1')
plt.title('Sentiment Over Time')
plt.xlabel('Date')
plt.ylabel('Sentiment Compound Polarity')
plt.show()



---



### **Sentiment over time: Daily count per sentiment category**

In [None]:
# Group the data by daily time period and sentiment category, and calculating the count
df_tweet_count = df_new.groupby([pd.Grouper(key='created_at', freq='D'), 'sentiment_type']).size().unstack()

# Plot the count of tweets for each sentiment category over time
plt.figure(figsize=(20, 6))
plt.plot(df_tweet_count.index, df_tweet_count['POSITIVE'], label='Positive', color='#90CECC')
plt.plot(df_tweet_count.index, df_tweet_count['NEGATIVE'], label='Negative', color='#E3BAB3')
plt.plot(df_tweet_count.index, df_tweet_count['NEUTRAL'], label='Neutral', color='#E8D5A1')

plt.title('Daily Count of Tweets per Sentiment Category')
plt.xlabel('Date')
plt.ylabel('Tweet Count')
plt.legend()
plt.show()



---



### **Engagement over time**

In [None]:
# Converting the date to a 'datetime' format
df_new['created_at'] = pd.to_datetime(df_new['created_at'])

# Calculating the total engagement per day.
daily_engagement = df_new[['like_count', 'quote_count', 'reply_count', 'retweet_count']].sum(axis=1)

# Grouping the data by day
df_daily_engagement = df_new.groupby(pd.Grouper(key='created_at', freq='D')).sum(numeric_only=True)

# Plotting the data on a line chart
plt.figure(figsize=(20, 6))
plt.plot(df_daily_engagement.index, df_daily_engagement['like_count'] + df_daily_engagement['quote_count'] + df_daily_engagement['reply_count'] + df_daily_engagement['retweet_count'], color='#E8D5A1')
plt.title('Engagement Over Time')
plt.xlabel('Date')
plt.ylabel('Engagement per Day')
plt.show()




---



### **Word Cloud: Positive Tweets**

In [None]:
# Creating a new data frame that shows the 30 most common words in the 'final_tweet' column of 'eda'. 

eda=df_new.copy()

eda['temp_list'] = eda['final_tweet'].apply(lambda x:str(x).split())
top = Counter([item for sublist in eda['temp_list'] for item in sublist])
temp = pd.DataFrame(top.most_common(30))
temp.columns = ['Common_words','count']

# Creating the word cloud using text from positive-sentiment tweets.

plt.figure(figsize=(15,10))
text=' '.join(eda[eda['sentiment_type']=='POSITIVE']['final_tweet'])
wordcloud=WordCloud(max_words=100, background_color='white', colormap='Greens').generate(text)
plt.imshow(wordcloud)
plt.axis('off')
plt.title('Wordcloud: Positive Tweets')
plt.show()



---



### **Word Cloud: Negative Tweets**

In [None]:
# Creating the word cloud using text from negative-sentiment tweets.

plt.figure(figsize=(15,10))
text=' '.join(eda[eda['sentiment_type']=='NEGATIVE']['final_tweet'])
wordcloud=WordCloud(max_words=100,background_color='white',colormap='Reds').generate(text)
plt.imshow(wordcloud)
plt.axis('off')
plt.title('Wordcloud: Negative Tweets')
plt.show()



---



### **Topic Analysis: Use-cases**




In [None]:
# Defining and labelling the keyword groups for analysis. 
use_cases = {'Programming': ['python', 'programming', 'coding', 'code', 'syntax', 'debug', 'debugging', 'markdown', 'error-check', 'shinyapps', 'react', 'ruby on rails'],
                  'Writing': ['writing', 'essay', 'assignment', 'write', 'stories', 'story', 'post', 'blog', 'article'],
                  'Communication': ['text', 'message', 'email', 'letter', 'memo', 'meeting'],
                  'Research': ['research', 'search', 'wiki', 'wikipedia', 'paper', 'dissertation', 'thesis'],
                  'Creativity': ['idea', 'ideas', 'poem', 'haiku', 'song', 'episode', 'rhyme', 'lyrics', 'art', 'portrait', 'rap'],
                  'Education': ['exam', 'assignment', 'essay', 'tutor', 'homework', 'undergraduate', 'student', 'teacher', 'quiz', 'papers']}

# Creating a dictionary to store the count of tweets for each group.
use_cases_counts = {}

# Looping over the keyword groups and count the number of tweets that contain any keyword in the group.
for group_label, group_keywords in use_cases.items():
    mask = df['tweet'].str.contains('|'.join(group_keywords), case=False)
    use_cases_counts[group_label] = mask.sum()

# Calculating the total number of tweets in the dataset.
total_tweets = len(df)

# Looping over the keyword groups and calculate the percentage of tweets that contain any keyword in the group.
for group_label, group_count in use_cases_counts.items():
    group_percentage = group_count / total_tweets * 100
    print(f'The "{group_label}" group appears in {group_percentage:.2f}% of the tweets.')

<p>





---



### **Topic Analysis: Use-cases > Bar chart**

In [None]:
# Creating a list of the percentage of tweets that contain each keyword group.
use_cases_percentages = [(group_label, group_count / total_tweets * 100) for group_label, group_count in use_cases_counts.items()]

# Sorting the list by percentage in descending order.
use_cases_percentages.sort(key=lambda x: x[1], reverse=True)

# Extracting the group labels and percentages into separate lists.
group_labels = [x[0] for x in use_cases_percentages]
use_cases_percentages = [x[1] for x in use_cases_percentages]

# Creating a bar chart of the group percentages.
sns.barplot(x=group_labels, y=use_cases_percentages, palette='Set2')
plt.title('Percentage of Tweets by Keyword Group', fontsize=12)
plt.xlabel('Keyword Group', fontsize=10)
plt.ylabel('Percentage of Tweets', fontsize=10)
plt.xticks(fontsize=8) #change x-axis tick label size
plt.yticks(fontsize=8) #change y-axis tick label size
plt.show()

In [None]:
#Exporting the output for visualisation.

# Define the file name for the CSV output
filename = "use_cases_group_counts.csv"

# Open the CSV file and write the header row
with open(filename, "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["Keyword Group", "Tweet Count"])

    # Loop through each group and write the label and count to the CSV
    for group, count in use_cases_counts.items():
        writer.writerow([group, count])



---



### **Topic Sentiment Analysis: Programming**

In [None]:
# Defining the list of keywords for the 'Programming' category. 
programming_keywords = ['python', 'programming', 'coding', 'code', 'syntax', 'debug', 'debugging', 'markdown', 'error-check', 'shinyapps', 'react', 'ruby on rails']

# Creating a new dataframe with tweets containing the keywords. 
programming = df_new[df_new['final_tweet'].str.contains('|'.join(programming_keywords))]

# Initializing the SentimentIntensityAnalyzer from NLTK.
sid = SentimentIntensityAnalyzer()

# Applying a sentiment classification to each of the keywords in the group.  
programming.loc[programming.sentiment_compound_polarity>0,'sentiment_type']='POSITIVE'
programming.loc[programming.sentiment_compound_polarity==0,'sentiment_type']='NEUTRAL'
programming.loc[programming.sentiment_compound_polarity<0,'sentiment_type']='NEGATIVE'

programming.head()

<p>



In [None]:
#Calculating the mean sentiment for the 'Programming' keyword group.

# group the rows by sentiment type and calculate the mean sentiment score for each group
mean_sentiment = programming.groupby('sentiment_type')['sentiment_compound_polarity'].mean()

print(mean_sentiment)

<p>



In [None]:
# Exporting the output to for data visualisation.

programming.to_csv('programming.csv', index=False)



---



### **Sentiment bar chart: Programming**

In [None]:
colors = ["#90CECC", "#E8D5A1", "#E3BAB3"]

sns.barplot(data=programming,x=programming['sentiment_type'].value_counts().index,y=programming['sentiment_type'].value_counts(), palette=colors)

plt.title('Use-case: Progamming', fontsize=12)



---



### **Topic Sentiment Analysis: Writing**

In [None]:
# Defining the list of keywords for the 'Writing' category. 
writing_keywords = ['writing', 'essay', 'assignment', 'write', 'stories', 'story', 'post', 'blog', 'article']

# Creating a new dataframe with tweets containing the keywords. 
writing = df_new[df_new['final_tweet'].str.contains('|'.join(writing_keywords))]

# Initializing the SentimentIntensityAnalyzer from NLTK.
sid = SentimentIntensityAnalyzer()

# Applying a sentiment classification to each of the keywords in the group.  
writing.loc[writing.sentiment_compound_polarity>0,'sentiment_type']='POSITIVE'
writing.loc[writing.sentiment_compound_polarity==0,'sentiment_type']='NEUTRAL'
writing.loc[writing.sentiment_compound_polarity<0,'sentiment_type']='NEGATIVE'

writing.head()

<p>



In [None]:
#Calculating the mean sentiment for the 'Writing' keyword group.

# group the rows by sentiment type and calculate the mean sentiment score for each group
mean_sentiment = writing.groupby('sentiment_type')['sentiment_compound_polarity'].mean()

print(mean_sentiment)

<p>



In [None]:
# Exporting the output to for data visualisation.

writing.to_csv('writing.csv', index=False)



---



### **Sentiment bar chart: Writing**

In [None]:
colors = ["#90CECC", "#E8D5A1", "#E3BAB3"]

sns.barplot(data=writing,x=writing['sentiment_type'].value_counts().index,y=writing['sentiment_type'].value_counts(), palette=colors)

plt.title('Use-case: Writing', fontsize=12)



---



### **Topic Sentiment Analysis: Communication**

In [None]:
# Defining the list of keywords for the 'Communication' category. 
communication_keywords = ['text', 'message', 'email', 'letter', 'memo', 'meeting']

# Creating a new dataframe with tweets containing the keywords. 
communication = df_new[df_new['final_tweet'].str.contains('|'.join(communication_keywords))]

# Initializing the SentimentIntensityAnalyzer from NLTK.
sid = SentimentIntensityAnalyzer()

# Applying a sentiment classification to each of the keywords in the group.  
communication.loc[communication.sentiment_compound_polarity>0,'sentiment_type']='POSITIVE'
communication.loc[communication.sentiment_compound_polarity==0,'sentiment_type']='NEUTRAL'
communication.loc[communication.sentiment_compound_polarity<0,'sentiment_type']='NEGATIVE'

communication.head()

<p>



In [None]:
#Calculating the mean sentiment for the 'Communication' keyword group.

# group the rows by sentiment type and calculate the mean sentiment score for each group
mean_sentiment = communication.groupby('sentiment_type')['sentiment_compound_polarity'].mean()

print(mean_sentiment)

<p>



In [None]:
# Exporting the output to for data visualisation.

communication.to_csv('communication.csv', index=False)



---



### **Sentiment bar chart: Communication**

In [None]:
colors = ["#90CECC", "#E8D5A1", "#E3BAB3"]

sns.barplot(data=communication,x=communication['sentiment_type'].value_counts().index,y=communication['sentiment_type'].value_counts(), palette=colors)

plt.title('Use-case: Communication', fontsize=12)



---



### **Topic Sentiment Analysis: Research**

In [None]:
# Defining the list of keywords for the 'Research' category. 
research_keywords = ['research', 'search', 'wiki', 'wikipedia', 'paper', 'dissertation', 'thesis']

# Creating a new dataframe with tweets containing the keywords. 
research = df_new[df_new['final_tweet'].str.contains('|'.join(research_keywords))]

# Initializing the SentimentIntensityAnalyzer from NLTK.
sid = SentimentIntensityAnalyzer()

# Applying a sentiment classification to each of the keywords in the group.  
research.loc[research.sentiment_compound_polarity>0,'sentiment_type']='POSITIVE'
research.loc[research.sentiment_compound_polarity==0,'sentiment_type']='NEUTRAL'
research.loc[research.sentiment_compound_polarity<0,'sentiment_type']='NEGATIVE'

research.head()

<p>



In [None]:
#Calculating the mean sentiment for the 'Research' keyword group.

# group the rows by sentiment type and calculate the mean sentiment score for each group
mean_sentiment = research.groupby('sentiment_type')['sentiment_compound_polarity'].mean()

print(mean_sentiment)

<p>



In [None]:
# Exporting the output to for data visualisation.

research.to_csv('research.csv', index=False)



---



### **Sentiment bar chart: Research**

In [None]:
colors = ["#90CECC", "#E8D5A1", "#E3BAB3"]

sns.barplot(data=research,x=research['sentiment_type'].value_counts().index,y=research['sentiment_type'].value_counts(), palette=colors)

plt.title('Use-case: Research', fontsize=12)



---



### **Topic Sentiment Analysis: Creativity**



In [None]:
# Defining the list of keywords for the 'Creativity' category. 
creativity_keywords = ['idea', 'ideas', 'poem', 'haiku', 'song', 'episode', 'rhyme', 'lyrics', 'art', 'portrait', 'rap']

# Creating a new dataframe with tweets containing the keywords. 
creativity = df_new[df_new['final_tweet'].str.contains('|'.join(creativity_keywords))]

# Initializing the SentimentIntensityAnalyzer from NLTK.
sid = SentimentIntensityAnalyzer()

# Applying a sentiment classification to each of the keywords in the group.  
creativity.loc[creativity.sentiment_compound_polarity>0,'sentiment_type']='POSITIVE'
creativity.loc[creativity.sentiment_compound_polarity==0,'sentiment_type']='NEUTRAL'
creativity.loc[creativity.sentiment_compound_polarity<0,'sentiment_type']='NEGATIVE'

creativity.head()

<p>



In [None]:
#Calculating the mean sentiment for the 'Creativity' keyword group.

# group the rows by sentiment type and calculate the mean sentiment score for each group
mean_sentiment = creativity.groupby('sentiment_type')['sentiment_compound_polarity'].mean()

print(mean_sentiment)

<p>



In [None]:
# Exporting the output to for data visualisation.

creativity.to_csv('creativity.csv', index=False)



---



### **Sentiment bar chart: Creativity**

In [None]:
colors = ["#90CECC", "#E8D5A1", "#E3BAB3"]

sns.barplot(data=creativity,x=creativity['sentiment_type'].value_counts().index,y=creativity['sentiment_type'].value_counts(),palette=colors)

plt.title('Use-case: Creativity', fontsize=12)



---



### **Topic Sentiment Analysis: Education**

In [None]:
# Defining the list of keywords for the 'Education' category. 
education_keywords = ['exam', 'assignment', 'essay', 'tutor', 'homework', 'undergraduate', 'student', 'teacher', 'quiz', 'papers']

# Creating a new dataframe with tweets containing the keywords. 
education = df_new[df_new['final_tweet'].str.contains('|'.join(education_keywords))]

# Initializing the SentimentIntensityAnalyzer from NLTK.
sid = SentimentIntensityAnalyzer()

# Applying a sentiment classification to each of the keywords in the group.  
education.loc[education.sentiment_compound_polarity>0,'sentiment_type']='POSITIVE'
education.loc[education.sentiment_compound_polarity==0,'sentiment_type']='NEUTRAL'
education.loc[education.sentiment_compound_polarity<0,'sentiment_type']='NEGATIVE'

education.head()


<p>



In [None]:
#Calculating the mean sentiment for the 'Education' keyword group.

# group the rows by sentiment type and calculate the mean sentiment score for each group
mean_sentiment = education.groupby('sentiment_type')['sentiment_compound_polarity'].mean()

print(mean_sentiment)

<p>



In [None]:
# Exporting the output to for data visualisation.

education.to_csv('education.csv', index=False)



---



### **Sentiment bar chart: Education**

In [None]:
colors = ["#90CECC", "#E3BAB3", "#E8D5A1"]

sns.barplot(data=education,x=education['sentiment_type'].value_counts().index,y=education['sentiment_type'].value_counts(),palette=colors)

plt.title('Use-case: Education', fontsize=12)



---



### **Topic Analysis: Common issues**

In [None]:
# Defining and labelling the keyword groups for analysis. 
common_issues = {'Hallucinations': ['hallucination', 'hallucinate', 'unfactual', 'incorrect', 'inaccurate', 'unverified'],
                  'Ethical implications': ['ethics', 'bias', 'ethical', 'plagiarism', 'copyright'],
                  'Disinformation': ['disinformation', 'misinformation', 'libel', 'slander', 'propaganda'],
                  'Abusive Language': ['racism', 'racist', 'misogyny', 'sexism', 'sexist', 'hate speech', 'abusive language', 'anti-semitism']}

# Creating a dictionary to store the count of tweets for each group.
common_issues_counts = {}

# Looping over the keyword groups and counting the number of tweets that contain any keyword in the group.
for group_label, group_keywords in common_issues.items():
    mask = df['tweet'].str.contains('|'.join(group_keywords), case=False)
    common_issues_counts[group_label] = mask.sum()

# Calculating the total number of tweets in the dataset.
total_tweets = len(df)

# Looping over the keyword groups and calculate the percentage of tweets that contain any keyword in the group.
for group_label, group_count in common_issues_counts.items():
    group_percentage = group_count / total_tweets * 100
    print(f'The "{group_label}" group appears in {group_percentage:.2f}% of the tweets.')

<p>





---



### **Topic Analysis: Common issues > Bar chart**

In [None]:
# Creating a list of the percentage of tweets that contain each keyword group.
common_issues_percentages = [(group_label, group_count / total_tweets * 100) for group_label, group_count in group_counts.items()]

# Sorting the list by percentage in descending order.
common_issues_percentages.sort(key=lambda x: x[1], reverse=True)

# Extracting the group labels and percentages into separate lists.
group_labels = [x[0] for x in common_issues_percentages]
common_issues_percentages = [x[1] for x in common_issues_percentages]

# Creating a bar chart of the group percentages.
sns.barplot(x=group_labels, y=common_issues_percentages, palette='Set2')
plt.title('Percentage of Tweets by Keyword Group', fontsize=12)
plt.xlabel('Keyword Group', fontsize=10)
plt.ylabel('Percentage of Tweets', fontsize=10)
plt.xticks(fontsize=8) #change x-axis tick label size
plt.yticks(fontsize=8) #change y-axis tick label size
plt.show()



---



### **Topic Sentiment Analysis: Hallucinations**



In [None]:
# Defining the list of keywords for the 'Hallucinations' category. 
hallucination_keywords = ['hallucination', 'hallucinate', 'unfactual', 'incorrect', 'inaccurate', 'unverified']

# Creating a new dataframe with tweets containing the keywords. 
hallucinations = df_new[df_new['final_tweet'].str.contains('|'.join(hallucination_keywords))]

# Initializing the SentimentIntensityAnalyzer from NLTK.
sid = SentimentIntensityAnalyzer()

# Applying a sentiment classification to each of the keywords in the group.  
hallucinations.loc[hallucinations.sentiment_compound_polarity>0,'sentiment_type']='POSITIVE'
hallucinations.loc[hallucinations.sentiment_compound_polarity==0,'sentiment_type']='NEUTRAL'
hallucinations.loc[hallucinations.sentiment_compound_polarity<0,'sentiment_type']='NEGATIVE'

hallucinations.head()

<p>



In [None]:
#Calculating the mean sentiment for the 'Hallucinations' keyword group.

# group the rows by sentiment type and calculate the mean sentiment score for each group
mean_sentiment = hallucinations.groupby('sentiment_type')['sentiment_compound_polarity'].mean()

print(mean_sentiment)

<p>



In [None]:
# Exporting the output to for data visualisation.

hallucinations.to_csv('hallucinations.csv', index=False)



---



### **Sentiment bar chart: Hallucinations**

In [None]:
colors = ["#90CECC", "#E3BAB3", "#E8D5A1"]

sns.barplot(data=hallucinations,x=hallucinations['sentiment_type'].value_counts().index,y=hallucinations['sentiment_type'].value_counts(),palette=colors)

plt.title('Common issues: Hallucinations', fontsize=12)



---



### **Topic Sentiment Analysis: Ethical concerns**



In [None]:
# Defining the list of keywords for the 'Ethical concerns' category. 
ethics_keywords = ['ethics', 'bias', 'ethical', 'plagiarism', 'copyright']

# Creating a new dataframe with tweets containing the keywords. 
ethics = df_new[df_new['final_tweet'].str.contains('|'.join(ethics_keywords))]

# Initializing the SentimentIntensityAnalyzer from NLTK.
sid = SentimentIntensityAnalyzer()

# Applying a sentiment classification to each of the keywords in the group.  
ethics.loc[ethics.sentiment_compound_polarity>0,'sentiment_type']='POSITIVE'
ethics.loc[ethics.sentiment_compound_polarity==0,'sentiment_type']='NEUTRAL'
ethics.loc[ethics.sentiment_compound_polarity<0,'sentiment_type']='NEGATIVE'

ethics.head()

<p>



In [None]:
#Calculating the mean sentiment for the 'Ethical concerns' keyword group.

# group the rows by sentiment type and calculate the mean sentiment score for each group
mean_sentiment = ethics.groupby('sentiment_type')['sentiment_compound_polarity'].mean()

print(mean_sentiment)

<p>



In [None]:
# Exporting the output to for data visualisation.

ethics.to_csv('ethics.csv', index=False)




---



### **Sentiment bar chart: Ethical concerns**

In [None]:
colors = ["#90CECC", "#E3BAB3", "#E8D5A1"]

sns.barplot(data=ethics,x=ethics['sentiment_type'].value_counts().index,y=ethics['sentiment_type'].value_counts(),palette=colors)

plt.title('Common issues: Ethical concerns', fontsize=12)



---



### **Topic Sentiment Analysis: Disinformation**



In [None]:
# Defining the list of keywords for the 'Disinformation' category. 
disinformation_keywords = ['disinformation', 'misinformation', 'libel', 'slander', 'propaganda']

# Creating a new dataframe with tweets containing the keywords. 
disinformation = df_new[df_new['final_tweet'].str.contains('|'.join(disinformation_keywords))]

# Initializing the SentimentIntensityAnalyzer from NLTK.
sid = SentimentIntensityAnalyzer()

# Applying a sentiment classification to each of the keywords in the group.  
disinformation.loc[disinformation.sentiment_compound_polarity>0,'sentiment_type']='POSITIVE'
disinformation.loc[disinformation.sentiment_compound_polarity==0,'sentiment_type']='NEUTRAL'
disinformation.loc[disinformation.sentiment_compound_polarity<0,'sentiment_type']='NEGATIVE'

disinformation.head()

<p>



In [None]:
#Calculating the mean sentiment for the 'Disinformation' keyword group.

# group the rows by sentiment type and calculate the mean sentiment score for each group
mean_sentiment = disinformation.groupby('sentiment_type')['sentiment_compound_polarity'].mean()

print(mean_sentiment)

<p>



In [None]:
# Exporting the output to for data visualisation.

disinformation.to_csv('disinformation.csv', index=False)




---



### **Sentiment bar chart: Disinformation**

In [None]:
colors = ["#E3BAB3", "#90CECC", "#E8D5A1"]

sns.barplot(data=disinformation,x=disinformation['sentiment_type'].value_counts().index,y=disinformation['sentiment_type'].value_counts(),palette=colors)

plt.title('Common issues: Disinformation', fontsize=12)



---



### **Topic Sentiment Analysis: Abusive Language**

In [None]:
# Defining the list of keywords for the 'Abusive Language' category. 
language_keywords = ['racism', 'racist', 'misogyny', 'sexism', 'sexist', 'hate speech', 'abusive language', 'anti-semitism']

# Creating a new dataframe with tweets containing the keywords. 
abusivelang = df_new[df_new['final_tweet'].str.contains('|'.join(language_keywords))]

# Initializing the SentimentIntensityAnalyzer from NLTK.
sid = SentimentIntensityAnalyzer()

# Applying a sentiment classification to each of the keywords in the group.  
abusivelang.loc[abusivelang.sentiment_compound_polarity>0,'sentiment_type']='POSITIVE'
abusivelang.loc[abusivelang.sentiment_compound_polarity==0,'sentiment_type']='NEUTRAL'
abusivelang.loc[abusivelang.sentiment_compound_polarity<0,'sentiment_type']='NEGATIVE'

abusivelang.head()

<p>



In [None]:
#Calculating the mean sentiment for the 'Abusive Language' keyword group.

# group the rows by sentiment type and calculate the mean sentiment score for each group
mean_sentiment = abusivelang.groupby('sentiment_type')['sentiment_compound_polarity'].mean()

print(mean_sentiment)

<p>



In [None]:
# Exporting the output to for data visualisation.

abusivelang.to_csv('abusivelang.csv', index=False)




---



### **Sentiment bar chart: Abusive Language**

In [None]:
colors = ["#E3BAB3", "#90CECC", "#E8D5A1"]

sns.barplot(data=abusivelang,x=abusivelang['sentiment_type'].value_counts().index,y=abusivelang['sentiment_type'].value_counts(),palette=colors)

plt.title('Common issues: Abusive Language', fontsize=12)



---



### **Topline Engagement Metrics: Calculations for visualisation**

In [None]:
#Calculating the total like count
df_new.like_count.sum()

In [None]:
#Calculating the total quote count
df_new.quote_count.sum()

In [None]:
#Calculating the total reply count
df_new.reply_count.sum()

In [None]:
#Calculating the total retweet count
df_new.retweet_count.sum()



---

