In [205]:
import pandas as pd
import networkx as nx
import random
import ast
from nltk.tokenize import word_tokenize
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from community.community_louvain import best_partition
from textblob import TextBlob
import community

## step 1: extract data

In [206]:
# def reduce_dataset(input_file, output_file, target_size=500):
#     # Load the entire dataset
#     df = pd.read_csv(input_file)

#     # Sample a subset of the dataset
#     sampled_df = df.sample(n=min(target_size, len(df)))

#     # Save the reduced dataset to a new CSV file
#     sampled_df.to_csv(output_file, index=False)

# if __name__ == "__main__":
#     # Load the training data
#     input_file = 'twitter_data.csv'
#     output_file = 'twitter_dataset.csv'
#     reduce_dataset(input_file, output_file)

df = pd.read_csv('twitter_dataset.csv')

## step 2: calculate centrality and importance criterion

In [207]:
G = nx.Graph()

for index, row in df.iterrows():
    ID = row['ID']
    friends_str = row['friends_username']
    friends_list = ast.literal_eval(friends_str)

    for friend in friends_list:
        if not G.has_node(friend):
            G.add_node(friend)
        G.add_edge(ID, friend)

# Calculate centrality measures for a subset of nodes
subset_nodes = random.sample(list(G.nodes()), min(500, len(G)))  # Select 500 random nodes or all nodes if the graph is smaller
subset_graph = G.subgraph(subset_nodes)

# Degree centrality
degree_centrality = nx.degree_centrality(subset_graph)
top_degree_nodes = sorted(degree_centrality, key=degree_centrality.get, reverse=True)[:5]
print("Nodes with the highest degree centrality: ", top_degree_nodes)

# Betweenness centrality
betweenness_centrality = nx.betweenness_centrality(subset_graph)
top_betweenness_nodes = sorted(betweenness_centrality, key=betweenness_centrality.get, reverse=True)[:5]
print("Nodes with the highest betweenness centrality: ", top_betweenness_nodes)

# Closeness centrality
closeness_centrality = nx.closeness_centrality(subset_graph)
top_closeness_nodes = sorted(closeness_centrality, key=closeness_centrality.get, reverse=True)[:5]
print("Nodes with the highest closeness centrality: ", top_closeness_nodes)

# Eigenvector centrality
eigenvector_centrality = nx.eigenvector_centrality(subset_graph)
top_eigenvector_nodes = sorted(eigenvector_centrality, key=eigenvector_centrality.get, reverse=True)[:5]
print("Nodes with the highest eigenvector centrality: ", top_eigenvector_nodes)

# PageRank
pagerank = nx.pagerank(subset_graph)
top_pagerank_nodes = sorted(pagerank, key=pagerank.get, reverse=True)[:5]
print("Nodes with the highest PageRank: ", top_pagerank_nodes)

Nodes with the highest degree centrality:  [64098, 16495, 4756, 57030, 24386]
Nodes with the highest betweenness centrality:  [64098, 36866, 43013, 63494, 22537]
Nodes with the highest closeness centrality:  [64098, 16495, 4756, 57030, 24386]
Nodes with the highest eigenvector centrality:  [64098, 16495, 4756, 57030, 24386]
Nodes with the highest PageRank:  [64098, 16495, 4756, 57030, 24386]


## step 3-1: Preprocess and tokenize the tweets and calculate top 25 words

In [208]:
# Preprocessing function to remove special characters and convert to lowercase
def preprocess_text(text):
    cleaned_text = re.sub(r"http\S+|[^a-zA-Z\s]", "", text)
    cleaned_text = cleaned_text.lower()
    return cleaned_text

# Tokenization function
def tokenize_text(text):
    tokens = word_tokenize(text)
    return tokens

# Apply preprocessing and tokenization to each tweet
df["cleaned_tweet"] = df["Tweet"].apply(preprocess_text)
df["tokens"] = df["cleaned_tweet"].apply(tokenize_text)

# Concatenate all tokens into a single list
all_tokens = [token for tweet_tokens in df["tokens"] for token in tweet_tokens]

# Remove stopwords from the list of tokens
stop_words = set(stopwords.words('english'))
filtered_tokens = [token for token in all_tokens if token.lower() not in stop_words]

# Calculate word frequencies
fdist = FreqDist(filtered_tokens)

# Get the 25 most common words
most_common_words = fdist.most_common(25)

most_common_words

[('chatgpt', 459),
 ('gpt', 100),
 ('ai', 94),
 ('chat', 87),
 ('like', 51),
 ('im', 30),
 ('one', 29),
 ('use', 26),
 ('get', 26),
 ('using', 25),
 ('new', 25),
 ('people', 24),
 ('ask', 24),
 ('amp', 24),
 ('openai', 21),
 ('see', 20),
 ('write', 20),
 ('good', 19),
 ('time', 19),
 ('google', 19),
 ('even', 18),
 ('think', 18),
 ('data', 17),
 ('could', 17),
 ('thing', 16)]

## step 3-2: semantic of words

In [209]:
# Download the required resources (run only once)
nltk.download('vader_lexicon')

# Create an instance of the SentimentIntensityAnalyzer class
sia = SentimentIntensityAnalyzer()

# Function to calculate the sentiment label for a word
def get_sentiment_label(word):
    sentiment_scores = sia.polarity_scores(word)
    compound_score = sentiment_scores['compound']
    if compound_score > 0:
        return "positive"
    elif compound_score < 0:
        return "negative"
    else:
        return "neutral"

# Apply the get_sentiment_label function to each word in the dataset
df["sentiment_label"] = df["tokens"].apply(lambda tokens: [get_sentiment_label(word) for word in tokens])

# Count the number of negative, positive, and neutral words
negative_count = sum(df["sentiment_label"].apply(lambda labels: labels.count("negative")))
positive_count = sum(df["sentiment_label"].apply(lambda labels: labels.count("positive")))
neutral_count = sum(df["sentiment_label"].apply(lambda labels: labels.count("neutral")))

print("word semantic count in all of the tweets:")
print("Negative words:", negative_count)
print("Positive words:", positive_count)
print("Neutral words:", neutral_count)

word semantic count in all of the tweets:
Negative words: 215
Positive words: 542
Neutral words: 9532


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


## step 3-3: semantic of words for top 25 words

In [210]:
# Download the required resources (run only once)
nltk.download('vader_lexicon')

# Create an instance of the SentimentIntensityAnalyzer class
sia = SentimentIntensityAnalyzer()

# Function to calculate the sentiment label for a word
def get_sentiment_label(word):
    sentiment_scores = sia.polarity_scores(word)
    compound_score = sentiment_scores['compound']
    if compound_score > 0:
        return "positive"
    elif compound_score < 0:
        return "negative"
    else:
        return "neutral"

# Apply the get_sentiment_label function to each word in the most common words
most_common_sentiments = []
for word, frequency in most_common_words:
    sentiment_label = get_sentiment_label(word)
    most_common_sentiments.append((word, frequency, sentiment_label))

# Count the number of negative, positive, and neutral words among the most common words
negative_count = sum(1 for _, _, sentiment in most_common_sentiments if sentiment == "negative")
positive_count = sum(1 for _, _, sentiment in most_common_sentiments if sentiment == "positive")
neutral_count = sum(1 for _, _, sentiment in most_common_sentiments if sentiment == "neutral")

# Print the words in each sentiment group
negative_words = [word for word, _, sentiment in most_common_sentiments if sentiment == "negative"]
positive_words = [word for word, _, sentiment in most_common_sentiments if sentiment == "positive"]
neutral_words = [word for word, _, sentiment in most_common_sentiments if sentiment == "neutral"]

print("word semantic count in the most common words:")
print("Negative words:", negative_count)
print(negative_words)
print()
print("Positive words:", positive_count)
print(positive_words)
print()
print("Neutral words:", neutral_count)
print(neutral_words)

word semantic count in the most common words:
Negative words: 0
[]

Positive words: 2
['like', 'good']

Neutral words: 23
['chatgpt', 'gpt', 'ai', 'chat', 'im', 'one', 'use', 'get', 'using', 'new', 'people', 'ask', 'amp', 'openai', 'see', 'write', 'time', 'google', 'even', 'think', 'data', 'could', 'thing']


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


## step 4-1: identifying 20 active users

In [211]:
# Calculate the total engagement for each user
df['TotalEngagement'] = df['QuoteCount'] + df['LikeCount'] + df['RetweetCount'] + df['ReplyCount']

# Sort the DataFrame by total engagement in descending order
df_sorted = df.sort_values(by='TotalEngagement', ascending=False)

# Filter the DataFrame for TotalEngagement greater than 0
active_users = df[df['TotalEngagement'] > 20]

# Extract the Username and ID columns into active_users
active_users = active_users[['Username', 'ID', 'TotalEngagement']]

print(active_users)

       Username     ID  TotalEngagement
15   kayng83346  44906               81
34   nfswv10740  23324               29
75   vxlju50938  25896              201
102  xdapa81984   1608               23
152  qyizz78556  39400               27
202  ivgbl54788  45970               54
203  szmnv56075  16395               21
234  tfpgz03850  24602               55
267  pnrqu00608  54532              206
341  oezep61822  29930               31
363  yjquz01947  56157             1316
368  gwbxx00775  51742               29
370  zduap99284  23643               21
379  nvrac18294  56661              544
382  obpnw19461   7212               34
447  ghshu68007  14821               21
478  mxmzm77440  17473               34
488  vnoec46240  33586               29
499  eihwf92101  38765               21


## step 4-2: identifying 15 top active users and their influence on the network

In [212]:
# Calculate the content impression for each user
df_sorted['ContentImpression'] = df_sorted['TotalEngagement'] * df_sorted['user_friends']

# Select the top 15 active users
top_users = df_sorted.head(15)

# Print the content impression of each user and the total content impression
print("Content Impression of Top 15 Active Users:")
print(top_users[['Username', 'ID', 'ContentImpression']])

Content Impression of Top 15 Active Users:
       Username     ID  ContentImpression
363  yjquz01947  56157              69748
379  nvrac18294  56661              17408
267  pnrqu00608  54532              47998
75   vxlju50938  25896              92259
15   kayng83346  44906              11016
234  tfpgz03850  24602               6380
202  ivgbl54788  45970               9936
478  mxmzm77440  17473               1326
382  obpnw19461   7212              11050
341  oezep61822  29930              14787
488  vnoec46240  33586                377
34   nfswv10740  23324               6467
368  gwbxx00775  51742                464
152  qyizz78556  39400              10989
102  xdapa81984   1608               4922


## step 5: identifying active communities

In [213]:
# Convert the graph to undirected if needed
G = G.to_undirected()

# Run the Louvain algorithm
partition = best_partition(G)

# Get the number of communities
num_communities = max(partition.values()) + 1

# Print the communities
for community_id in range(num_communities):
    nodes_in_community = [node for node, comm_id in partition.items() if comm_id == community_id]
    print(f"Community {community_id}: {nodes_in_community}")

Community 0: [37502, 26665, 58442, 48646, 17017, 46948, 30435, 4715, 49402, 59605, 34202, 43674, 61584, 4332, 20689, 59580, 37411, 3905, 65410, 17164, 53704, 59588, 34731, 63950, 10911, 10907, 23641, 17464, 24128, 48836, 20354, 2021, 23997, 19776, 50538, 28215, 20030, 6732, 57493, 46910, 4905, 12662, 30229, 14337, 2661, 27812, 50146, 36600, 29104, 10832, 10396, 58439, 45392, 18071, 59674, 51978, 63493, 62962, 15536, 18868, 26866, 61944, 3071, 22172, 45080, 13595, 27667, 6265, 36868, 29758, 20500, 46081, 59606, 53671, 35031, 1783, 1384, 38303, 31396, 45777, 14664, 26065, 20286, 12780, 49883, 58850, 40989, 33124, 27832, 9750, 33505, 10177, 52273, 8488, 13108, 34422, 24531, 62743, 31246, 36795, 21469, 41368, 59250, 63500, 62195, 377, 16280, 49352, 29001, 43590, 34585, 22702, 38109, 4064, 19554, 43765, 10508, 34223, 36189, 62287, 17006, 58040, 24690, 11398, 4825, 62662, 54812, 1692, 13255, 12456, 61056, 45897, 58816, 49264, 38229, 40015, 55628, 51322, 9697, 56015, 39242, 41828, 64707, 1422

## optional step-2: polarity of the network 

In [214]:
# Iterate over each tweet
polarity_list = []
for tweet in df['Tweet']:
    # Calculate polarity
    blob = TextBlob(tweet)
    polarity = blob.sentiment.polarity
    
    # Append polarity to the list
    polarity_list.append(polarity)

# Calculate overall polarity
overall_polarity = sum(polarity_list) / len(polarity_list)

# Print overall polarity
print("Overall Polarity:", overall_polarity)

Overall Polarity: 0.10818496035361659


## optional step-2: overall feeling of the network 

In [215]:
# Define the mapping dictionary
sentiment_mapping = {'negative': -1, 'neutral': 0, 'positive': 1}

# Convert 'sentiment_label' column to strings
df['sentiment_label'] = df['sentiment_label'].astype(str)

# Calculate the sentiment score for each row, handling missing values
df['sentiment_score'] = df['sentiment_label'].map(sentiment_mapping).fillna(0)

# Calculate the overall sentiment score
overall_sentiment_score = df['sentiment_score'].mean()

# Determine the overall feeling based on the sentiment score
if overall_sentiment_score < 0:
    overall_feeling = 'Negative'
elif overall_sentiment_score > 0:
    overall_feeling = 'Positive'
else:
    overall_feeling = 'Neutral'

print("Overall feeling: {}".format(overall_feeling))

Overall feeling: Neutral


## optional step-3: number of clusters in the network 

In [216]:
# Use the Louvain method to detect communities
partition = community.best_partition(G)

# Count the number of unique communities
num_clusters = len(set(partition.values()))

print("Number of clusters:", num_clusters)

Number of clusters: 83
