In [1]:
import json
import spacy
import pandas as pd
import plotly.express as px
import numpy as np

from collections import Counter
from collections import Counter

In [None]:
nlp = spacy.load("en_core_web_sm")

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [3]:
def read_jsonl_file(file_path):
  """Reads a JSONL file and returns a list of dictionaries."""
  data = []
  with open(file_path, 'r') as f:
    for line in f:
      try:
        data.append(json.loads(line))
      except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
  return data

# =============================
# 1️⃣ Load Dataset
# =============================
file_path = '/home/maryam/llamaPersonaResp/Original_Data/wikipedia.jsonl'
data = read_jsonl_file(file_path)
df = pd.DataFrame(data)
df["user"] = df["user"].astype(str)

In [4]:
data

[{'id': '524288',
  'root': '524288',
  'text': 'You should look at all of the point on the template not just the last one, the template also says the image belonging to the republic of macedonia is in the public domain if it being used for \\"information purposes\\". ',
  'user': 'Frightner',
  'meta': {'is-admin': False},
  'reply-to': None,
  'timestamp': '1.189190940E09'},
 {'id': '524289',
  'root': '524288',
  'text': 'Yes I agree. The law permits usage of documents, photographs and other materials for educational and informational purposes. There was a normative act issued by the government of the Republic of Macedonia that even allowed citizens to make photocopies or photograph rare archive materials. ',
  'user': 'Revizionist',
  'meta': {'is-admin': False},
  'reply-to': None,
  'timestamp': '1.189204860E09'},
 {'id': '1',
  'root': '1',
  'text': "Yes, that's good. Revathy's page looked very reliable, that's why we used that as a source. ",
  'user': 'Johannes003',
  'meta':

In [5]:
# =============================
# 2️⃣ Unique Number of Users
# =============================
user_ids = set(item['user'] for item in data)
print(f"Number of unique users: {len(user_ids)}")

Number of unique users: 38462


In [6]:
# =============================
# 3️⃣ Frequency of Turns per User
# =============================
user_message_counts = Counter(entry['user'] for entry in data)
print(user_message_counts)

users , message_counts = [], []
for user, count in user_message_counts.items():
    if count > 500:
      users.append(user)
      message_counts.append(count)

sorted_users, sorted_message_counts = zip(*sorted(zip(users, message_counts), key=lambda x: x[1], reverse=True))

g_2500, g_1500, g_2000, g_1000, g_500 = [], [], [], [], []
for user, count in zip(sorted_users,sorted_message_counts):
  if count > 2500:
    g_2500.append(user)
  elif count > 2000:
    g_2000.append(user)
  elif count > 1500:
    g_1500.append(user)
  elif count> 1000:
    g_1000.append(user)
  else:
    g_500.append(user)


y = [len(g_2500), len(g_2000), len(g_1500), len(g_1000),len(g_500)]
x = ["2500", "2000", "1500","1000","500"]

fig = px.bar(x=x, y=y, labels={'x': 'greater than', 'y': 'Message frequency'})
fig.show()

Counter({'Acalamari': 2743, 'Awadewit': 2315, 'Dave souza': 2069, 'Mattisse': 1892, 'SatuSuro': 1687, 'Orderinchaos': 1653, 'Carcharoth': 1648, 'wknight94': 1461, 'MONGO': 1414, 'GTBacchus': 1111, 'Ceoil': 1051, 'Royalbroil': 986, 'JHunterJ': 972, 'Modernist': 949, 'SandyGeorgia': 932, 'Stephan Schulz': 834, 'Baseball Bugs': 825, 'Kumioko': 791, '(aeropagitica)': 745, 'Fvasconcellos': 727, 'Splash': 721, 'tone': 716, 'AmiDaniel': 697, 'TexasAndroid': 676, 'The JPS': 674, 'Bongwarrior': 651, 'Paul August': 624, 'Erwin85Bot': 620, 'Sarah': 598, 'JoJan': 595, 'Malleus_Fatuorum': 574, 'Arcadian': 566, 'ElKevbo': 560, 'Viriditas': 558, 'Ktr101': 552, 'Kelly': 541, 'AnonEMouse': 537, 'Ckatz': 537, 'Dineshkannambadi': 527, 'Baseball_Bugs': 513, 'Nyttend': 511, 'Drmies': 508, 'Huaiwei': 504, 'CactusWriter': 504, 'WesleyDodds': 499, 'Sesshomaru': 498, 'Wehwalt': 496, 'Threeafterthree': 492, 'Moonriddengirl': 490, 'Tvoz': 490, 'LordAmeth': 487, 'The Haunted Angel': 476, 'The Rambling Man': 458, 

In [None]:
# =============================
# 4️⃣ Top 15 Active Users
# =============================
top_15_users = user_message_counts.most_common(10)
print("\nTop 15 Active Users:\n", top_15_users)         
users , message_counts = [], []
for item in top_15_users:
    users.append(item[0])
    message_counts.append(item[1])

fig = px.bar(x=users, y=message_counts, labels={'x': 'User', 'y': 'Message Count'}, title="Message Frequency Per User")
fig.show()

In [None]:
top_15_users = [user[0] for user in top_15_users]

In [None]:
from collections import Counter

# =============================
# 5️⃣ Extract Top 5 Most Common Nouns, Verbs, and Adjectives from Top 15 Users
# =============================
df_top_users = df[df["user"].isin(top_15_users)]

def clean_text(text):
    tokens = word_tokenize(text.lower())
    words = [word for word in tokens if word.isalnum()]
    words = [word for word in words if word not in stopwords.words("english")]
    return words



# Group messages by user
user_texts = df_top_users.groupby("user")["text"].apply(lambda x: " ".join(x)).reset_index()

# Initialize dicts
top_5_nouns = {}
top_5_verbs = {}
top_5_adjs  = {}

all_noun_counts = {}
all_verb_counts = {}
all_adj_counts  = {}

for _, row in user_texts.iterrows():
    user = row["user"]
    text = row["text"]

    doc = nlp(text)

    nouns = [token.lemma_.lower() for token in doc if token.pos_ == "NOUN" and token.is_alpha and not token.is_stop]
    verbs = [token.lemma_.lower() for token in doc if token.pos_ == "VERB" and token.is_alpha and not token.is_stop]
    adjs  = [token.lemma_.lower() for token in doc if token.pos_ == "ADJ"  and token.is_alpha and not token.is_stop]

    noun_counts = Counter(nouns)
    verb_counts = Counter(verbs)
    adj_counts  = Counter(adjs)

    top_5_nouns[user] = noun_counts.most_common(10)
    top_5_verbs[user] = verb_counts.most_common(10)
    top_5_adjs[user]  = adj_counts.most_common(10)

    all_noun_counts[user] = noun_counts
    all_verb_counts[user] = verb_counts
    all_adj_counts[user]  = adj_counts

# ✅ Now you have:
# - top_5_nouns, top_5_verbs, top_5_adjs: dicts of top 5 words for each POS per user
# - all_noun_counts, all_verb_counts, all_adj_counts: full counts

# Optional: print results
print(top_5_nouns)
print(top_5_verbs)
print(top_5_adjs)




In [None]:
def save_pos_tags_to_json(data, filename):
    with open(filename, 'w') as f:
        json.dump(data, f, indent=4)

save_pos_tags_to_json(top_5_nouns, '/content/drive/MyDrive/Perspective Paper/Original_Data/nouns.json')
save_pos_tags_to_json(top_5_verbs, '/content/drive/MyDrive/Perspective Paper/Original_Data/verbs.json')
save_pos_tags_to_json(top_5_adjs, '/content/drive/MyDrive/Perspective Paper/Original_Data/adjs.json')


In [None]:
def heatMap(zPrime,xPrime,yPrime,k,l,til,xtil,ytil):
    import plotly.offline as pyo
    import plotly.graph_objs as go
    from plotly.graph_objs import Bar , Layout
    trace = go.Heatmap(z=zPrime, x=xPrime, y=yPrime)
    data=[trace]
    pyo.iplot({ "data": data, "layout": Layout(title = til, xaxis = dict(title = xtil, dtick = k ),
                                                            yaxis = dict(title = ytil, dtick = l ),
                                                            font =dict(size = 15))})

In [None]:
def prepare_heatmap_data_for_nouns(top_5_nouns, all_noun_counts):
    yPrime = list(top_5_nouns.keys())  # Now 'yPrime' is the list of users

    all_nouns = set()
    for nouns in top_5_nouns.values():
        all_nouns.update([noun for noun, _ in nouns])

    xPrime = list(all_nouns)  # 'xPrime' is the list of words (nouns, adjectives, or verbs)

    zPrime = np.zeros((len(yPrime), len(xPrime)))  # Adjust the shape accordingly
    word_to_idx = {word: idx for idx, word in enumerate(xPrime)}  # Map words to indices

    for i, user in enumerate(yPrime):
        user_noun_counts = dict(top_5_nouns.get(user, []))
        for word, _ in top_5_nouns[user]:
            word_idx = word_to_idx[word]
            zPrime[i, word_idx] = user_noun_counts.get(word, 0)  # Fill the matrix correctly

        for word in xPrime:
            if word not in user_noun_counts:
                zPrime[i, word_to_idx[word]] = all_noun_counts[user].get(word, 0)  # Corrected index positions

    return zPrime, xPrime, yPrime

zPrime, xPrime, yPrime = prepare_heatmap_data_for_nouns(top_5_nouns, all_noun_counts)
heatMap(zPrime, xPrime, yPrime, k=1, l=1, til="Top Nouns for Users", xtil="Top 5 Nouns", ytil="Users")

zPrime, xPrime, yPrime = prepare_heatmap_data_for_nouns(top_5_adjs, all_adj_counts)
heatMap(zPrime, xPrime, yPrime, k=1, l=1, til="Top Adjs for Users", xtil="Top 5 Adj", ytil="Users")

zPrime, xPrime, yPrime = prepare_heatmap_data_for_nouns(top_5_verbs, all_verb_counts)
heatMap(zPrime, xPrime, yPrime, k=1, l=1, til="Top Verbs for Users", xtil="Top 5 Verbs", ytil="Users")


In [None]:
def prepare_heatmap_data_for_nouns(top_5_nouns, all_noun_counts):
    xPrime = list(top_5_nouns.keys())

    all_nouns = set()
    for nouns in top_5_nouns.values():
        all_nouns.update([noun for noun, _ in nouns])

    yPrime = list(all_nouns)

    zPrime = np.zeros((len(yPrime), len(xPrime)))
    word_to_idx = {word: idx for idx, word in enumerate(yPrime)}


    for i, user in enumerate(xPrime):
        user_noun_counts = dict(top_5_nouns.get(user, []))
        for word, _ in top_5_nouns[user]:
            word_idx = word_to_idx[word]
            zPrime[word_idx, i] = user_noun_counts.get(word, 0)

        for word in yPrime:
            if word not in user_noun_counts:
                zPrime[word_to_idx[word], i] = all_noun_counts[user].get(word, 0)

    return zPrime, xPrime, yPrime

zPrime, xPrime, yPrime = prepare_heatmap_data_for_nouns(top_5_nouns:dict, all_noun_counts:dict)
heatMap(zPrime, xPrime, yPrime, k=1, l=1, til="Top Nouns for Users", xtil="Users", ytil="Top 5 Nouns")

zPrime, xPrime, yPrime = prepare_heatmap_data_for_nouns(top_5_adjs, all_adj_counts)
heatMap(zPrime, xPrime, yPrime, k=1, l=1, til="Top Adj for Users", xtil="Users", ytil="Top 5 Adj")

zPrime, xPrime, yPrime = prepare_heatmap_data_for_nouns(top_5_verbs, all_verb_counts)
heatMap(zPrime, xPrime, yPrime, k=1, l=1, til="Top Verb for Users", xtil="Users", ytil="Top 5 Verb")



In [None]:
# =============================
# 6️⃣ Tokenize Texts for Each User
# =============================

df["tokens"] = df["text"].apply(lambda x: word_tokenize(x.lower()))
print("\nTokenized Texts:\n", df[["user", "tokens"]].head())

In [None]:
user_messages = df.groupby("user")["text"].apply(list).to_dict()

In [None]:
# =============================
# 7️⃣ Sentiment Analysis Per Turn Per User
# =============================
# user_sentiment = LOAD FROM THE PROFILE FILE
# df_plot = sentiment_stats.reset_index().melt(id_vars="user", var_name="Sentiment", value_name="Count")

# # Step 7: Plot Stacked Bar Chart
# fig = px.bar(df_plot,
#              x="user", y="Count", color="Sentiment",
#              title="Sentiment Analysis for Top 15 Users",
#              labels={"user": "Users", "Count": "Number of Messages"},
#              color_discrete_map={"positive": "green", "neutral": "gray", "negative": "red"},
#              barmode="stack")

# fig.show()