In [1]:
# Mount Google Drive at /content/drive (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import json
import spacy
import re
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import torch
import plotly.express as px
import numpy as np


from transformers import pipeline
from google.colab import drive
from collections import Counter, defaultdict
from urllib.parse import unquote
from collections import Counter
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [3]:
# Ask spaCy itself whether it can use a GPU rather than inferring it from
# torch: spacy.require_gpu() raises when spaCy's own GPU backend is unavailable,
# even if torch reports a CUDA device. spacy.prefer_gpu() activates the GPU when
# possible and returns False otherwise, so the CPU fallback is always reachable.
# The call is made before the pipeline is loaded.
if spacy.prefer_gpu():
    print("Using GPU for spaCy")
else:
    print("GPU not available, using CPU for spaCy")

nlp = spacy.load("en_core_web_sm")


Using GPU for spaCy


In [4]:
def read_jsonl_file(file_path):
  """Read a JSON Lines file and return its records as a list.

  Args:
    file_path: Path of the file to read; each non-blank line is expected to
      hold one JSON value.

  Returns:
    list: The decoded value of every line that parsed, in file order.
    Blank lines are skipped silently. Malformed lines are reported on stdout
    (with their line number) and skipped, so one bad record does not abort
    the load.
  """
  data = []
  # Explicit UTF-8 so decoding does not depend on the platform default.
  with open(file_path, 'r', encoding='utf-8') as f:
    for line_number, line in enumerate(f, start=1):
      if not line.strip():
        continue  # blank or trailing empty lines are not records
      try:
        data.append(json.loads(line))
      except json.JSONDecodeError as e:
        print(f"Error decoding JSON on line {line_number}: {e}")
  return data

# =============================
# 1. Load Dataset
# =============================
# Path of the Wikipedia JSONL file on the mounted Drive.
file_path = '/content/drive/MyDrive/Perspective Paper/Original_Data/wikipedia.jsonl'
# `data` holds whatever read_jsonl_file returns for that path.
data = read_jsonl_file(file_path)

In [5]:
# Preview the first few records only; echoing the whole dataset floods the
# notebook output and bloats the saved file.
data[:3]

[{'id': '524288',
  'root': '524288',
  'text': 'You should look at all of the point on the template not just the last one, the template also says the image belonging to the republic of macedonia is in the public domain if it being used for \\"information purposes\\". ',
  'user': 'Frightner',
  'meta': {'is-admin': False},
  'reply-to': None,
  'timestamp': '1.189190940E09'},
 {'id': '524289',
  'root': '524288',
  'text': 'Yes I agree. The law permits usage of documents, photographs and other materials for educational and informational purposes. There was a normative act issued by the government of the Republic of Macedonia that even allowed citizens to make photocopies or photograph rare archive materials. ',
  'user': 'Revizionist',
  'meta': {'is-admin': False},
  'reply-to': None,
  'timestamp': '1.189204860E09'},
 {'id': '1',
  'root': '1',
  'text': "Yes, that's good. Revathy's page looked very reliable, that's why we used that as a source. ",
  'user': 'Johannes003',
  'meta':

In [6]:
# Bucket the utterances by their 'root' id: each key maps to the list of
# utterance dicts sharing that root, kept in dataset order.
# NOTE(review): 'root' presumably identifies the conversation thread - confirm.
wikipedia = defaultdict(list)

for utterance in data:
    root_id = utterance['root']
    wikipedia[root_id].append(utterance)

In [7]:
# Preview the first few records only; echoing the whole dataset floods the
# notebook output and bloats the saved file.
data[:3]

[{'id': '524288',
  'root': '524288',
  'text': 'You should look at all of the point on the template not just the last one, the template also says the image belonging to the republic of macedonia is in the public domain if it being used for \\"information purposes\\". ',
  'user': 'Frightner',
  'meta': {'is-admin': False},
  'reply-to': None,
  'timestamp': '1.189190940E09'},
 {'id': '524289',
  'root': '524288',
  'text': 'Yes I agree. The law permits usage of documents, photographs and other materials for educational and informational purposes. There was a normative act issued by the government of the Republic of Macedonia that even allowed citizens to make photocopies or photograph rare archive materials. ',
  'user': 'Revizionist',
  'meta': {'is-admin': False},
  'reply-to': None,
  'timestamp': '1.189204860E09'},
 {'id': '1',
  'root': '1',
  'text': "Yes, that's good. Revathy's page looked very reliable, that's why we used that as a source. ",
  'user': 'Johannes003',
  'meta':

In [8]:
def clean_text(text):
    """Tokenize ``text`` and return its lowercase alphanumeric non-stopword tokens.

    Args:
        text: Raw input string.

    Returns:
        list[str]: Tokens of the lower-cased text, in order, keeping only those
        for which ``str.isalnum()`` is true and that are not in the English
        stopword list.
    """
    # Build the stopword set once per call: the original re-evaluated
    # stopwords.words("english") for every token and tested membership
    # against a list.
    english_stopwords = set(stopwords.words("english"))
    tokens = word_tokenize(text.lower())
    return [
        token
        for token in tokens
        if token.isalnum() and token not in english_stopwords
    ]

In [9]:
# =================
# Create Profile 1
# =================
TOP_N = 10  # how many of the most frequent lemmas to keep per user and POS

# Collect each user's utterances in a list and join them once with a space.
# The previous `+=` concatenation had no separator, so the last word of one
# utterance fused with the first word of the next (producing bogus tokens),
# and repeated string concatenation is quadratic in the amount of text.
utterances_by_user = defaultdict(list)
for utterance in data:
    utterances_by_user[utterance['user']].append(utterance['text'])

user_texts = defaultdict(str)
for user, utterance_texts in utterances_by_user.items():
    user_texts[user] = ' '.join(utterance_texts)

top_nouns = {}
top_verbs = {}
top_adjs  = {}

# NOTE(review): these three dicts are created here but never filled in this
# cell; kept so any later cell that references them still finds the names.
all_noun_counts = {}
all_verb_counts = {}
all_adj_counts  = {}

# Coarse POS tag -> dict receiving that user's top lemmas.
top_by_pos = {"NOUN": top_nouns, "VERB": top_verbs, "ADJ": top_adjs}

# nlp.pipe batches the texts instead of running the pipeline once per user;
# as_tuples=True carries the user name alongside each text. The parser and NER
# are disabled because only POS tags and lemmas are read below.
docs_with_users = nlp.pipe(
    ((text, user) for user, text in user_texts.items()),
    as_tuples=True,
    disable=["parser", "ner"],
)

for doc, user in docs_with_users:
    # One pass over the tokens, counting lemmas separately per POS.
    lemma_counts = {pos: Counter() for pos in top_by_pos}
    for token in doc:
        if token.pos_ in lemma_counts and token.is_alpha and not token.is_stop:
            lemma_counts[token.pos_][token.lemma_.lower()] += 1

    for pos, target in top_by_pos.items():
        target[user] = lemma_counts[pos].most_common(TOP_N)



In [10]:
# Spot-check the concatenated text stored for one user.
user_texts['Kyuubi29']

'My computer has been hacked by someone that i think is from 4 of these websites that i go to. could you please find out who it is and tell me.'

In [11]:
# Print every utterance whose 'user' field is the placeholder '{unknown-49}'.
target_user = '{unknown-49}'
for utt in data:
  if utt['user'] != target_user:
    continue
  print(utt)

{'id': '264210', 'root': '264210', 'text': 'Thanks in advance17:47, 23 July 2007 (UTC)', 'user': '{unknown-49}', 'meta': {'is-admin': False}, 'reply-to': None, 'timestamp': '-1'}


In [12]:
# ============
# Profile 1: the (up to) 10 most used nouns per user
# ============
# top_nouns maps user -> [(lemma, count), ...]; keep the lemmas, drop the counts.
profile1 = {
    user: [lemma for lemma, _count in noun_pairs]
    for user, noun_pairs in top_nouns.items()
}


In [13]:
# Show the first 10 users only; the full mapping has one entry per user and
# would flood the output.
dict(list(profile1.items())[:10])

{'Frightner': ['point',
  'template',
  'image',
  'domain',
  'propaganda',
  'upload',
  'mind',
  'source',
  'grave'],
 'Revizionist': ['photograph',
  'material',
  'photo',
  'law',
  'usage',
  'document',
  'purpose',
  'act',
  'government',
  'citizen'],
 'Johannes003': ['page',
  'site',
  'nominee',
  'idea',
  'section',
  'opening',
  'editor',
  'film',
  'source',
  'ceremony'],
 'Michael-Billa': ['fan', 'cinema', 'web', 'company', 'size', 'family'],
 'AnonEMouse': ['article',
  'page',
  'talk',
  'user',
  'source',
  'people',
  'thing',
  'image',
  'way',
  'edit'],
 'FayssalF': ['thank',
  'case',
  'time',
  'question',
  'dispute',
  'thing',
  'cheer',
  'community',
  'article',
  'group'],
 'Venerock': ['language',
  'thank',
  'contribute',
  'template',
  'horse',
  'article',
  'people',
  'edit',
  'day',
  'lady'],
 'Dana boomer': ['article',
  'page',
  'thank',
  'review',
  'talk',
  'comment',
  'editor',
  'image',
  'time',
  'source'],
 'Kudpung':

In [14]:
# Show the first 10 users only; the full mapping has one entry per user and
# would flood the output.
dict(list(top_verbs.items())[:10])

{'Frightner': [('look', 2),
  ('know', 2),
  ('say', 1),
  ('belong', 1),
  ('think', 1),
  ('stop', 1),
  ('spread', 1),
  ('greet', 1),
  ('add', 1),
  ('block', 1)],
 'Revizionist': [('agree', 2),
  ('permit', 1),
  ('issue', 1),
  ('allow', 1),
  ('photograph', 1),
  ('exclude', 1),
  ('spend', 1),
  ('prepare', 1),
  ('upload', 1),
  ('ask', 1)],
 'Johannes003': [('think', 5),
  ('know', 3),
  ('check', 3),
  ('add', 3),
  ('look', 2),
  ('remove', 2),
  ('leave', 2),
  ('find', 2),
  ('edit', 2),
  ('support', 2)],
 'Michael-Billa': [('call', 1), ('maintain', 1), ('base', 1)],
 'AnonEMouse': [('think', 128),
  ('write', 111),
  ('know', 88),
  ('look', 83),
  ('want', 74),
  ('ask', 57),
  ('find', 57),
  ('try', 57),
  ('say', 54),
  ('get', 50)],
 'FayssalF': [('need', 11),
  ('believe', 7),
  ('hope', 7),
  ('know', 6),
  ('think', 6),
  ('work', 6),
  ('find', 6),
  ('ask', 6),
  ('answer', 6),
  ('get', 6)],
 'Venerock': [('tell', 2),
  ('m', 2),
  ('think', 1),
  ('fix', 1)

In [15]:
# ============================
# Profile 2: the (up to) 10 most used verbs per user
# ============================

# top_verbs maps user -> [(lemma, count), ...]; keep the lemmas, drop the counts.
profile2 = {
    user: [lemma for lemma, _count in verb_pairs]
    for user, verb_pairs in top_verbs.items()
}

In [17]:
# Number of users with a verb profile. The recorded output of this cell is a
# single count, and echoing the whole dict would flood the output.
len(profile2)

38462

In [20]:
# ============================
# Profile 3: P1 + P2
# ============================
# For every user in profile1, the noun list followed by the verb list.
# A lemma present in both lists appears twice.
profile3 = {
    user: noun_lemmas + profile2[user]
    for user, noun_lemmas in profile1.items()
}


In [21]:
# Show the first 10 users only; the full mapping has one entry per user and
# would flood the output.
dict(list(profile3.items())[:10])

{'Frightner': ['point',
  'template',
  'image',
  'domain',
  'propaganda',
  'upload',
  'mind',
  'source',
  'grave',
  'look',
  'know',
  'say',
  'belong',
  'think',
  'stop',
  'spread',
  'greet',
  'add',
  'block'],
 'Revizionist': ['photograph',
  'material',
  'photo',
  'law',
  'usage',
  'document',
  'purpose',
  'act',
  'government',
  'citizen',
  'agree',
  'permit',
  'issue',
  'allow',
  'photograph',
  'exclude',
  'spend',
  'prepare',
  'upload',
  'ask'],
 'Johannes003': ['page',
  'site',
  'nominee',
  'idea',
  'section',
  'opening',
  'editor',
  'film',
  'source',
  'ceremony',
  'think',
  'know',
  'check',
  'add',
  'look',
  'remove',
  'leave',
  'find',
  'edit',
  'support'],
 'Michael-Billa': ['fan',
  'cinema',
  'web',
  'company',
  'size',
  'family',
  'call',
  'maintain',
  'base'],
 'AnonEMouse': ['article',
  'page',
  'talk',
  'user',
  'source',
  'people',
  'thing',
  'image',
  'way',
  'edit',
  'think',
  'write',
  'know',


In [22]:
def save_profiles_as_json(profiles, filename):
  """Serialize ``profiles`` as JSON (4-space indent) into ``filename``.

  Args:
    profiles: JSON-serializable object to store.
    filename: Destination path; an existing file is overwritten.
  """
  with open(filename, 'w') as out_file:
    out_file.write(json.dumps(profiles, indent=4))


# Write the three profile dictionaries to Drive. The target folder is spelled
# once so the three output paths cannot drift apart.
OUTPUT_DIR = '/content/drive/MyDrive/Perspective Paper/Original_Data'

for profile_name, profile in (
    ('profile1', profile1),
    ('profile2', profile2),
    ('profile3', profile3),
):
    save_profiles_as_json(profile, f'{OUTPUT_DIR}/{profile_name}.json')
