In [27]:
import praw
import prawcore
import pandas as pd
import numpy as np 
import os
import re
import math
from collections import Counter
from itertools import chain, combinations
from more_itertools import pairwise
from tqdm import tqdm
from datetime import datetime, date


from spacy import load
import warnings
warnings.filterwarnings("ignore", message=r"\[W008\]", category=UserWarning)


In [18]:
path = 'data'
user_dir = 'user_data'
reddit = praw.Reddit(
    client_id='4bDjCY6y8ncrc3kLrBbpBg',
    client_secret='zPHZKfk9S666S9IxR5HtvkZ83ufNxw',
    user_agent='webcrawler created for IS596'
)

nlp = load('en_core_web_lg')

In [4]:
def normalize(text):
    norm_text = []

    for token in text:
        if not token.is_punct and not token.is_stop and not token.is_space:
            norm_text.append(token.lemma_.lower())

    norm_text = ' '.join(norm_text)
    norm_text = re.sub(r'(?:^| )\w(?:$| )', ' ', norm_text).strip()  # removes single characters
    norm_text = re.sub(r'[^a-zA-Z0-9 ]', '', norm_text)
    
    return norm_text

In [5]:
def get_user_status(path):
    try:
        user.is_suspended
        user_status = 'suspended'
    except AttributeError:
        user_status = 'active'
    except:
        user_status = 'deleted'
    return user_status

In [6]:
def calculate_cosine_sim(sent_1, sent_2):
    l1 = []; l2 = []
    
    sent_1 = set(sent_1.split(' '))
    sent_2 = set(sent_2.split(' '))
    
    rvector = sent_1.union(sent_2)
    
    for word in rvector:
        if word in sent_1: l1.append(1)
        else: l1.append(0)
        if word in sent_2: l2.append(1)
        else: l2.append(0)
    
    count = 0
    
    for i in range(len(rvector)):
        count += l1[i] * l2[i]
    cosine = count/float((sum(l1) * sum(l2))**.5)
    
    return cosine

In [7]:
def get_cosine(vec1, vec2):
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum([vec1[x] * vec2[x] for x in intersection])

    sum1 = sum([vec1[x] ** 2 for x in list(vec1.keys())])
    sum2 = sum([vec2[x] ** 2 for x in list(vec2.keys())])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)

    if not denominator:
        return 0.0
    else:
        return float(numerator) / denominator


def text_to_vector(text):
    WORD = re.compile(r"\w+")
    words = WORD.findall(text)
    return Counter(words)

In [19]:
sent_1 = "the fox jumped over the log"
sent_2 = "the dog sat on the log"

print('original sentences:')
print(sent_1)
print(sent_2)
print()

sent_1 = normalize(nlp(sent_1))
sent_2 = normalize(nlp(sent_2))

vec_1 = text_to_vector(sent_1)
vec_2 = text_to_vector(sent_2)
print('normalized sentences:')
print (sent_1)
print (sent_2)
print()

print(f'cosine_function: {calculate_cosine_sim(sent_1, sent_2)}')
# print(f'cosine_function_2: {get_cosine(vec_1, vec_2)}')

sent_1 = nlp(sent_1)
sent_2 = nlp(sent_2)
print(f'spacy similarity function: {sent_1.similarity(sent_2)}')

original sentences:
the fox jumped over the log
the dog sat on the log

normalized sentences:
fox jump log
dog sit log

cosine_function: 0.3333333333333333
spacy similarity function: 0.6839218144112934


## Main Script

In [28]:
for filename in os.scandir('data'):
    if filename.is_file():
        
        file_df = pd.read_csv(filename)
        user_names = file_df.author
        users_df = pd.DataFrame(columns=['user_name', 'comment_simularity'])
        rows = []
        
        for user_name in tqdm(user_names[:1000]):
            user_dict = {}
            user_dict['username'] = user_name
            user = reddit.redditor(user_name)
            
            if get_user_status(user) == 'active' and str(user_name) not in 'nan': # checks that user isn't suspended/deleted
                if not user.is_mod: #ignore mods
                    
                    comment_array = []
                    timestamps = []
                    reply_timestamps = []
                    
                    try:
                        for this_comment in user.comments.new(limit=10):
                            
                            parent_comment_id = this_comment.parent_id
                            if parent_comment_id.startswith('t3'):
                                parent_comment_id = parent_comment_id[3:]
                                parent = reddit.submission(parent_comment_id)
                            else:
                                parent = reddit.comment(parent_comment_id)
                            
                            parent_timestamp = datetime.fromtimestamp(parent.created_utc)
                            comment_timestamp = datetime.fromtimestamp(this_comment.created_utc)
                            
                            comment_array.append(this_comment.body)
                            timestamps.append(comment_timestamp)
                            reply_timestamps.append((parent_timestamp, comment_timestamp))

                    except Exception as e:
                        print(e)
                        continue
                        
                    for i in range(len(comment_array)):
                        comment_array[i] = normalize(nlp((comment_array[i]))) # normalizes comment but leaves as string
                        comment_array[i] = nlp(normalize(nlp((comment_array[i])))) # this normalizes comment and wraps in nlp
                        
                    comment_similarities = []
                    time_intervals = []
                    response_intervals = []

                    for sent_1, sent_2 in combinations(comment_array, 2):
                        
#                         comment_similarities.append(calculate_cosine_sim(sent_1, sent_2)) checks cosine simularity of each comment against the next 
                        comment_similarities.append(sent_1.similarity(sent_2)) # nlp similiarity
                    
                    successive_times = list(pairwise(timestamps))
                    for pair in successive_times: # calculates the intervals between user's comments
                        time_intervals.append(abs(pair[0] - pair[1]))
                        
                    for pair in reply_timestamps:
                        response_intervals.append(abs(pair[1] - pair[0]))# calculates how quickly a comment replied to its parent
                        
                    time_data = pd.Series(time_intervals)
                    avg_time_diff = (time_data.sum()/len(time_data)).round('1s')
                    
                    response_data = pd.Series(response_intervals)
                    avg_reply_speed = (response_data.sum()/len(response_data)).round('1s')
                    
                    try:
                        avg_comment_similarity = sum(comment_similarities)/len(comment_similarities)
                    except ZeroDivisionError:
                        continue
                    except Exception as e:
                        print(e)
                        continue
                    
                    user_dict['avg_comment_similarity'] = avg_comment_similarity # formating dict
                    user_dict['avg_comment_time_interval'] = avg_time_diff
                    user_dict['avg_reply_speed'] = avg_reply_speed

                    rows.append(user_dict) 

        users_df = pd.DataFrame.from_dict(rows, orient='columns')
        print(users_df)
        
        filename = str(filename)
        
        if filename.startswith("<DirEntry 'comment"):
            users_df.to_csv(os.path.join(f'{path}/{user_dir}', f'users_{filename[19:-2]}'), index=False)

        if filename.startswith("<DirEntry 'submission"):
            users_df.to_csv(os.path.join(f'{path}/{user_dir}', f'users_{filename[22:-2]}'), header=False, index=False, mode='a')
                               

100%|██████████| 1000/1000 [4:37:15<00:00, 16.64s/it]  


                 username  avg_comment_similarity avg_comment_time_interval  \
0        ninernetneepneep                0.485185           0 days 02:25:30   
1      Imdatingstaceysmom                0.478584           6 days 21:24:15   
2               dirtyALEK                0.462610           1 days 10:16:53   
3        AlpacaWarMachine                0.332241           5 days 13:34:20   
4            AegonTheBest                0.267938          13 days 14:42:18   
..                    ...                     ...                       ...   
649         RoundSimbacca                0.706914           0 days 00:37:46   
650            raydiculus                0.402227           0 days 01:35:18   
651                geekxp                0.557835           1 days 23:03:29   
652  Entire_Proposal_1318                0.397180          12 days 10:02:47   
653          skinnykid108                0.250920           1 days 00:12:07   

    avg_reply_speed  
0   0 days 09:43:20  
1   0 d

100%|██████████| 1000/1000 [4:51:42<00:00, 17.50s/it]  


                username  avg_comment_similarity avg_comment_time_interval  \
0       XxSpruce_MoosexX                0.479386           0 days 05:18:37   
1    Wrong-Paramedic7489                0.284503           0 days 05:25:42   
2      renaissancetrader                0.319681           0 days 19:24:59   
3     groupthinkhivemind                0.475405           0 days 15:13:19   
4               Fatguy73                0.290340           1 days 03:21:10   
..                   ...                     ...                       ...   
697      ZachMorrisT1000                0.407356           0 days 08:14:18   
698           cRIPtoCITY                0.328947           0 days 22:56:49   
699           portoroc86                0.508008           0 days 02:45:31   
700            mrforrest                0.448366           2 days 19:00:07   
701             Colosphe                0.513201           0 days 10:31:55   

     avg_reply_speed  
0    0 days 02:13:56  
1    0 days 09:33

  time_data = pd.Series(time_intervals)
  avg_time_diff = (time_data.sum()/len(time_data)).round('1s')
 64%|██████▍   | 645/1000 [3:30:47<1:56:01, 19.61s/it]


TypeError: an integer is required (got type str)

## Troubleshoot

In [17]:
user = reddit.redditor('SignificantTrout')
comment_array = []
timestamps = []
reply_times = []

for this_comment in user.comments.new(limit=5):
                            
    comment_timestamp = datetime.fromtimestamp(this_comment.created_utc)
    print(f'comment id: {this_comment.id}')
    print(f'comment timestamp: {comment_timestamp}')
    parent_comment_id = this_comment.parent_id
    
    print(this_comment.body)
    if parent_comment_id.startswith('t3'):
        parent_comment_id = parent_comment_id[3:]
        parent = reddit.submission(parent_comment_id)
        print(f'parent_comment_id: {parent.id}')
    else:
        parent = reddit.comment(parent_comment_id)
        print(f'parent_comment_id: {parent.id}')
    try:
        parent_id = parent.id.lstrip('t3_')
        parent_timestamp = datetime.fromtimestamp(parent.created_utc)
        print(f'parent timestamp: {parent_timestamp}')
        print()
    except Exception as e:
        print(e)
        print()
        continue
    comment_array.append(this_comment.body)
    timestamps.append(comment_timestamp)
    reply_times.append((parent_timestamp, comment_timestamp))

print(f'reply times:')
for reply in reply_times:
    print(reply)
response_intervals = []
for pair in reply_times:
    response_intervals.append(abs(pair[1] - pair[0]))

print()
print('response intervals:')
for response in response_intervals:
    print(response)

comment id: iyheu0h
comment timestamp: 2022-12-01 06:37:35
Just FYI it is a mandatory recount as required by state law due to the margin. It is not expected to change the results.
parent_comment_id: z9mqfx
parent timestamp: 2022-12-01 06:34:01

comment id: iyg4kiw
comment timestamp: 2022-11-30 21:22:00
People just voting based on the Ror D beside the name
parent_comment_id: t1_iycv05a
parent timestamp: 2022-11-30 07:36:18

comment id: iyg4gb2
comment timestamp: 2022-11-30 21:21:04
Did they not do this during the normal election?
parent_comment_id: z8raxt
parent timestamp: 2022-11-30 07:19:03

comment id: iyg482w
comment timestamp: 2022-11-30 21:19:15
Didn't some Dems do also? No one is looking great on this.
parent_comment_id: z98nw4
parent timestamp: 2022-11-30 18:31:50

comment id: iyg430h
comment timestamp: 2022-11-30 21:18:06
So where are the studies for blue states and what are the death rate of unaffiliated voters?
This is a serious question. Shouldn't you have a control group ?


In [14]:
post = reddit.submission('z9mqfx')

print(post.created_utc)

1669898041.0


In [9]:
calculate_cosine_sim("the dog sat on the log", "the fox jumped over the log")

0.4