In [6]:
import praw
import prawcore
import pandas as pd
import numpy as np 
import os
import re
import math
from collections import Counter
from itertools import chain, combinations
from more_itertools import pairwise
from tqdm import tqdm
from datetime import datetime, date
import en_core_web_sm


from spacy import load
import warnings
warnings.filterwarnings("ignore", message=r"\[W008\]", category=UserWarning)


In [3]:
from transformers import pipeline
sentiment_toxicity = pipeline("sentiment-analysis", model="unitary/toxic-bert")

tokenizer_kwargs = {'padding':True,'truncation':True,'max_length':512}



In [7]:
path = 'data'
user_dir = 'user_data'
reddit = praw.Reddit(
    client_id='4bDjCY6y8ncrc3kLrBbpBg',
    client_secret='zPHZKfk9S666S9IxR5HtvkZ83ufNxw',
    user_agent='webcrawler created for IS596'
)
nlp = en_core_web_sm.load()

In [8]:
def normalize(text):
    norm_text = []

    for token in text:
        if not token.is_punct and not token.is_stop and not token.is_space:
            norm_text.append(token.lemma_.lower())

    norm_text = ' '.join(norm_text)
    norm_text = re.sub(r'(?:^| )\w(?:$| )', ' ', norm_text).strip()  # removes single characters
    norm_text = re.sub(r'[^a-zA-Z0-9 ]', '', norm_text)
    
    return norm_text

In [9]:
def get_user_status(path):
    try:
        user.is_suspended
        user_status = 'suspended'
    except AttributeError:
        user_status = 'active'
    except:
        user_status = 'deleted'
    return user_status

In [10]:
def ratio(a,b):
    a = float(a)
    b = float(b)
    if b == 0:
        return a
    else:
        return ratio(b, a % b)

In [11]:
def calculate_cosine_sim(sent_1, sent_2):
    l1 = []; l2 = []
    
    sent_1 = set(sent_1.split(' '))
    sent_2 = set(sent_2.split(' '))
    
    rvector = sent_1.union(sent_2)
    
    for word in rvector:
        if word in sent_1: l1.append(1)
        else: l1.append(0)
        if word in sent_2: l2.append(1)
        else: l2.append(0)
    
    count = 0
    
    for i in range(len(rvector)):
        count += l1[i] * l2[i]
    cosine = count/float((sum(l1) * sum(l2))**.5)
    
    return cosine

In [12]:
def get_cosine(vec1, vec2):
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum([vec1[x] * vec2[x] for x in intersection])

    sum1 = sum([vec1[x] ** 2 for x in list(vec1.keys())])
    sum2 = sum([vec2[x] ** 2 for x in list(vec2.keys())])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)

    if not denominator:
        return 0.0
    else:
        return float(numerator) / denominator


def text_to_vector(text):
    WORD = re.compile(r"\w+")
    words = WORD.findall(text)
    return Counter(words)

In [13]:
sent_1 = "the fox jumped over the log"
sent_2 = "the dog sat on the log"

print('original sentences:')
print(sent_1)
print(sent_2)
print()

sent_1 = normalize(nlp(sent_1))
sent_2 = normalize(nlp(sent_2))

vec_1 = text_to_vector(sent_1)
vec_2 = text_to_vector(sent_2)
print('normalized sentences:')
print (sent_1)
print (sent_2)
print()

print(f'cosine_function: {calculate_cosine_sim(sent_1, sent_2)}')
# print(f'cosine_function_2: {get_cosine(vec_1, vec_2)}')

sent_1 = nlp(sent_1)
sent_2 = nlp(sent_2)
print(f'spacy similarity function: {sent_1.similarity(sent_2)}')

original sentences:
the fox jumped over the log
the dog sat on the log

normalized sentences:
fox jump log
dog sit log

cosine_function: 0.3333333333333333
spacy similarity function: 0.7902248506898802


  print(f'spacy similarity function: {sent_1.similarity(sent_2)}')


## Main Script

In [14]:
for filename in os.scandir('data'):
    if filename.is_file():
        
        file_df = pd.read_csv(filename)
        user_names = file_df.author
        users_df = pd.DataFrame(columns=['user_name', 'comment_simularity'])
        rows = []
        
        for user_name in tqdm(user_names[:1000]):
            try:
                user_dict = {}
                user_dict['username'] = user_name
                user = reddit.redditor(user_name)

                if get_user_status(user) == 'active' and str(user_name) not in 'nan': # checks that user isn't suspended/deleted
                    if not user.is_mod: #ignore mods

                        comment_array = []
                        timestamps = []
                        reply_timestamps = []

                        try:
                            for this_comment in user.comments.new(limit=10):

                                parent_comment_id = this_comment.parent_id
                                if parent_comment_id.startswith('t3'):
                                    parent_comment_id = parent_comment_id[3:]
                                    parent = reddit.submission(parent_comment_id)
                                else:
                                    parent = reddit.comment(parent_comment_id)

                                parent_timestamp = datetime.fromtimestamp(parent.created_utc)
                                comment_timestamp = datetime.fromtimestamp(this_comment.created_utc)

                                comment_array.append(this_comment.body)
                                timestamps.append(comment_timestamp)
                                reply_timestamps.append((parent_timestamp, comment_timestamp))

                        except Exception as e:
                            print(e)
                            continue

                        for i in range(len(comment_array)):
                            comment_array[i] = normalize(nlp((comment_array[i]))) # normalizes comment but leaves as string
                            comment_array[i] = nlp(normalize(nlp((comment_array[i])))) # this normalizes comment and wraps in nlp

                        comment_similarities = []
                        time_intervals = []
                        response_intervals = []

                        for sent_1, sent_2 in combinations(comment_array, 2):

    #                         comment_similarities.append(calculate_cosine_sim(sent_1, sent_2)) checks cosine simularity of each comment against the next 
                            comment_similarities.append(sent_1.similarity(sent_2)) # nlp similiarity

                        successive_times = list(pairwise(timestamps))
                        for pair in successive_times: # calculates the intervals between user's comments
                            time_intervals.append(abs(pair[0] - pair[1]))

                        for pair in reply_timestamps:
                            response_intervals.append(abs(pair[1] - pair[0]))# calculates how quickly a comment replied to its parent

                        try:
                            time_data = pd.Series(time_intervals)
                            avg_time_diff = (time_data.sum()/len(time_data)).round('1s')
                        except Exception as e:
                            print(e)
                            continue

                        try:
                            response_data = pd.Series(response_intervals)
                            avg_reply_speed = (response_data.sum()/len(response_data)).round('1s')
                        except Exception as e:
                            print(e)
                            continue

                        try:
                            avg_comment_similarity = sum(comment_similarities)/len(comment_similarities)
                        except ZeroDivisionError:
                            continue
                        except Exception as e:
                            print(e)
                            continue
                        try:
                            user_dict['avg_comment_similarity'] = avg_comment_similarity # formating dict
                            user_dict['avg_comment_time_interval'] = avg_time_diff
                            user_dict['avg_reply_speed'] = avg_reply_speed
                            user_dict['avg_reply_speed'] = avg_reply_speed
                            user_dict['reply_speeds'] = response_intervals

                            rows.append(user_dict) 
                        except Exeception as e:
                            print(e)
                            continue
                            
            except Exception as e:
                print(e)
                continue
                    
        users_df = pd.DataFrame.from_dict(rows, orient='columns')
        print(users_df)
        
        filename = str(filename)
        
        if filename.startswith("<DirEntry 'comment"):
            users_df.to_csv(os.path.join(f'{path}/{user_dir}', f'raw_users_{filename[19:-2]}'), index=False)

        if filename.startswith("<DirEntry 'submission"):
            users_df.to_csv(os.path.join(f'{path}/{user_dir}', f'raw_users_{filename[22:-2]}'), header=False, index=False, mode='a')
                               

AttributeError: 'DataFrame' object has no attribute 'author'

## Troubleshoot

In [12]:
user = reddit.redditor('Excellenepperg')
comment_array = []
timestamps = []
reply_times = []

for this_comment in user.comments.new(limit=5):
                            
    comment_timestamp = datetime.fromtimestamp(this_comment.created_utc)
    print(f'comment id: {this_comment.id}')
    print(f'comment timestamp: {comment_timestamp}')
    parent_comment_id = this_comment.parent_id
    
    print(this_comment.body)
    if parent_comment_id.startswith('t3'):
        parent_comment_id = parent_comment_id[3:]
        parent = reddit.submission(parent_comment_id)
        print(f'parent_comment_id: {parent.id}')
    else:
        parent = reddit.comment(parent_comment_id)
        print(f'parent_comment_id: {parent.id}')
    try:
        parent_id = parent.id.lstrip('t3_')
        parent_timestamp = datetime.fromtimestamp(parent.created_utc)
        print(f'parent timestamp: {parent_timestamp}')
        print()
    except Exception as e:
        print(e)
        print()
        continue
    comment_array.append(this_comment.body)
    timestamps.append(comment_timestamp)
    reply_times.append((parent_timestamp, comment_timestamp))

print(f'reply times:')
for reply in reply_times:
    print(reply)
response_intervals = []
for pair in reply_times:
    response_intervals.append(abs(pair[1] - pair[0]))

print()
print('response intervals:')
for response in response_intervals:
    print(response)

comment id: izkdtip
comment timestamp: 2022-12-09 13:15:00
 It's so weird that they removed it. Here forehead is just empty now.
parent_comment_id: t1_izjn7a3
parent timestamp: 2022-12-09 10:27:04

comment id: izjvq2s
comment timestamp: 2022-12-09 11:21:38
Haha! Is it a foster situation or are they all hers?
parent_comment_id: t1_izie9y8
parent timestamp: 2022-12-09 03:33:18

comment id: izjvib3
comment timestamp: 2022-12-09 11:20:16
 They eventually shed the velvet.
parent_comment_id: t1_izj2w98
parent timestamp: 2022-12-09 08:05:36

comment id: izjvb46
comment timestamp: 2022-12-09 11:19:02
 We love you guys!
parent_comment_id: t1_izjcgsc
parent timestamp: 2022-12-09 09:15:07

comment id: izjv0bg
comment timestamp: 2022-12-09 11:17:08
. Although, surprisingly, it stuck last night. Caused havoc on the traffic this morning; we're rain folk.
parent_comment_id: t1_izj9l8f
parent timestamp: 2022-12-09 08:54:50

reply times:
(datetime.datetime(2022, 12, 9, 10, 27, 4), datetime.datetime(202

In [None]:
')

In [7]:
rows = []

for submission in botdefense.stream.submissions():
    try:
        user_name = submission.title[13:]

        user_dict = {}
        user_dict['username'] = user_name
        user = reddit.redditor(user_name)

        comment_karma = user.comment_karma
        submission_karma = user.link_karma

        karma_ratio = round(comment_karma/submission_karma, 4)

        comment_array = []
        timestamps = []
        reply_timestamps = []
        comment_toxicities = []


        for this_comment in user.comments.new(limit=15):

            parent_comment_id = this_comment.parent_id
            if parent_comment_id.startswith('t3'):
                parent_comment_id = parent_comment_id[3:]
                parent = reddit.submission(parent_comment_id)
            else:
                parent = reddit.comment(parent_comment_id)

            parent_timestamp = datetime.fromtimestamp(parent.created_utc)
            comment_timestamp = datetime.fromtimestamp(this_comment.created_utc)

            comment_array.append(this_comment.body)
            comment_toxicities.append(sentiment_toxicity(this_comment.body, **tokenizer_kwargs)[0]['score'])

            timestamps.append(comment_timestamp)
            reply_timestamps.append((parent_timestamp, comment_timestamp))

        for i in range(len(comment_array)):
            comment_array[i] = nlp(normalize(nlp((comment_array[i]))))

        comment_similarities = []
        time_intervals = []
        response_intervals = []

        for sent_1, sent_2 in combinations(comment_array, 2):
            comment_similarities.append(sent_1.similarity(sent_2))

        successive_times = list(pairwise(timestamps))
        for pair in successive_times: # calculates the intervals between user's comments
            time_intervals.append(abs(pair[0] - pair[1]))
        for pair in reply_timestamps:
            response_intervals.append(abs(pair[1] - pair[0]))

        time_data = pd.Series(time_intervals)
        avg_time_diff = (time_data.sum()/len(time_data)).round('1s')

        response_data = pd.Series(response_intervals)
        avg_reply_speed = (response_data.sum()/len(response_data)).round('1s')

        avg_comment_similarity = sum(comment_similarities)/len(comment_similarities)
        avg_toxicity = sum(comment_toxicities)/len(comment_toxicities)

        user_dict['comment_karma'] = comment_karma
        user_dict['submission_karma'] = submission_karma
        user_dict['karma_ratio'] = karma_ratio
        user_dict['avg_comment_similarity'] = avg_comment_similarity # formating dict
        user_dict['avg_toxicity'] = avg_toxicity
        user_dict['avg_reply_speed'] = avg_reply_speed
        user_dict['avg_comment_time_interval'] = avg_time_diff

        print(user_dict)
        rows.append(user_dict) 

    except Exception as e:
        print(e)

users_df = pd.DataFrame.from_dict(rows, orient='columns')
users_df.to_csv(f'{path}/{user_dir}/suspected_bots_2.csv')

TypeError: praw.models.listing.mixins.base.BaseListingMixin.new() got multiple values for keyword argument 'limit'