In [1]:
import praw
import prawcore
import pandas as pd
import numpy as np 
import glob
import os
import re
import math
import sklearn

from collections import Counter
from itertools import chain, combinations, pairwise
from tqdm import tqdm
from datetime import datetime, date

from spacy import load
import warnings
warnings.filterwarnings("ignore", message=r"\[W008\]", category=UserWarning)



In [2]:
from transformers import pipeline
sentiment_analysis = pipeline("sentiment-analysis")
sentiment_toxicity = pipeline("sentiment-analysis", model="unitary/toxic-bert")

tokenizer_kwargs = {'padding':True,'truncation':True,'max_length':512}

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)


In [3]:
path = 'data'
user_dir = 'user_data/user_networks'
reddit = praw.Reddit(
    client_id='4bDjCY6y8ncrc3kLrBbpBg',
    client_secret='zPHZKfk9S666S9IxR5HtvkZ83ufNxw',
    user_agent='webcrawler created for IS596'
)
nlp = load('en_core_web_md')

In [4]:
def normalize(text):
    norm_text = []

    for token in text:
        if not token.is_punct and not token.is_stop and not token.is_space:
            norm_text.append(token.lemma_.lower())

    norm_text = ' '.join(norm_text)
    norm_text = re.sub(r'(?:^| )\w(?:$| )', ' ', norm_text).strip()  # removes single characters
    norm_text = re.sub(r'[^a-zA-Z0-9 ]', '', norm_text)
    
    return norm_text

In [5]:
def get_user_status(path):
    try:
        user.is_suspended
        user_status = 'suspended'
    except AttributeError:
        user_status = 'active'
    except:
        user_status = 'deleted'
    return user_status

In [6]:
infile_name = f'{path}/network_list.csv'
infile = open(infile_name, 'r')
file_df = pd.read_csv(infile)
print(file_df)

                                                   0
0  ['Lord-Sprinkles', 'Final_Exit92', 'anonymous_...
1  ['Lord-Sprinkles', 'eaglesnation11', 'ninernet...
2  ['BinyaminDelta', 'frugal_masturbater', 'BackO...
3  ['BinyaminDelta', 'AutoModerator', 'XxSpruce_M...
4  ['bichonista', 'dunnkw', 'kingmoobot', 'Bloopi...
5  ['bichonista', '3.32299E+13', 'PeopleCanBeThis...
6  ['ummmbacon', 'nosecohn', 'canekicker', 'chx_'...
7  ['ummmbacon', 'nosecohn', 'bpetersonlaw', 'ext...


In [7]:
network_list = []

for i in range(len(file_df)):
    for j in range(len(file_df.iloc[[i]])):
        clean_user_names = []
        user_names = file_df.iloc[i, j]
        user_names = user_names.split(',')
        for user_name in user_names:
            user_name = user_name.strip("][' ")
            clean_user_names.append(user_name)
            
        network_list.append(clean_user_names)
        
network_list

[['Lord-Sprinkles',
  'Final_Exit92',
  'anonymous_lighting',
  'Kitsunezaki',
  'bleedingjim',
  'Jos_Meid',
  'Calm-Hovercraft9858',
  'bpatches701',
  'joculator',
  'No_Key4143',
  'BillyQz',
  'f1sh98',
  'kingbankai',
  'FireNETwork',
  'FiendishPole',
  'Jizzlobber42',
  'KnowledgeAndFaith',
  'FelderMan25',
  'HKatzOnline',
  'Bejeweled_Bird',
  'DLDabber',
  'BanzaKongo',
  'iwantnews1',
  'freeneedle',
  'Lola2050',
  'JoeBroski09',
  'KeyStep8',
  'C4RP3_N0CT3M',
  'Torque_Bow',
  'Steak_N_Cocunuts',
  'Ralath0n',
  'IncelWomenRespecter',
  'JustaCodfish',
  'BathWifeBoo',
  'TruthfulTrolling',
  '12161986',
  'AFishNamedFreddie',
  'humanglove',
  'DeliciousPussyNectar',
  'skyrne_isk',
  'MrColepuck',
  'ancilla1998',
  'TankerD18',
  'Cinnadillo',
  'SmoothSecond',
  'Icylibrium',
  'Spartanlegion117',
  'flopisit',
  'TheSecond48',
  'ImAMaaanlet',
  'Shirley-Eugest',
  'sleeknub',
  'officialwipe',
  'NA_DeltaWarDog',
  'ThirdRook',
  'Hydrocoded',
  'Morganbanefort'],


In [14]:
count = 0
for this_list in network_list:
    users_df = pd.DataFrame()
    rows = []
    for user_name in tqdm(this_list[:500]):
        user_dict = {}
        user_dict['username'] = user_name
        user = reddit.redditor(user_name)
        try:
            if get_user_status(user) == 'active' and str(user_name) not in 'nan': # checks that user isn't suspended/deleted
                if not user.is_mod: #ignore mods

                    comment_array = []
                    timestamps = []
                    reply_timestamps = []
                    comment_toxicities = []

                    try:
                        for this_comment in user.comments.new(limit=15):

                            parent_comment_id = this_comment.parent_id
                            if parent_comment_id.startswith('t3'):
                                parent_comment_id = parent_comment_id[3:]
                                parent = reddit.submission(parent_comment_id)
                            else:
                                parent = reddit.comment(parent_comment_id)

                            parent_timestamp = datetime.fromtimestamp(parent.created_utc)
                            comment_timestamp = datetime.fromtimestamp(this_comment.created_utc)

                            comment_array.append(this_comment.body)
                            comment_toxicities.append(sentiment_toxicity(this_comment.body, **tokenizer_kwargs)[0]['score'])
                            
                            timestamps.append(comment_timestamp)
                            reply_timestamps.append((parent_timestamp, comment_timestamp))

                    except Exception as e:
                        print(e)
                        continue

                    for i in range(len(comment_array)):
#                         comment_array[i] = normalize(nlp((comment_array[i]))) # normalizes comment but leaves as string
                        comment_array[i] = nlp(normalize(nlp((comment_array[i])))) # this normalizes comment and wraps in nlp

                    comment_similarities = []
                    time_intervals = []
                    response_intervals = []

                    for sent_1, sent_2 in combinations(comment_array, 2):

#                         comment_similarities.append(calculate_cosine_sim(sent_1, sent_2)) checks cosine simularity of each comment against the next 
                        comment_similarities.append(sent_1.similarity(sent_2)) # nlp similiarity

                    successive_times = list(pairwise(timestamps))
                    for pair in successive_times: # calculates the intervals between user's comments
                        time_intervals.append(abs(pair[0] - pair[1]))

                    for pair in reply_timestamps:
                        response_intervals.append(abs(pair[1] - pair[0]))# calculates how quickly a comment replied to its parent

                    try:
                        time_data = pd.Series(time_intervals)
                        avg_time_diff = (time_data.sum()/len(time_data)).round('1s')
                    except Exception as e:
                        print(e)
                        continue

                    try:
                        response_data = pd.Series(response_intervals)
                        avg_reply_speed = (response_data.sum()/len(response_data)).round('1s')
                    except Exception as e:
                        print(e)
                        continue

                    try:
                        avg_comment_similarity = sum(comment_similarities)/len(comment_similarities)
                        avg_toxicity = sum(comment_toxicities)/len(comment_toxicities)
                    except ZeroDivisionError:
                        continue
                    except Exception as e:
                        print(e)
                        continue
                    try:
                        user_dict['avg_comment_similarity'] = avg_comment_similarity # formating dict
                        user_dict['avg_toxicity'] = avg_toxicity
                        user_dict['avg_reply_speed'] = avg_reply_speed
                        user_dict['avg_comment_time_interval'] = avg_time_diff
                        user_dict['comment_intervals'] = time_intervals
                        

                        rows.append(user_dict) 
                    except Exeception as e:
                        print(e)
                        continue

        except Exception as e:
            print(e)
            continue

    users_df = pd.DataFrame.from_dict(rows, orient='columns')
    print(users_df)
    
    if count == 0:
        print(count)
        users_df.to_csv(f'{path}/{user_dir}/conservative_strong.csv')
        count += 1
    elif count == 1:
        print(count)
        users_df.to_csv(f'{path}/{user_dir}/conservative_weak.csv')
        count += 1
    elif count == 2:
        print(count)
        users_df.to_csv(f'{path}/{user_dir}/conspiracy_strong.csv')
        count += 1
    elif count == 3:
        print(count)
        users_df.to_csv(f'{path}/{user_dir}/conspiracy_weak.csv')
        count += 1
    elif count == 4:
        print(count)
        users_df.to_csv(f'{path}/{user_dir}/worldnews_strong.csv')
        count += 1
    elif count == 5:
        print(count)
        users_df.to_csv(f'{path}/{user_dir}/worldnews_weak.csv')
        count += 1
    elif count == 6:
        print(count)
        users_df.to_csv(f'{path}/{user_dir}/neutralpolitics_strong.csv')
        count += 1
    else:
        print(count)
        users_df.to_csv(f'{path}/{user_dir}/neutralpolitics_weak.csv')

100%|██████████████████████████████████████████████████████████████████████████████████| 57/57 [24:41<00:00, 26.00s/it]


                username  avg_comment_similarity  avg_toxicity  \
0         Lord-Sprinkles                0.543231      0.061667   
1     anonymous_lighting                0.515277      0.152353   
2            Kitsunezaki                0.359128      0.039366   
3            bleedingjim                0.493410      0.146145   
4               Jos_Meid                0.675106      0.110253   
5    Calm-Hovercraft9858                0.291579      0.086737   
6            bpatches701                0.601845      0.107644   
7             No_Key4143                0.535140      0.022430   
8                BillyQz                0.682721      0.112801   
9           FiendishPole                0.579946      0.056294   
10          Jizzlobber42                0.527861      0.088315   
11           HKatzOnline                0.716387      0.010860   
12              DLDabber                0.351247      0.153454   
13            BanzaKongo                0.546565      0.219309   
14        

 89%|█████████████████████████████████████████████████████████████████████▌        | 446/500 [3:21:12<16:52, 18.76s/it]

error with request HTTPSConnectionPool(host='oauth.reddit.com', port=443): Max retries exceeded with url: /comments/xxe5wl/?limit=2048&sort=confidence&raw_json=1 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x00000165A6F44310>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))


100%|██████████████████████████████████████████████████████████████████████████████| 500/500 [3:49:30<00:00, 27.54s/it]


                username  avg_comment_similarity  avg_toxicity  \
0         Lord-Sprinkles                0.543231      0.061667   
1       ninernetneepneep                0.480807      0.255682   
2     Imdatingstaceysmom                0.520881      0.101658   
3              dirtyALEK                0.581514      0.003297   
4       AlpacaWarMachine                0.471238      0.010914   
..                   ...                     ...           ...   
363      aproudfatherof3                0.283717      0.179715   
364          kraotic8321                0.610576      0.112414   
365           biomech120                0.674826      0.061274   
366    lordoftheeyes2020                0.498657      0.274536   
367  i_have_a_pet_turkey                0.635723      0.092755   

    avg_reply_speed avg_comment_time_interval  \
0   0 days 04:53:15           0 days 03:32:11   
1   0 days 06:18:10           0 days 03:26:36   
2   0 days 07:35:35           5 days 05:15:53   
3   0 days 

100%|████████████████████████████████████████████████████████████████████████████████| 122/122 [43:28<00:00, 21.38s/it]


                username  avg_comment_similarity  avg_toxicity  \
0          BinyaminDelta                0.583534      0.024003   
1         BackOnThrottle                0.675105      0.001300   
2                 arnott                0.542244      0.015380   
3    Aggressive_Egg_9661                0.459175      0.297070   
4   Ok-Entrepreneur-4466                0.458836      0.248836   
..                   ...                     ...           ...   
75         Shanguerrilla                0.721180      0.111688   
76             poopscarf                0.470599      0.001523   
77      itspronouncedDRL                0.594191      0.078408   
78         ARocketToMars                0.724479      0.048917   
79           nalydpsycho                0.531541      0.128150   

   avg_reply_speed avg_comment_time_interval  \
0  0 days 07:36:47           0 days 15:40:23   
1  0 days 07:30:03           7 days 08:27:33   
2  0 days 06:52:47           0 days 03:44:51   
3  0 days 08:31

100%|██████████████████████████████████████████████████████████████████████████████| 500/500 [3:39:55<00:00, 26.39s/it]


                username  avg_comment_similarity  avg_toxicity  \
0          BinyaminDelta                0.583534      0.024003   
1       XxSpruce_MoosexX                0.553241      0.094519   
2    Wrong-Paramedic7489                0.372758      0.228668   
3      renaissancetrader                0.482290      0.036122   
4     groupthinkhivemind                0.378224      0.110411   
..                   ...                     ...           ...   
366         Snookcatcher                0.528110      0.108619   
367        AShipChandler                0.710303      0.148853   
368           EZforme885                0.648270      0.044154   
369        Shlimeeeeeeee                0.720600      0.050850   
370        user_name1983                0.682840      0.059107   

    avg_reply_speed avg_comment_time_interval  \
0   0 days 07:36:47           0 days 15:40:23   
1   0 days 07:40:44           0 days 09:14:15   
2   0 days 11:06:22           0 days 12:00:42   
3   0 days 

100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [18:27<00:00, 36.92s/it]


                username  avg_comment_similarity  avg_toxicity  \
0             bichonista                0.652058      0.066850   
1                 dunnkw                0.646525      0.131262   
2             kingmoobot                0.516335      0.183417   
3           BloopityBlue                0.710963      0.153116   
4              Alemaya34                0.531396      0.008200   
5   Realistic-Demand-615                0.449127      0.193052   
6         pipsdontsqueak                0.566625      0.018835   
7              honkballs                0.613111      0.093612   
8          Standgeblasen                0.527708      0.158313   
9               backupJM                0.734480      0.000715   
10  ThrowThemUnderTheBus                0.313694      0.219269   
11           ChairOwn118                0.602454      0.067811   
12               baquish                0.339246      0.122184   
13    PreviouslyBannedXD                0.703062      0.048093   
14        

100%|██████████████████████████████████████████████████████████████████████████████| 500/500 [4:04:09<00:00, 29.30s/it]


                username  avg_comment_similarity  avg_toxicity  \
0             bichonista                0.652058      0.066850   
1    PeopleCanBeThisDumb                0.329302      0.067347   
2            lordatomosk                0.406619      0.191287   
3    ResponsibleHall9713                0.459217      0.281489   
4           TubularStars                0.230988      0.262676   
..                   ...                     ...           ...   
376       yeahiliketoast                0.552359      0.016535   
377               MSR209                0.471159      0.112537   
378              Go_getr                0.298545      0.069205   
379               Xipimp                0.612092      0.301041   
380          SpaceGazebo                0.614977      0.111339   

    avg_reply_speed avg_comment_time_interval  \
0   0 days 04:11:58           0 days 23:45:30   
1   6 days 02:28:11           1 days 21:58:26   
2   0 days 05:28:42           0 days 11:38:34   
3   0 days 

100%|██████████████████████████████████████████████████████████████████████████████| 326/326 [2:05:53<00:00, 23.17s/it]


                 username  avg_comment_similarity  avg_toxicity  \
0    uAHlOCyaPQMLorMgqrwL                0.520445      0.007448   
1          PsychLegalMind                0.851062      0.004064   
2           dishonestdick                0.529845      0.125218   
3           BCSWowbagger2                0.770747      0.033683   
4               tyboxer87                0.752842      0.067452   
..                    ...                     ...           ...   
210         roastedoolong                0.835629      0.359948   
211           Ottomatik80                0.698707      0.185573   
212       Savings_Mix6280                0.607552      0.018206   
213         thehildabeast                0.678181      0.141192   
214               meltbox                0.746530      0.001429   

    avg_reply_speed avg_comment_time_interval  \
0   1 days 11:07:47           0 days 14:21:58   
1   0 days 03:34:59           0 days 10:27:04   
2   0 days 04:02:38           0 days 01:59:58   

  time_data = pd.Series(time_intervals)
  avg_time_diff = (time_data.sum()/len(time_data)).round('1s')
 96%|██████████████████████████████████████████████████████████████████████████▉   | 480/500 [3:11:06<02:30,  7.51s/it]

'str' object cannot be interpreted as an integer


100%|██████████████████████████████████████████████████████████████████████████████| 500/500 [3:21:16<00:00, 24.15s/it]

               username  avg_comment_similarity  avg_toxicity avg_reply_speed  \
0         extantsextant                0.465533      0.001328 0 days 03:38:29   
1    TheLincolnMemorial                0.567084      0.005383 0 days 05:34:12   
2            magnabonzo                0.597560      0.004351 0 days 06:43:50   
3      Kooky-Wafer-6032                0.616092      0.080925 0 days 07:30:45   
4               xp19375                0.685226      0.000786 0 days 07:43:09   
..                  ...                     ...           ...             ...   
343               fjonk                0.636772      0.030766 0 days 01:30:19   
344         justins_dad                0.497913      0.011527 0 days 04:21:58   
345           Hats_back                0.699038      0.190526 0 days 03:42:00   
346         Zoomwafflez                0.615047      0.148695 0 days 03:34:39   
347           WillyPete                0.292563      0.003467 0 days 05:31:35   

    avg_comment_time_interv


