In [1]:
# Jupyter magic https://ipython.readthedocs.io/en/stable/interactive/magics.html
# %config IPCompleter.greedy = True
# %doctest_mode 
# %pprint

In [9]:
# %pip install --user --upgrade nltk tldextract tqdm ipywidgets jupyterlab    
!jupyter labextension install @jupyter-widgets/jupyterlab-manager
# !jupyter nbextension enable --py widgetsnbextension

An error occured.
ValueError: Please install nodejs 5+ and npm before continuing installation. nodejs may be installed using conda or directly from the nodejs website.
See the log file for details:  /tmp/jupyterlab-debug-dzzcv09v.log


In [3]:
    
import csv
import os
import shutil
import doctest
from pathlib import Path, PurePath

import nltk
import tldextract
import collections
import time

from tqdm.auto import tqdm, trange

for i in trange(5):
    time.sleep(0.1)


print(tldextract.extract("bbc.co.uk"))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))


ExtractResult(subdomain='', domain='bbc', suffix='co.uk')


In [4]:
OUTPUT_DIR = Path("output/")
DATAPATH   = Path("ToAnalyse/")
GENRE_DICT = {
"Nation": ['nation', 'nationalist', 'country', 'people', 'Australia', 'Australian', 'patriots', 'patriotic', 'aussies', 'flag', 'anthem', 'immigration', 'multiculturalism', 'ANZAC', 'military', 'soldiers' ],
"Islam": ['muslim', 'Islam', 'Sharia', 'ISIS', 'mosque', 'Allah', 'Halal', 'Koran', 'infidel', 'akbar', 'burka', 'hijab', 'caliphate', 'jihad', 'jihadist', 'Islamisation', 'imem', 'mohammad', 'mecca', 'muzzie' ],
"Government and Politics": ['government', 'parliament', 'council', 'councillor', 'policy', 'politics', 'turnbull', 'shorten', 'Andrews', 'Hanson' ],
"Racial References": ['white', 'anglo', 'European', 'immigration', 'white genocide', 'replacement', 'great replacement', 'blacks', 'Africans', 'take over', 'invasion', 'breed', 'race', 'superior', '1488' ],
"Violence": ['crimes', 'criminal', 'thugs', 'bashing', 'gang', 'probation', 'sentencing', 'parole', 'police', 'Cronulla', 'robbery', 'police', 'jail', 'prison', 'punishment', 'courts', 'victims', 'violence', 'Christchurch', 'Tarrant', 'Brevik', 'El Paso', 'shoot', 'shooting', 'mass-shooting'],
"Sexuality and Gender": ['gender', 'transgender', 'sex', 'homosexual', 'sexual', 'paedophilia', 'gay', 'queer', 'lesbian', 'marriage', 'feminist', 'masculinity', 'sodomy']
}

In [5]:
try:
    nltk.data.find('tokenizers')
    print("Tokenizer found")
except:
    print("Tokenizer not found")
    nltk.download('all')

Tokenizer found


* Create folder in share “output”
* For all other folders in the share:
  * perform following analyses. ON each indvidual CSV AND ON all CSVs which exist in a subfolder of the current folder or within the current folder.
    * Name of our output (called priorname below): “output/all involved folder names without spaces+name of csv if for individual csv”
    * In priorname+”domain_summary.csv”
      * Extract all URLS from column labeled “Final Link” if exists, else “Link”
        * domain, Count frequency of domain. 
    * In priorname+”summary.csv”
      * Find earliest creation date
      * Find latest creation date
      * Sum Likes Comments Shares Love Wow Haha Sad Angry Thankful, Post Views, Total Views
      * Max Page Likes
    * In priorname+”activity_histogram.csv”
      * columns: created (truncated to day), count of posts on that day
    * Tokenise and lemmatise message+link text+description columns, 
      * Combine each tokenised message according to created date.
      * Remove stopwords (https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/stopwords.zip) according to default english stopword list in NLTK (https://pythonspot.com/nltk-stop-words/)
      * Create a lexical dispersion plot according to each of the following sets of words named priorname+“genre.png”
      * Create a frequency list and output as priorname+frequency.csv
        * In the frequency list, if a word appears in the genre list, label it in an appropriate column
        * Columns: 
          * Lemmatised Word
          * Count
          * [Genres] (”True” if word appears in that genre) (Binary index)
      * Generate a word cloud for the above tokens as per https://vprusso.github.io/blog/2018/natural-language-processing-python-3/ named priorname+”wordcloud.png”



In [6]:
class analysis():
    def __init__(self, original_file_name=None):
        self.shared_domains_frequency = collections.OrderedDict()
        self.nltk_tokens = []
        self.max_page_likes = 0
        self.post_count = 0
        self.min_post_age = None
        self.max_post_age = None
        self.group = None
        self.region = None
        self.file = None
        self.original_file_name = original_file_name
        


def csv_to_dict(file):
    """Imports a CSV into a format we can read
    
    >>> file = "ToAnalyse/Australian Movement/Anti-Islam/2019-02-03-17-10-09-GMT-Historical-Report-Australian-Liberty-Alliance-1970-01-01--2019-02-03.csv"
    >>> csv_file = csv_to_dict(file)
    >>> csv_file[0].keys()
    odict_keys(['Page Name', 'User Name', 'Page Id', 'Page Likes at Posting', 'Created', 'Type', 'Likes', 'Comments', 'Shares', 'Love', 'Wow', 'Haha', 'Sad', 'Angry', 'Thankful', 'Video Share Status', 'Post Views', 'Total Views', 'Total Views for all Crossposts', 'URL', 'Message', 'Link', 'Final Link', 'Link Text', 'Description', 'Sponsor Id', 'Sponsor Name', 'Score'])
    >>> len(csv_file)
    2610
    >>> csv_file[1]['Message'][:9]
    'Bye Bye 🐣'
    """
    
    output = []
    with open(file, "r", newline="", encoding='utf-8-sig') as csvfile:
        dictreader = csv.DictReader(csvfile)
        for row in dictreader:
            output.append(row)
    return output
    
def extract_domain_count(csvdict):
    """In priorname+”domain_summary.csv”
        Extract all URLS from column labeled “Final Link” if exists, else “Link”
        Cols: domain, Count frequency of domain.
    
    >>> file = "ToAnalyse/Australian Movement/Anti-Islam/2019-02-03-17-10-09-GMT-Historical-Report-Australian-Liberty-Alliance-1970-01-01--2019-02-03.csv"
    >>> csv_file = csv_to_dict(file)
    >>> domains = extract_domain_count(csv_file)

    >>> domains['theaustralian.com.au']
    118

    
    """
    
    links = []
    domain_count = collections.OrderedDict()
    for row in csvdict:

        link = row.get('Final Link', row.get('Link', None))
        if link:
            tld = tldextract.extract(link)
            links.append("{}.{}".format(tld.domain, tld.suffix))
    for link in links:
        domain_count[link] = links.count(link)
    #https://stackoverflow.com/a/613218/263449 
    return collections.OrderedDict(sorted(domain_count.items(), key=lambda kv: kv[1], reverse=True))

def add_ordered_dict(ordered_a, ordered_b, reverse=True):
    """Add two ordered dicts together
    Remember, ordered dict are mutable
    >>> a = collections.OrderedDict({'a': 5, 'b': 10})
    >>> b = collections.OrderedDict({'c': 5, 'b': 10})
    >>> c = collections.OrderedDict({'c': 5, 'a': 10})
    
    >>> add_ordered_dict(a,b)
    OrderedDict([('b', 20), ('a', 5), ('c', 5)])
    
    >>> add_ordered_dict(c,add_ordered_dict(a,b))
    OrderedDict([('b', 20), ('a', 15), ('c', 10)])
    
    >>> add_ordered_dict(c,add_ordered_dict(a,b), reverse=False)
    OrderedDict([('c', 10), ('a', 15), ('b', 20)])
    """
    output = collections.OrderedDict(ordered_a)
    for row in ordered_b:
        if row in ordered_a:
            output[row] += ordered_b[row]
        else:
            output[row] = ordered_b[row]
    
    return collections.OrderedDict(sorted(output.items(), key=lambda kv: kv[1], reverse=reverse))
    
    
def analyse_csv(file):
    """Run all analysis on a CSV
    >>> file = PurePath("ToAnalyse/Australian Movement/Anti-Islam/2019-02-03-17-10-09-GMT-Historical-Report-Australian-Liberty-Alliance-1970-01-01--2019-02-03.csv")
    >>> output = analyse_csv(file)
    >>> output.original_file_name
    PurePosixPath('ToAnalyse/Australian Movement/Anti-Islam/2019-02-03-17-10-09-GMT-Historical-Report-Australian-Liberty-Alliance-1970-01-01--2019-02-03.csv')
    
    >>> output.file
    '2019-02-03-17-10-09-GMT-Historical-Report-Australian-Liberty-Alliance-1970-01-01--2019-02-03.csv'
    >>> output.group
    'Anti-Islam'
    
    >>> output.region
    'Australian Movement'
    
    >>> output.shared_domains_frequency['theaustralian.com.au']
    118
    """

    csv_file = csv_to_dict(file)
    analysed_file = analysis(file)
    
    analysed_file.file = file.parts[-1]
    analysed_file.group = file.parts[-2]
    analysed_file.region = file.parts[-3]
    
    domains = extract_domain_count(csv_file)
    analysed_file.shared_domains_frequency = domains
    
    return analysed_file
doctest.testmod()    


TestResults(failed=0, attempted=22)

In [7]:
# def resetOutput(output):
#     shutil.rmtree(output, ignore_errors=True)
#     os.makedirs(output)

try:
    os.makedirs(output)
except:
    print("Output dir exists, cleaning")
    for file in tqdm(OUTPUT_DIR.glob("**/*.csv")):
        print(file)
        #os.remove(file)

resetOutput(OUTPUT_DIR)

Output dir exists, cleaning


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




NameError: name 'resetOutput' is not defined

In [None]:
#doctest.testmod()

In [None]:
individual_csv_data = []
regions = {}

for file in tqdm_notebook(DATAPATH.glob("**/*.csv")):
    purepath = PurePath(file)
    
    region = purepath.parts[-3]
    group = purepath.parts[-2]
    filename = purepath.parts[-1]
    if region not in regions:
        regions.update({region:[]})
    if group not in regions[region]:
        regions[region].append(group)
    
    #print(file, region, group, filename)
    
    individual_csv_data.append(analyse_csv(file))
    
print(regions)

In [None]:
group_domains = {}
region_domains = {}

def write_domains(filename, domains):
    with open(OUTPUT_DIR / filename, "w", encoding="utf-8", newline='') as outcsv:
        csvwriter = csv.writer(outcsv)
        csvwriter.writerow(['domain', 'frequency_of_mention'])
        for row in domains:
            csvwriter.writerow([row, domains[row]])
            
for individual_analysis in individual_csv_data:
    filename = 'domains-{}-{}-{}'.format(individual_analysis.region, individual_analysis.group, individual_analysis.file)
    write_domains(filename, individual_analysis.shared_domains_frequency)
    
    
for region in regions:
    print(region)
    regiondomains = collections.OrderedDict()
    for group in regions[region]:
        print("\t", group)
        groupdomains = collections.OrderedDict()
        for individual_analysis in individual_csv_data:
            if individual_analysis.group == group:
                groupdomains = add_ordered_dict(groupdomains, individual_analysis.shared_domains_frequency)
            if individual_analysis.region == region:
                regiondomains = add_ordered_dict(regiondomains, individual_analysis.shared_domains_frequency)
        filename = 'domains-{}-{}.csv'.format(region, group)
        write_domains(filename, groupdomains)
    filename = 'domains-{}.csv'.format(region)
    write_domains(filename, regiondomains)