In [1]:
import networkx as nx
import os, sys
import regex
import random
from collections import Counter
import timeit

In [3]:
# Pre-compiled patterns used by normalize_text() to strip noise from tweet text.

# The literal "RT " (unanchored, so it also matches inside e.g. "START ").
rt_patt = regex.compile(r'RT ')
# "@" followed by one or more word characters.
mention_patt = regex.compile(r'@\w+')
# "#" followed by one or more word characters.
hashtag_patt = regex.compile(r'#\w+')
# http:// or https:// URLs. NOTE(review): inside the third character class,
# `$-_` is a *range* from "$" to "_" (so it also covers "/", ":", "?", "=",
# digits and upper-case letters), not the three literal characters.
url_patt = regex.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
# Single code points in U+2600-U+27BF, U+1F300-U+1F64F or U+1F680-U+1F6FF.
emoji_patt = regex.compile(u'([\U00002600-\U000027BF])|([\U0001f300-\U0001f64F])|([\U0001f680-\U0001f6FF])')
# A forward slash (normalize_text replaces it with a space).
slash_patt = regex.compile(r'\/')
# Runs of two or more U+0020 space characters.
multispace_patt = regex.compile('\u0020{2,}')
# One or more word characters immediately followed by the ellipsis character "…".
cutoff_patt = regex.compile(r'\w+…')
# Runs of two or more newlines (single newlines are not matched).
whitespace_patt = regex.compile(r'\n{2,}')
# Runs of one or more digits.
digit_patt = regex.compile(r'\d+')
# Despite the name, this matches a single *punctuation* character from the
# listed set, not whitespace. NOTE(review): the regex.I flag appears to have
# no effect, since the class contains no letters.
ws_patt = regex.compile(r'[\.\,\:\;\-\?\!\"\'\(\)…\*]',regex.I)

In [4]:
def normalize_text(line):
    """Return ``line`` with tweet noise removed and the remainder lower-cased.

    The module-level patterns are applied one after another, in a fixed
    order: URLs, "RT " markers, slashes (replaced by a space), @mentions,
    #hashtags, emoji, runs of spaces (collapsed to one space), cut-off words
    ending in "…", digits and punctuation. The result is lower-cased and,
    last of all, runs of two or more newlines are replaced by a single space.

    Args:
        line: The text to clean (may span several lines).

    Returns:
        The cleaned, lower-cased text.
    """
    # (pattern, replacement) pairs; the order matters because each
    # substitution works on the output of the previous one.
    substitutions = (
        (url_patt, ''),
        (rt_patt, ''),
        (slash_patt, ' '),
        (mention_patt, ''),
        (hashtag_patt, ''),
        (emoji_patt, ''),
        (multispace_patt, ' '),
        (cutoff_patt, ''),
        (digit_patt, ''),
        (ws_patt, ''),
    )

    text = line
    for pattern, replacement in substitutions:
        text = pattern.sub(replacement, text)

    # lower-casing happens before the final newline collapse
    return whitespace_patt.sub(' ', text.lower())

In [5]:
def create_edgelist_from_userhits(user_hits):
    """Pair up the words in ``user_hits`` and return the pairs as an edge list.

    Every word is combined with every other, different word. Self-references
    such as ("god", "god") are never produced, and a pair is skipped when its
    reverse has already been emitted. For a list of unique words the result
    is therefore each unordered pair exactly once, in input order, e.g.

        >>> create_edgelist_from_userhits(['klar', 'fett', 'absolut'])
        [('klar', 'fett'), ('klar', 'absolut'), ('fett', 'absolut')]

    If a word occurs several times in ``user_hits``, its edges are emitted
    again for each occurrence (only *reversed* duplicates are filtered).

    Args:
        user_hits: Iterable of hashable items (adjectives/adverbs).

    Returns:
        A list of ``(source, target)`` tuples.
    """
    hits = list(user_hits)
    user_hits_edgelist = []
    # Mirror of user_hits_edgelist kept as a set, so the reversed-duplicate
    # check is O(1) instead of rebuilding a set on every inner iteration.
    seen_edges = set()

    for source in hits:
        for item in hits:
            if source == item:
                # we don't want self-references e.g. ("god", "god")
                continue
            if (item, source) in seen_edges:
                # the reversed pair is already in the list
                continue
            edge = (source, item)
            user_hits_edgelist.append(edge)
            seen_edges.add(edge)

    return user_hits_edgelist

In [6]:
def _load_adjective_patterns(adjectives_path):
    """Read the adjective/adverb list and compile one whole-word pattern per line.

    Returns a list of ``(word, compiled_pattern)`` tuples in file order.
    """
    patterns = []
    with open(adjectives_path) as adjs:
        for raw_line in adjs:
            adj = raw_line.rstrip()
            if not adj:
                # A blank line would compile to r"\b\b", which matches the
                # empty string at every word boundary and floods the hits.
                continue
            patterns.append((adj, regex.compile("\\b" + adj + "\\b")))
    return patterns


def _add_weighted_edges(G, edgelist, counter):
    """Add each edge of ``edgelist`` to ``G``, accumulating its "weight"."""
    for source, target in edgelist:
        # there can only be as many edges as the lesser count of either
        # source or target
        weight = min(counter[source], counter[target])

        if G.has_edge(source, target):
            # Test for the edge itself (not merely a path between the nodes)
            # and tolerate pre-existing edges that carry no weight yet.
            G[source][target]["weight"] = G[source][target].get("weight", 0) + weight
        else:
            # add_edge creates missing nodes on its own
            G.add_edge(source, target, weight=weight)


def create_edgelist(G, files, graph_name, run,
                    data_dir='/Volumes/mos_storage/data/totalUsertexts/',
                    adjectives_path="./interesting_adverbs-adjectives",
                    output_dir="/Users/mos/twitterdb/graphs/"):
    """Build a weighted co-occurrence graph of adjectives/adverbs and save it.

    For every file in ``files`` with more than 300 lines, the text is cleaned
    with normalize_text() and searched for each word listed in
    ``adjectives_path``. All distinct words found in the same file are
    connected pairwise; the weight added to an edge for one file is the
    smaller of the two words' hit counts in that file. After all files are
    processed, each node receives a "word_count" attribute holding its total
    number of hits, and the graph is written as GraphML.

    Args:
        G: networkx graph to add nodes and edges to (modified in place).
        files: File names, resolved relative to ``data_dir``.
        graph_name: Base name (without extension) of the GraphML output file.
        run: Run identifier; only used in the summary message.
        data_dir: Directory containing the input files.
        adjectives_path: Text file with one adjective/adverb per line. Each
            entry is used as a regular expression between word boundaries.
        output_dir: Directory the ``.graphml`` file is written to.

    Returns:
        None. Progress and a summary are printed; the graph is saved to
        ``output_dir``/``graph_name``.graphml.
    """
    plus_300_lines_files = 0

    word_count = Counter()
    tot_word_count = 0
    tot_start = timeit.default_timer()

    # Loaded and compiled once, on first use, instead of once per input file.
    adjective_patterns = None

    for file_no, fname in enumerate(files, start=1):
        start_time = timeit.default_timer()

        # Read the file once; the context manager guarantees it is closed.
        with open(os.path.join(data_dir, fname)) as f:
            lines = f.readlines()

        # only analyze large files
        if len(lines) > 300:
            plus_300_lines_files += 1
            clean_txt = normalize_text("".join(lines))

            if adjective_patterns is None:
                adjective_patterns = _load_adjective_patterns(adjectives_path)

            user_hits = []
            for adj, patt in adjective_patterns:
                match = patt.findall(clean_txt)
                user_hits.extend(match)
                word_count[adj] += len(match)

            if user_hits:
                tot_word_count += len(clean_txt.split())

                counter = Counter(user_hits)
                edgelist = create_edgelist_from_userhits(list(counter.keys()))
                _add_weighted_edges(G, edgelist, counter)

                elapsed = timeit.default_timer() - start_time
                print("After {} - added {} user_hits and {} edges for file {}".format(elapsed,len(user_hits),len(edgelist),fname), end="\r")
            else:
                elapsed = timeit.default_timer() - start_time
                print("After {} - {} didn't match any adjectives or adverbs".format(elapsed,fname),flush=True, end="\r")

        else:
            elapsed = timeit.default_timer() - start_time
            print("After {} - {} didn't contain > 300 clean lines".format(elapsed, fname),flush=True, end="\r")

        elapsed = timeit.default_timer() - start_time
        print("After {} - File no {} of {}".format(elapsed, file_no, len(files)),end="\r")

    # add_node on an existing node only updates its attributes; unlike
    # G.nodes_iter() / G.node[...] this works on networkx 1.x and 2.x+ alike.
    for node in list(G.nodes()):
        G.add_node(node, word_count=word_count[node])

    tot_elapsed = timeit.default_timer() - tot_start
    print("After {} run {} - Analyzed {} +300 line files containing {} total clean tokens".format(tot_elapsed,run, plus_300_lines_files, tot_word_count))

    graph_path = os.path.join(output_dir, graph_name + ".graphml")
    nx.write_graphml(G, graph_path)
    print("Saved graph file {}".format(graph_path))