In [1]:
# We import all necessary libraries.
from pathlib import Path
import ujson
import glob
import networkx as nx
import matplotlib.pyplot as plt


GENERACIÓN DE LOS GRAFOS

In [2]:
# Helpers plus the main routine that turn a folder of tweet files into a
# weighted, directed interaction graph and print statistics about it.

def _tweet_kind(tweet):
    """Classify a parsed tweet by the interaction it carries.

    Precedence is retweet > reply > quote > mention > simple. Only the
    top-level fields of the tweet object are inspected, so nested objects or
    the tweet text can never influence the classification.
    """
    if tweet.get('retweeted_status'):
        return 'retweet'
    if tweet.get('in_reply_to_user_id'):
        return 'reply'
    if tweet.get('quoted_status'):
        return 'quote'
    if (tweet.get('entities') or {}).get('user_mentions'):
        return 'mention'
    return 'simple'


def _interaction_targets(tweet, kind):
    """Return the screen names that receive the interaction of a tweet.

    For retweets, replies and quotes the retweeted / replied / quoted user
    comes first, followed by every mentioned user with a different screen
    name. For pure mentions the list is simply the mentioned users.
    Duplicated mentions are kept on purpose: each one adds weight to the edge.

    Raises KeyError / TypeError when the tweet lacks the expected fields.
    """
    mentioned = [mention['screen_name'] for mention in tweet['entities']['user_mentions']]
    if kind == 'mention':
        return mentioned
    if kind == 'retweet':
        primary = tweet['retweeted_status']['user']['screen_name']
    elif kind == 'reply':
        primary = tweet['in_reply_to_screen_name']
    else:
        primary = tweet['quoted_status']['user']['screen_name']
    return [primary] + [name for name in mentioned if name != primary]


def create_graph(folder):
    """Build a directed, weighted interaction graph from the tweets in a folder.

    Every ``*.json`` file directly inside ``folder`` is read line by line; each
    line that looks like a tweet object is parsed once and classified as
    retweet, reply, pure quote, pure mention or simple tweet. For every tweet
    except the simple ones, an edge is added from the author to each user that
    receives the interaction; if the edge already exists its ``weight``
    attribute is increased by one.

    A summary (tweet counts per type, number of errors, graph size) is printed.
    A tweet whose interaction targets cannot be extracted is still counted
    under its type and additionally as an error; a line that cannot be parsed
    at all is counted only as an error.

    Parameters
    ----------
    folder : str or path-like
        Directory that contains the input ``*.json`` files.

    Returns
    -------
    networkx.DiGraph
        The interaction graph, with an integer ``weight`` on every edge.
    """
    # Counters for the type of tweet and errors.
    counts = {'retweet': 0, 'reply': 0, 'quote': 0, 'mention': 0, 'simple': 0}
    errors = 0
    # We create a directed graph.
    twitter_dg = nx.DiGraph()

    # Sorted so that node/edge insertion order is reproducible between runs.
    for file in sorted(Path(folder).glob('*.json')):
        # The context manager guarantees the file handle is closed.
        with open(file, 'r', encoding="utf8") as handle:
            for line in handle:
                # Skip empty lines.
                if not line.strip():
                    continue
                # Only lines that contain a tweet are processed.
                if not line.startswith(('{"created_at":', '{"contributors":')):
                    continue

                # Parse the line once; the origin node is the author of the tweet.
                try:
                    tweet = ujson.loads(line)
                    origin = tweet['user']['screen_name']
                except (ValueError, KeyError, TypeError):
                    errors += 1
                    continue

                kind = _tweet_kind(tweet)
                counts[kind] += 1
                if kind == 'simple':
                    continue

                try:
                    # The target list is built completely before touching the
                    # graph, so a malformed tweet never leaves partial edges.
                    targets = _interaction_targets(tweet, kind)
                    # Add edges between origin and targets, updating the
                    # weight when the edge already exists.
                    for target in targets:
                        if twitter_dg.has_edge(origin, target):
                            twitter_dg[origin][target]['weight'] += 1
                        else:
                            twitter_dg.add_edge(origin, target, weight=1)
                except (KeyError, TypeError, ValueError):
                    # ValueError covers networkx rejecting None as a node.
                    errors += 1

    total_tweets = sum(counts.values())

    def percentage(count):
        # Avoid ZeroDivisionError when the folder holds no tweets.
        return count / total_tweets * 100 if total_tweets else 0.0

    print("ESTADÍSTICAS DEL GRAFO OBTENIDO:\n")

    print('Número total de tweets procesados: {}\n'.format(total_tweets))

    print("{} tweets son del tipo 'retweet', un {:.2f}% del total.".format(counts['retweet'], percentage(counts['retweet'])))
    print("{} tweets son del tipo 'reply', un {:.2f}% del total.".format(counts['reply'], percentage(counts['reply'])))
    print("{} tweets son del tipo 'cita pura', un {:.2f}% del total.".format(counts['quote'], percentage(counts['quote'])))
    print("{} tweets son del tipo 'mención pura', un {:.2f}% del total.".format(counts['mention'], percentage(counts['mention'])))
    print("{} tweets son del tipo 'simple', un {:.2f}% del total.\n".format(counts['simple'], percentage(counts['simple'])))
    print("Durante el procesado de los tweets ha habido {} errores.\n".format(errors))

    print("El grafo resultante tiene un total de {} nodos y {} aristas\n".format(twitter_dg.number_of_nodes(), twitter_dg.number_of_edges()))

    return twitter_dg


In [3]:
# Input folders, one per analysed period: the days before 8M (6_7_M),
# 8M itself, and the days after 8M (9_10_M).
folder_pre_8M, folder_8M, folder_post_8M = (
    r'C:\UOC\TFM\Procesado_tweets\03_Gen_grafos\Input\6_7_M',
    r'C:\UOC\TFM\Procesado_tweets\03_Gen_grafos\Input\8M',
    r'C:\UOC\TFM\Procesado_tweets\03_Gen_grafos\Input\9_10_M',
)

In [5]:
# Build the interaction graph for the period before 8M (input folder 6_7_M).
# create_graph also prints the per-type tweet statistics for this folder.
graph_pre_8M = create_graph(folder_pre_8M)

ESTADÍSTICAS DEL GRAFO OBTENIDO:

Número total de tweets procesados: 167738

139240 tweets son del tipo 'retweet', un 83.01% del total.
3524 tweets son del tipo 'reply', un 2.10% del total.
3244 tweets son del tipo 'cita pura', un 1.93% del total.
4493 tweets son del tipo 'mención pura', un 2.68% del total.
17237 tweets son del tipo 'simple', un 10.28% del total.

Durante el procesado de los tweets ha habido 0 errores.

El grafo resultante tiene un total de 102716 nodos y 174973 aristas



In [4]:
# Build the interaction graph for 8M itself (input folder 8M).
# create_graph also prints the per-type tweet statistics for this folder.
graph_8M = create_graph(folder_8M)

ESTADÍSTICAS DEL GRAFO OBTENIDO:

Número total de tweets procesados: 951821

783855 tweets son del tipo 'retweet', un 82.35% del total.
15784 tweets son del tipo 'reply', un 1.66% del total.
28409 tweets son del tipo 'cita pura', un 2.98% del total.
11163 tweets son del tipo 'mención pura', un 1.17% del total.
112610 tweets son del tipo 'simple', un 11.83% del total.

Durante el procesado de los tweets ha habido 1 errores.

El grafo resultante tiene un total de 472376 nodos y 880375 aristas



In [6]:
# Build the interaction graph for the period after 8M (input folder 9_10_M).
# create_graph also prints the per-type tweet statistics for this folder.
graph_post_8M = create_graph(folder_post_8M)

ESTADÍSTICAS DEL GRAFO OBTENIDO:

Número total de tweets procesados: 482099

438090 tweets son del tipo 'retweet', un 90.87% del total.
5147 tweets son del tipo 'reply', un 1.07% del total.
3977 tweets son del tipo 'cita pura', un 0.82% del total.
3570 tweets son del tipo 'mención pura', un 0.74% del total.
31315 tweets son del tipo 'simple', un 6.50% del total.

Durante el procesado de los tweets ha habido 0 errores.

El grafo resultante tiene un total de 292409 nodos y 475588 aristas



LIMPIEZA DE LOS GRAFOS

In [7]:
### CLEANING OF THE 6M AND 7M GRAPH
# Self-loops (edges from a node to itself) are removed in place from graph_pre_8M.
print("LIMPIEZA DEL GRAFO ANTERIOR AL 8M:\n")
# Report how many self-loops the graph has before cleaning.
loops_before = nx.number_of_selfloops(graph_pre_8M)
print("El grafo tiene {} loops.".format(loops_before))
# Collect the self-loop edges into a list first, then drop them all.
self_loops = list(nx.selfloop_edges(graph_pre_8M))
graph_pre_8M.remove_edges_from(self_loops)
# Confirm that every self-loop has been removed successfully.
loops_after = nx.number_of_selfloops(graph_pre_8M)
print("Después de la limpieza el grafo tiene {} loops.\n".format(loops_after))
# Report the size of the cleaned graph.
node_total = graph_pre_8M.number_of_nodes()
edge_total = graph_pre_8M.number_of_edges()
print("El grafo resultante tiene un total de {} nodos y {} aristas\n".format(node_total, edge_total))


LIMPIEZA DEL GRAFO ANTERIOR AL 8M:

El grafo tiene 3171 loops.
Después de la limpieza el grafo tiene 0 loops.

El grafo resultante tiene un total de 102716 nodos y 171802 aristas



In [8]:
### CLEANING OF THE 8M GRAPH
# Self-loops (edges from a node to itself) are removed in place from graph_8M.
print("LIMPIEZA DEL GRAFO DEL 8M:\n")
# Report how many self-loops the graph has before cleaning.
loops_before = nx.number_of_selfloops(graph_8M)
print("El grafo tiene {} loops.".format(loops_before))
# Collect the self-loop edges into a list first, then drop them all.
self_loops = list(nx.selfloop_edges(graph_8M))
graph_8M.remove_edges_from(self_loops)
# Confirm that every self-loop has been removed successfully.
loops_after = nx.number_of_selfloops(graph_8M)
print("Después de la limpieza el grafo tiene {} loops.\n".format(loops_after))
# Report the size of the cleaned graph.
node_total = graph_8M.number_of_nodes()
edge_total = graph_8M.number_of_edges()
print("El grafo resultante tiene un total de {} nodos y {} aristas\n".format(node_total, edge_total))


LIMPIEZA DEL GRAFO DEL 8M:

El grafo tiene 11139 loops.
Después de la limpieza el grafo tiene 0 loops.

El grafo resultante tiene un total de 472376 nodos y 869236 aristas



In [9]:
### CLEANING OF THE 9M AND 10M GRAPH
# Self-loops (edges from a node to itself) are removed in place from graph_post_8M.
print("LIMPIEZA DEL GRAFO POSTERIOR AL 8M:\n")
# Report how many self-loops the graph has before cleaning.
loops_before = nx.number_of_selfloops(graph_post_8M)
print("El grafo tiene {} loops.".format(loops_before))
# Collect the self-loop edges into a list first, then drop them all.
self_loops = list(nx.selfloop_edges(graph_post_8M))
graph_post_8M.remove_edges_from(self_loops)
# Confirm that every self-loop has been removed successfully.
loops_after = nx.number_of_selfloops(graph_post_8M)
print("Después de la limpieza el grafo tiene {} loops.\n".format(loops_after))
# Report the size of the cleaned graph.
node_total = graph_post_8M.number_of_nodes()
edge_total = graph_post_8M.number_of_edges()
print("El grafo resultante tiene un total de {} nodos y {} aristas\n".format(node_total, edge_total))


LIMPIEZA DEL GRAFO POSTERIOR AL 8M:

El grafo tiene 3610 loops.
Después de la limpieza el grafo tiene 0 loops.

El grafo resultante tiene un total de 292409 nodos y 471978 aristas



EXPORTACIÓN DE LOS GRAFOS AL FORMATO GRAPHML

In [10]:
# Export every cleaned graph to its own .graphml file in the Output folder.
graphml_exports = (
    (graph_pre_8M, r'C:\UOC\TFM\Procesado_tweets\03_Gen_grafos\Output\graph_pre_8M.graphml'),
    (graph_8M, r'C:\UOC\TFM\Procesado_tweets\03_Gen_grafos\Output\graph_8M.graphml'),
    (graph_post_8M, r'C:\UOC\TFM\Procesado_tweets\03_Gen_grafos\Output\graph_post_8M.graphml'),
)
# Files are written in the same order as before: pre-8M, 8M, post-8M.
for graph_to_export, graphml_path in graphml_exports:
    nx.write_graphml(graph_to_export, graphml_path)