In [1]:
from itertools import combinations

import networkx as nx
import pandas as pd
import psycopg2
from matplotlib import pylab as plt

from config import config

%matplotlib inline


In [2]:
# Open the PostgreSQL connection and run a quick sanity query against it.
# `conn` is deliberately left open at the end of this cell so that it can be
# reused for further queries; on failure the error is printed and `conn`
# keeps whatever value it had when the error occurred (None if connecting failed).
conn = None
try:
    # read connection parameters
    params = config()

    # connect to the PostgreSQL server
    print('Connecting to the PostgreSQL database...')
    conn = psycopg2.connect(**params)

    # The context manager closes the cursor even when the query raises,
    # so a failed statement no longer leaks an open cursor.
    with conn.cursor() as cur:
        # execute a statement
        print('PostgreSQL database version:')
        cur.execute('SELECT version()')

        # display the PostgreSQL database server version
        db_version = cur.fetchone()
        print(db_version)

except Exception as error:
    # psycopg2.DatabaseError derives from Exception, so this single clause
    # also covers every database error.
    print(error)

Connecting to the PostgreSQL database...
PostgreSQL database version:
('PostgreSQL 10.3 on x86_64-pc-linux-gnu, compiled by gcc (GCC) 7.3.0, 64-bit',)


In [3]:
# Pull every row of tweets_info into a DataFrame, parsing created_at as datetimes.
tweets_query = "SELECT * FROM tweets_info;"
tweets = pd.read_sql_query(sql=tweets_query, con=conn, parse_dates=['created_at'])

tweet_count = len(tweets)
print("Number of Tweets: %s" % tweet_count)

Number of Tweets: 286330


In [4]:
# Preview the first five tweets to check the loaded columns.
tweets.head(n=5)

Unnamed: 0,id,user_id,text,created_at,source,lang,truncated,is_retweet,retweet_id,is_quote,...,quote_count,reply_count,retweet_count,favorite_count,favorited,retweeted,hashtags,user_mentions,number_of_urls,extracted
0,951624637900820481,1658019493,Ihr huensöhne,2018-01-12 01:19:28,"<a href=""http://twitter.com/download/android"" ...",de,False,False,,False,...,0,0,0,0,False,False,,,0,False
1,951624793098514433,2394270660,#homo #kostenlos Teenie wird anal zerfetzt (22...,2018-01-12 01:20:05,"<a href=""http://serviporno.com"" rel=""nofollow""...",de,False,False,,False,...,0,0,0,0,False,False,homo kostenlos,,1,False
2,951624818239180803,3414086001,RT @HoerstelC: #CIA-#Chef #Pompeo dummer #Poli...,2018-01-12 01:20:11,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",de,False,True,9.504821e+17,False,...,0,0,0,0,False,False,CIA Chef Pompeo Politclown FoxNews Iraner Demo...,463758197.0,0,False
3,951624847616077824,2936727814,"Es gibt einen neuen Deal: (1x, 2x oder 3x Yvel...",2018-01-12 01:20:18,"<a href=""https://Premium-Lizenz.de/"" rel=""nofo...",de,True,False,,False,...,0,0,0,0,False,False,,,0,False
4,951623218527395841,3615347356,Die Nationalitätten Feststellung könnte sehr s...,2018-01-12 01:13:50,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",de,True,False,,False,...,0,0,1,2,False,False,,,1,True


In [5]:
# Build a long-format table with one row per (tweet, hashtag) pair.
# A tweet's `hashtags` value is a single space-separated string, or missing
# when the tweet carries no hashtags.
hashtag_columns = ['created_at', 'hashtag', 'id', 'user_id']

# notna() rejects both None and NaN, so a missing value can never reach
# str.split below (a plain `!= None` check lets NaN through).
tagged_tweets = tweets[tweets['hashtags'].notna()]

# Iterating the needed columns with zip avoids building a Series per row,
# which is what makes iterrows() slow on a table of this size.
entry_list = [
    {'created_at': created_at,
     'hashtag': hashtag,
     'id': tweet_id,
     'user_id': user_id}
    for created_at, tag_string, tweet_id, user_id in zip(
        tagged_tweets['created_at'],
        tagged_tweets['hashtags'],
        tagged_tweets['id'],
        tagged_tweets['user_id'],
    )
    for hashtag in tag_string.split(" ")
    if hashtag  # drop the '' produced by an empty string or doubled spaces
]

# created_at stays a regular column and the frame keeps its default integer index.
hashtags = pd.DataFrame(data=entry_list, columns=hashtag_columns)
hashtags.head()

Unnamed: 0,created_at,hashtag,id,user_id
0,2018-01-12 01:20:05,homo,951624793098514433,2394270660
1,2018-01-12 01:20:05,kostenlos,951624793098514433,2394270660
2,2018-01-12 01:20:11,CIA,951624818239180803,3414086001
3,2018-01-12 01:20:11,Chef,951624818239180803,3414086001
4,2018-01-12 01:20:11,Pompeo,951624818239180803,3414086001


In [31]:
# Build an undirected hashtag co-occurrence graph: every hashtag is a node and
# two hashtags are connected when they appear together in the same tweet.
G = nx.Graph()

# Register every hashtag first so that hashtags which never co-occur with
# another one still show up as isolated nodes.
G.add_nodes_from(hashtags['hashtag'])

# Group once by tweet id instead of re-filtering the whole frame for every row
# (the per-row filter is quadratic in the number of hashtag rows).
# sort=False keeps tweets in order of first appearance.
for _, tweet_tags in hashtags.groupby('id', sort=False)['hashtag']:
    # unique() drops repeated hashtags within a tweet, and combinations() never
    # pairs an element with itself, so no self-loop edges are created.
    G.add_edges_from(combinations(tweet_tags.unique(), 2))

In [None]:
# Draw the hashtag graph with networkx's default layout and styling.
nx.draw(G, pos=None)

In [30]:
# Materialise every clique reported by nx.find_cliques and print them as a list of lists.
cliques = list(nx.find_cliques(G))
print(cliques)

[['bkagjlad', 'Aurora', 'ersten', 'Schwanz', 'Anny', 'Teen', 'und'], ['allein', 'Pompeo', 'Chef', 'CIA', 'Iraner', 'angefangen', 'Grotesk', 'FoxNews', 'Politclown', 'Demonstrationen'], ['von', 'Porn'], ['kostenlos', 'homo'], ['CastropRauxel', 'Porn'], ['die', 'Porn'], ['sie', 'Porn'], ['Porn', 'verführt']]
