In [1]:
import pandas as pd 
import json
from neo4j import GraphDatabase

In [2]:
def json_to_pd(filename):
    with open(filename) as f:
        data = json.load(f)
        df = pd.json_normalize(data)
        f.close()
    return df

# fetching and cleaning the data
# tweets
df_tweets_wayne = json_to_pd('twitter_files/tweets_wayne.json')
df_tweets_glenn = json_to_pd('twitter_files/tweets_glenn.json')
df_tweets_mariah = pd.read_csv('twitter_files/tweets_mariah.csv')
df_tweets_mariah.rename(columns={'like_count': 'public_metrics.like_count', 'retweet_count':'public_metrics.retweet_count','reply_count':'public_metrics.reply_count', 'quote_count':'public_metrics.quote_count'}, inplace =True)

# followers
df_followers_cyrus = json_to_pd('twitter_files/followers.json')
df_followers_mariah = pd.read_csv('twitter_files/followers_mariah.csv')
df_followers_mariah.rename(columns={'username': 'screen_name', 'following_count': 'friends_count'}, inplace=True)

#following 
df_following_elon = json_to_pd('twitter_files/following.json')

In [3]:
df_tweets_wayne = df_tweets_wayne[['id', 'text', 'public_metrics.retweet_count', 'public_metrics.reply_count', 'public_metrics.like_count', 'public_metrics.quote_count']]
df_tweets_glenn = df_tweets_glenn[['id', 'text', 'public_metrics.retweet_count', 'public_metrics.reply_count', 'public_metrics.like_count', 'public_metrics.quote_count']]
df_tweets_mariah = df_tweets_mariah[['id', 'text', 'public_metrics.retweet_count', 'public_metrics.reply_count', 'public_metrics.like_count', 'public_metrics.quote_count']]
df_tweets_elon = pd.concat([df_tweets_wayne, df_tweets_glenn, df_tweets_mariah], axis=0).drop_duplicates().reset_index(drop=True)

df_followers_cyrus = df_followers_cyrus[['id', 'screen_name', 'followers_count', 'friends_count']]
df_followers_mariah = df_followers_mariah[['id', 'screen_name', 'followers_count', 'friends_count']]
df_followers_elon = pd.concat([df_followers_cyrus, df_followers_mariah], axis=0).drop_duplicates().reset_index(drop=True)

df_following_elon = df_following_elon[['id', 'screen_name', 'followers_count', 'friends_count']]

In [4]:
# removing nan in case 
df_tweets_elon.dropna(inplace=True)
df_followers_elon.dropna(inplace=True)
df_followers_elon.dropna(inplace=True)

In [23]:
capped_followers =df_followers_elon.sample(n=100000)

In [24]:
df_nodes_of_persons = pd.concat([df_following_elon,capped_followers]).drop_duplicates().reset_index(drop=True)

In [25]:
df_nodes_of_persons

Unnamed: 0,id,screen_name,followers_count,friends_count
0,138882249,konstructivizm,518434,39402
1,467823431,RepJeffries,822078,1062
2,539965863,DuffelBlog,46648,18
3,93714983,csa_asc,357903,1275
4,1390762874809761793,DefiantLs,928782,68
...,...,...,...,...
100171,1598491306497654784,LeeTanden,0,4
100172,1598294283936735238,SydneyDetweile2,0,49
100173,1598374297944395782,Martin_twox,2,10
100174,1598436597510426650,Shyma97079228,0,32


In [12]:
# graph = GraphDatabase.driver("bolt://0.0.0.0:7687", auth=("neo4j", "test1234"), encrypted=False)
graph = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "test1234"), encrypted=False)

In [13]:
with graph.session() as session:
    session.run("MATCH (n) DETACH DELETE n")

In [26]:
def create_person(tx, id, screen_name, followers_count, friends_count):
    return tx.run("CREATE (a:Person {id:$id, screen_name:$screen_name, followers_count:$followers_count, friends_count:$friends_count}) "
                  "RETURN id(a)", id=id, screen_name=screen_name, followers_count=followers_count, friends_count=friends_count).single().value()

def create_tweet(tx, id, text, retweet_count, reply_count, like_count, quote_count):
    return tx.run("CREATE (a:Tweet {id:$id, text:$text, retweet_count:$retweet_count, reply_count:$reply_count, like_count:$like_count, quote_count:$quote_count}) "
                  "RETURN id(a)", id=id, text=text, retweet_count=retweet_count, reply_count=reply_count, like_count=like_count, quote_count=quote_count).single().value()

def create_follows(tx, id_a, id_b, screen_name_a, screen_name_b):
    return tx.run("MATCH (a:Person {id: $id_a, screen_name:$screen_name_a})"
                  "MATCH (b:Person {id: $id_b, screen_name: $screen_name_b})"
                  "CREATE (a)-[r:FOLLOWS]->(b)"
                  "RETURN type(r)", id_a=id_a, id_b=id_b, screen_name_a=screen_name_a, screen_name_b=screen_name_b).single().value()

def create_tweets(tx, id_p, id_t, screen_name, text):
    return tx.run("MATCH (a:Person {id: $id_p, screen_name:$screen_name})"
                  "MATCH (b:Tweet {id: $id_t, text:$text})"
                  "CREATE (a)-[r:TWEETS]->(b)"
                  "RETURN type(r)", id_p=id_p, id_t=id_t, screen_name=screen_name, text=text).single().value()

In [10]:
df_tweets_elon

Unnamed: 0,id,text,public_metrics.retweet_count,public_metrics.reply_count,public_metrics.like_count,public_metrics.quote_count
0,1601894132573605888,My pronouns are Prosecute/Fauci,51399,26239,319923,8690
1,1601886758987104256,@murraymints82 Oh it is coming bigtime …,3534,1485,26821,545
2,1601885190824943617,Now things get spicy 🌶️,13336,9316,152363,1353
3,1601884478745374720,https://t.co/yhEOieaGER,26058,7392,187244,1999
4,1601883606753824768,@ParikPatelCFA 🤖 💀 🥂,300,276,7015,21
...,...,...,...,...,...,...
6830,1604486844867133442,Follow The World Cup on Twitter! https://t.co/...,5998,3465,64745,391
6831,1604477567800676352,"@neontaster Agreed, a high priority software u...",1429,1093,19086,121
6832,1604342060617695232,Hallelujah!! https://t.co/i2FyvXPIHO,15825,17110,193680,3274
6833,1604280647937396736,@hodgetwins 🧐 Much will come to light as Fau...,18686,5307,146649,1076


In [28]:
df_tweets_elon.drop_duplicates(inplace=True)

In [30]:
df_tweets_elon['public_metrics.like_count']

0       319923
1        26821
2       152363
3       187244
4         7015
         ...  
6830     64745
6831     19086
6832    193680
6833    146649
6834     98257
Name: public_metrics.like_count, Length: 6835, dtype: int64

In [27]:
# storing data in neo4j 
with graph.session() as session:
    # elon musk
    session.execute_write(create_person, 44196397, 'elonmusk', 127555398, 176)

    # tweets
    for idx, row in df_tweets_elon.iterrows():
        session.execute_write(create_tweet,row['id'], row['text'], row['public_metrics.retweet_count'], row['public_metrics.reply_count'], row['public_metrics.like_count'], row['public_metrics.quote_count'])  
        session.execute_write(create_tweets, 44196397, row['id'], 'elonmusk', row['text'])

    # nodes of Persons 
    for idx, row in df_nodes_of_persons.iterrows():
        session.execute_write(create_person, row['id'], row['screen_name'], row['followers_count'], row['friends_count'])

    # following 
    for idx, row in df_following_elon.iterrows():
        session.execute_write(create_follows, 44196397, row['id'],  'elonmusk', row['screen_name'])

    # followers 
    for idx, row in capped_followers.iterrows():
        session.execute_write(create_follows, row['id'], 44196397, row['screen_name'], 'elonmusk')

  warn("Expected a result with a single record, "


In [46]:
# Elon Musk's famous / controversial tweets 
with graph.session() as session:
    famous_tweets_by_like = session.run("MATCH (n:Tweet) WITH n ORDER BY n.like_count DESC RETURN n.text LIMIT 20").data()
    contro_tweets_by_reply = session.run("MATCH (n:Tweet) WITH n ORDER BY n.replt_count DESC RETURN n.text LIMIT 20").data()

In [48]:
famous_df = pd.DataFrame([dict(_) for _ in famous_tweets_by_like])
contro_df = pd.DataFrame([dict(_) for _ in contro_tweets_by_reply])

In [80]:
famous_df.drop_duplicates().reset_index()

Unnamed: 0,index,n.text
0,0,the bird is freed
1,2,Comedy is now legal on Twitter
2,4,https://t.co/rbwbsLA1ZG
3,6,https://t.co/kGncG7Hs3M
4,8,Twitter is ALIVE
5,10,https://t.co/G83vCrHHJf
6,12,Entering Twitter HQ – let that sink in! https:...
7,14,"Trash me all day, but it’ll cost $8"
8,16,I love when people complain about Twitter … on...
9,18,🇲🇦🇲🇦 Congrats Morocco!! 🇲🇦🇲🇦


In [81]:
print(famous_df['n.text'])

0                                     the bird is freed
2                        Comedy is now legal on Twitter
4                               https://t.co/rbwbsLA1ZG
6                               https://t.co/kGncG7Hs3M
8                                      Twitter is ALIVE
10                              https://t.co/G83vCrHHJf
12    Entering Twitter HQ – let that sink in! https:...
14                  Trash me all day, but it’ll cost $8
16    I love when people complain about Twitter … on...
18                         🇲🇦🇲🇦 Congrats Morocco!! 🇲🇦🇲🇦
19                      My pronouns are Prosecute/Fauci
Name: n.text, dtype: object


In [50]:
contro_df.drop_duplicates().reset_index()

Unnamed: 0,index,n.text
0,0,@murraymints82 Oh it is coming bigtime …
1,1,Now things get spicy 🌶️
2,2,https://t.co/yhEOieaGER
3,3,@ParikPatelCFA 🤖 💀 🥂
4,4,Twitter is speeding up
5,5,@AllanObare4 Yes
6,6,Twitter was Wormtongue to the World
7,7,Saruman … your staff is broken.\nhttps://t.co/...
8,8,RT @SpaceX: Deployment of ispace’s HAKUTO-R Mi...
9,9,@GailAlfarATX The bots/trolls generate fake ac...


In [84]:
print(contro_df['n.text'])

0              @murraymints82 Oh it is coming bigtime …
1                               Now things get spicy 🌶️
2                               https://t.co/yhEOieaGER
3                                  @ParikPatelCFA 🤖 💀 🥂
4                                Twitter is speeding up
5                                      @AllanObare4 Yes
6                   Twitter was Wormtongue to the World
7     Saruman … your staff is broken.\nhttps://t.co/...
8     RT @SpaceX: Deployment of ispace’s HAKUTO-R Mi...
9     @GailAlfarATX The bots/trolls generate fake ac...
10                         @F1Womble No later than noon
11              The bots are in for a surprise tomorrow
12                                        @nichegamer 💯
13                @stillgray Traceroute woke_mind_virus
14                           @Liz_Wheeler Good question
15                                        @hodgetwins 🎯
16    Deplatforming the President (cont.) https://t....
17            @Twitter And many other features t

In [64]:
# extracting JP's tweets to neo4j 
df_tweets_jp = json_to_pd('twitter_files/dr_jp_tweets.json')
df_tweets_jp = df_tweets_jp[['id', 'text', 'retweet_count']]

In [67]:
df_tweets_jp.dropna()

Unnamed: 0,id,text,retweet_count
0,1602966550226649089,@Football__Tweet Certainly @johanzammit. Last...,0
1,1601488667172012032,"Our book chapter ""#ML for Metabolomic Pathway ...",0
2,1600450851125878784,Christmas would not be the same without the #A...,1
3,1600110659076620288,"@dwejjaq_enormi if you have to explain it, it'...",0
4,1600045482830610432,RT @danvlla: First ever publication! 😁 Our pap...,3
...,...,...,...
1131,242690246955921408,Gangs of new England http://t.co/yAzSgcWn,0
1132,241277410849456128,Facebook analytics done well: http://t.co/IZD...,0
1133,240907264779776001,Just when you thought you were going to stop f...,0
1134,240578233081417728,Interesting insight about working for #stripe ...,0


In [65]:
# adding dr_jpe as a node 
with graph.session() as session:
    session.execute_write(create_person, 384337198, 'dr_jpe', 788, 2393)

In [69]:
with graph.session() as session:
    for idx, row in df_tweets_jp.iterrows():    
        session.execute_write(create_tweet,row['id'], row['text'], row['retweet_count'], 0, 0, 0)  
        session.execute_write(create_tweets, 384337198, row['id'], 'dr_jpe', row['text'])

  warn("Expected a result with a single record, "
