In [1]:
from pymongo import MongoClient
import numpy as np
import pandas as pd

from twitter_api_functions import tweets_db_to_df

In [2]:
# Start the MongoDB daemon first (e.g. with the shell command below),
# then connect to it through pymongo using the default connection settings.

#!mongod
client = MongoClient()
database_names = client.list_database_names()
database_names

['admin', 'books', 'config', 'local', 'outings', 'twitter']

In [3]:
# handle to the 'twitter' database, followed by the names of its collections
twitter = client['twitter']
twitter.list_collection_names()

['tweets_2019_large', 'tweets_all_years']

In [4]:
# handles to the two tweet collections used throughout the notebook
large_2019 = twitter['tweets_2019_large']
all_years = twitter['tweets_all_years']

# number of tweets in each collection (an empty filter matches every document)
for collection in (all_years, large_2019):
    print(collection.count_documents({}))

5460
23864


In [5]:
# preview the first five tweets from 2007: ID, tweet text, year, and user location
fields_2007 = {'_id': 0, 'id': 1, 'text': 1, 'year': 1, 'user.location': 1}
cursor = all_years.find({'year': 2007}, fields_2007).limit(5)
list(cursor)

[{'id': 544845252,
  'text': 'Anyone involved in artificial intelligence work? Specifically, anyone know of any AI system that can RECOGNISE puns in natural language?',
  'user': {'location': 'Malham Close, Crawley, West Su'},
  'year': 2007},
 {'id': 532130162,
  'text': '読書中 Paradigms of Artificial Intelligence Programming: Case Studies in Common Lisp',
  'user': {'location': 'JP (Japan, Japan)'},
  'year': 2007},
 {'id': 531926162,
  'text': 'Update on geekgirl LOVE AND SEX WITH ROBOTS: If advances in artificial intelligence cont.. http://tinyurl.com/2vykx7',
  'user': {'location': 'Melbourne'},
  'year': 2007},
 {'id': 524153882,
  'text': '"A.I. Artificial Intelligence" a Speilberg movie based on a Brian Aldiss  short story is very creepy.  Like it mo\' better than "I Robot."',
  'user': {'location': 'Barsoom!'},
  'year': 2007},
 {'id': 523822582,
  'text': 'Paradigms of Artificial Intelligence Programming を注文した♪',
  'user': {'location': 'JP (Japan, Japan)'},
  'year': 2007}]

In [6]:
# preview the first five tweets from 2019, with both the truncated text
# and the full text of the retweeted status
fields_2019 = {
    '_id': 0,
    'id': 1,
    'text': 1,
    'year': 1,
    'user.location': 1,
    'retweeted_status.extended_tweet.full_text': 1,
}
cursor = all_years.find({'year': 2019}, fields_2019).limit(5)
list(cursor)

[{'id': 1161789559715196928,
  'text': 'RT @rising_serpent: Once you combine this gigantic amorphous mass of the human gene code with artificial intelligence programming, the resu…',
  'user': {'location': None},
  'retweeted_status': {'extended_tweet': {'full_text': "Once you combine this gigantic amorphous mass of the human gene code with artificial intelligence programming, the results are far beyond the capacity of my meager imagination.\nI've ranted enough. Gotta go earn a living so that the government can give my money away.\n-Fin"}},
  'year': 2019},
 {'id': 1161789531328319488,
  'text': 'RT @UiPath: Do you believe in the power of #AI education? Then vote to bring UiPath panel to #SXSWEDU 2020 and together with @ByteBackDC, @…',
  'user': {'location': None},
  'retweeted_status': {'extended_tweet': {'full_text': 'Do you believe in the power of #AI education? Then vote to bring UiPath panel to #SXSWEDU 2020 and together with @ByteBackDC, @EdSurge and @PerScholas we’ll address th

In [7]:
# count the tweets in the 'all_years' collection that carry a geocode
# (only 15 when this notebook was last run)
geo_filter = {'geo': {'$ne': None}}

# cursor over the geocoded tweets (geo + text only), kept for later inspection
cursor = all_years.find(geo_filter, {'_id': 0, 'geo': 1, 'text': 1})

# Cursor.count() is deprecated (and removed in PyMongo 4); count on the
# collection instead, as is already done for the collection totals.
all_years.count_documents(geo_filter)

  This is separate from the ipykernel package so we can avoid doing imports until


15

In [8]:
# pull every tweet from the 'all_years' collection into a DataFrame,
# then report its dimensions and show the first rows
all_years_df = tweets_db_to_df(all_years)
print(all_years_df.shape)
all_years_df.head(5)

(5460, 4)


Unnamed: 0,ID,location,text,year
0,544845252,"Malham Close, Crawley, West Su",Anyone involved in artificial intelligence wor...,2007
1,532130162,"JP (Japan, Japan)",読書中 Paradigms of Artificial Intelligence Progr...,2007
2,531926162,Melbourne,Update on geekgirl LOVE AND SEX WITH ROBOTS: I...,2007
3,524153882,Barsoom!,"""A.I. Artificial Intelligence"" a Speilberg mov...",2007
4,523822582,"JP (Japan, Japan)",Paradigms of Artificial Intelligence Programmi...,2007


In [9]:
# text of the first 2019 tweet in the DataFrame
is_2019 = all_years_df['year'] == 2019
tweets_2019 = all_years_df.loc[is_2019].reset_index()
tweets_2019['text'][0]

"Once you combine this gigantic amorphous mass of the human gene code with artificial intelligence programming, the results are far beyond the capacity of my meager imagination.\nI've ranted enough. Gotta go earn a living so that the government can give my money away.\n-Fin"

In [10]:
# pull every tweet from the 'large_2019' collection into a DataFrame,
# then report its dimensions and show the first rows
large_2019_df = tweets_db_to_df(large_2019)
print(large_2019_df.shape)
large_2019_df.head(5)

(23864, 4)


Unnamed: 0,ID,location,text,year
0,1161789559715196928,,Once you combine this gigantic amorphous mass ...,2019
1,1161789531328319488,,Do you believe in the power of #AI education? ...,2019
2,1161789508771336195,"Bordeaux, France",RT MikeQuindazzi: 7 #AI terms in the #Futureof...,2019
3,1161789506426671104,"Santiago, Chile",What if technology could help you be anywhere ...,2019
4,1161789505889820672,"Bordeaux, France",RT MikeQuindazzi: #BigData sets unlocking $45 ...,2019


In [122]:
# persist both DataFrames as pickle files (paths are relative to the working directory)
for frame, pickle_path in (
    (large_2019_df, 'tweets_2019_large_df_reduced.pickle'),
    (all_years_df, 'tweets_all_years_df_reduced.pickle'),
):
    frame.to_pickle(pickle_path)