# Query tweets
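Query tweets that have been loaded into a MongoDB database.  
*First start the MongoDB service, using either of the following:*  
`mongod --config /usr/local/etc/mongod.conf` (run the server directly), or  
`brew services start mongodb-community` (run it as a Homebrew service)  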

In [1]:
# Load dependencies
from pymongo import MongoClient
from pymongo.errors import ConnectionFailure
import os
import re
import pandas as pd

In [2]:
# Configuration
# MongoDB host passed to MongoClient; None lets pymongo use its default host/port
mongo_host = None
# Name of the MongoDB *database* to open (used as client[client_name]);
# the collection queried inside it is "twitter"
client_name = "fundamentals"

In [3]:
# Create connection
try:
    client = MongoClient(mongo_host)
    # MongoClient connects lazily, so send a cheap command to force a round trip
    # to the server and surface connection problems right here.
    # "ping" replaces the deprecated "ismaster" command.
    client.admin.command('ping')
except ConnectionFailure as err:
    # Raise instead of calling exit(): in a notebook exit() shuts the kernel down,
    # whereas an exception stops execution and shows the underlying cause.
    raise RuntimeError("Connection to MongoDB server could not be established") from err

# Handles only - no I/O happens until a query is issued
db = client[client_name]
twitter_db = db.twitter

In [8]:
# Sanity-check the connection: an empty filter matches every document in the collection
tweet_count = twitter_db.count_documents({})
print("Amount of tweets:", tweet_count)

Amount of tweets: 657307


In [9]:
# Create pickle of DataFrame of all US tweets (tweet id + place.state)
# Single filter definition shared by the count and the export so they cannot drift apart
us_filter = {"place.country_code": "US"}
print("Number of US country tweets", twitter_db.count_documents(filter=us_filter))

# Only fetch the fields that are needed; "_id": 0 suppresses Mongo's own object id.
# No limit is passed, so every matching document is returned.
total_df = pd.DataFrame(
    twitter_db.find(
        filter=us_filter,
        projection={"id": 1, "place.state": 1, "_id": 0},
    )
)

# Write into ../datasets/ - that is the path the notebook reads the pickle back from
total_df.to_pickle("../datasets/state_id.pkl")

Number of US country tweets 593268


In [None]:
# Example MongoDB queries
# Distinct languages
# Printed explicitly: only the last expression of a cell is displayed automatically,
# so a bare call in the middle of the cell would silently discard its result.
print(twitter_db.distinct("lang"))

# Extract data from bounding box (first 5 US tweets, coordinates field only)
location = twitter_db.find(
    filter={"place.country_code": "US"},
    projection={"_id": 0, "place.bounding_box.coordinates": 1},
    limit=5,
)
for obj in location:
    print(obj["place"]['bounding_box']['coordinates'])

In [41]:
# Load the previously pickled DataFrame from the datasets folder.
# NOTE: unpickling can execute arbitrary code - only load pickle files you created yourself.
test_df = pd.read_pickle(filepath_or_buffer="../datasets/state_id.pkl")

In [42]:
# Flatten the state name out of each row's "place" dict into a "states" column.
# The column is built in one pass and assigned once; writing cell-by-cell with .at
# inside an itertuples loop is far slower on a frame with hundreds of thousands of rows.
# dict.get returns None when a place has no "state" key.
test_df["states"] = [place.get("state") for place in test_df["place"]]

In [45]:
# Preview the frame without the nested "place" column.
# NOTE(review): drop() returns a new DataFrame and the result is not assigned here,
# so test_df itself still contains "place" after this cell - confirm that is intended.
test_df.drop(columns="place")

Unnamed: 0,id,states
0,764039724818272256,Missouri
1,764039733076897792,Louisiana
2,764039769244348417,Missouri
3,764039849850482689,Maryland
4,764039917924069376,California
...,...,...
593263,775323190436438016,Florida
593264,775323208463503361,California
593265,775323214801018881,Florida
593266,775323236137508864,Alaska


In [46]:
# Persist the frame (now including the "states" column) to the same path it was
# loaded from, overwriting the existing pickle.
test_df.to_pickle(path="../datasets/state_id.pkl")

In [9]:
# Count tweets whose user mentions include each candidate's account, and tweets
# that satisfy both mention conditions at once.
names = ['realDonaldTrump', 'HillaryClinton']
mention_field = "entities.user_mentions.screen_name"
trump_query = {mention_field: names[0]}
clinton_query = {mention_field: names[1]}

# Label -> filter, in the order the results should be printed
mention_queries = {
    "Trump": trump_query,
    "Clinton": clinton_query,
    "Both": {"$and": [trump_query, clinton_query]},
}
for label, query in mention_queries.items():
    print("{}: {}".format(label, twitter_db.count_documents(query)))

Trump: 325834
Clinton: 140740
Both: 28253
