# MongoDB

> humongous: of an extremely large size. https://en.wiktionary.org/wiki/humongous



### Characteristics

- databases -> collections -> documents
- document = JSON-like (nested) dictionaries
- no schema required
- easy to use
- made for processing tons of data
- built-in support for distributed architecture (replication, sharding)

In [None]:
!pip install pymongo

In [16]:
import tweepy

In [3]:
import pymongo

In [4]:
# is my mongodb-container running?
!docker-compose ps

         Name                   Command           State            Ports        
--------------------------------------------------------------------------------
06_data_pipeline_etl_1   python ./src/app.py      Exit 0                        
06_data_pipeline_mongo   docker-entrypoint.sh     Up       0.0.0.0:27017->27017/
db_1                     mongod                            tcp,:::27017->27017/t
                                                           cp                   
06_data_pipeline_tweet   python ./src/app.py      Exit 0                        
_collector_1                                                                    


## Connect to the DB Server

In [5]:
# connect to the database inside the container
client = pymongo.MongoClient(host='localhost', port=27017)

# connect to the database from inside another container
# client = pymongo.MongoClient('mongodb', 27017)

### List databases

In [6]:
# list available databases
client.list_database_names()

['admin', 'config', 'local', 'spiced']

### List collections

In [7]:
# use the spiced database
db = client.tweets

In [8]:
db

Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'tweets')

In [9]:
# list collections
db.list_collection_names()

[]

In [22]:
# count documents
db.tweets.count_documents({})

0

In [23]:
db.tweets.find_one({'name':'Malte'})

## Create

In [24]:
API_KEY = "AO9qiiu6FfwrIGll0hGu0cxeS"
API_SECRET = "q3FnJDzgV5vUiHt6d2hKSaLI2irAEV6juvzpywgixUr99CPQij"

In [25]:
def get_auth_handler():
    """
    Function for handling Twitter Authentication. See course material for 
    instructions on getting your own Twitter credentials.
    """
    auth = tweepy.OAuthHandler(API_KEY, API_SECRET)
    return auth

In [26]:
def get_full_text(status):
    """Returns the full text of a (re)tweet"""
    try:
        return status.retweeted_status.full_text
    except AttributeError:  # Not a Retweet
        return status.full_text

In [27]:
if __name__ == '__main__':
    auth = get_auth_handler()
    api = tweepy.API(auth)

    # cursor = tweepy.Cursor(
    #     api.user_timeline,
    #     id='guardian',
    #     tweet_mode='extended'
    #  )
    cursor = tweepy.Cursor(
        api.search, 
        q="covid uk  from:guardian ", 
        tweet_mode="extended", 
        lang='en',
        result_type='recent'
    )


In [28]:
for status in cursor.items(50):
        tweet = {
            'text': get_full_text(status),
            'username': status.user.screen_name,
            'followers_count': status.user.followers_count
            }

In [33]:
# insert a document

doc = tweet


# If the document does not have an _id field one will be added automatically!
db.tweets.insert_one(doc)

<pymongo.results.InsertOneResult at 0x7f2a28340b80>

In [51]:
# insert several documents

docs = [...]

db.students.insert_many()

TypeError: insert_many() missing 1 required positional argument: 'documents'

## Read

In [34]:
# most recent document
db.tweets.find_one()

{'_id': ObjectId('6141c538bd001f2eabba39aa'),
 'text': "Giving booster shots before the world is vaccinated won't keep the UK safe from Covid | Charlotte Summers https://t.co/LdAGr4H9tj",
 'username': 'guardian',
 'followers_count': 9910359}

In [37]:
# filter
db.tweets.find_one({'username':'guardian'})

{'_id': ObjectId('6141c538bd001f2eabba39aa'),
 'text': "Giving booster shots before the world is vaccinated won't keep the UK safe from Covid | Charlotte Summers https://t.co/LdAGr4H9tj",
 'username': 'guardian',
 'followers_count': 9910359}

In [38]:
# find several documents
for doc in db.tweets.find():
    print(doc)

{'_id': ObjectId('6141c538bd001f2eabba39aa'), 'text': "Giving booster shots before the world is vaccinated won't keep the UK safe from Covid | Charlotte Summers https://t.co/LdAGr4H9tj", 'username': 'guardian', 'followers_count': 9910359}


In [42]:
# find all documents where shoe size>40is greater than 1977
# https://docs.mongodb.com/manual/reference/operator/query/

db.tweets.find_one({'followers_count':{'$gt':90000}})

{'_id': ObjectId('6141c538bd001f2eabba39aa'),
 'text': "Giving booster shots before the world is vaccinated won't keep the UK safe from Covid | Charlotte Summers https://t.co/LdAGr4H9tj",
 'username': 'guardian',
 'followers_count': 9910359}

## Delete

In [47]:
db.students.delete_one({"name":"subha"})

<pymongo.results.DeleteResult at 0x7fa4a7ed0fc0>

In [47]:
db.tweets.count_documents({'followers_count':{'$gt':90000}})

1

In [None]:
# drop the collection
db.students.???