# Functions to interact with MongoDB


## 1. Setting up mongo container

In [9]:
# Start a detached (-d) container named "my_mongo" from the mongo:latest image; --rm removes it
# when it stops. ../mongo_docker/data is bind-mounted on /data/db and port 27017 is published.
! docker container run --name my_mongo -d --rm -v `pwd`/../mongo_docker/data:/data/db -p 27017:27017 mongo:latest

bc6b1b50cf4e0425643a493558bb395c96ab5e837101a0e092c1b7d92635b6ff


In [10]:
# List the running containers to check that "my_mongo" is up.
! docker container ls 

CONTAINER ID        IMAGE               COMMAND                  CREATED             STATUS              PORTS                      NAMES
bc6b1b50cf4e        mongo:latest        "docker-entrypoint.s…"   3 seconds ago       Up 2 seconds        0.0.0.0:27017->27017/tcp   my_mongo
d0f0d3339ddd        a1dd85df1762        "python -m uvicorn a…"   About an hour ago   Up About an hour    0.0.0.0:8000->8000/tcp     adoring_nightingale


## 2. Connection with Pymongo

In [11]:
from pymongo import MongoClient

# Connect to the MongoDB server reachable at localhost:27017.
client = MongoClient(host="mongodb://localhost:27017")

In [12]:
# Names of the databases visible through this client.
client.list_database_names()

['admin', 'config', 'local', 'test']

In [116]:
# Names of the collections inside the "test" database.
client["test"].list_collection_names()

['my_collection']

In [117]:
# Take the "my_collection" collection of the "test" database, then print its document
# count (an empty filter matches every document) and one sample document.
col = client["test"]["my_collection"]
print(col.count_documents(filter={}))
print(col.find_one())

4
{'_id': ObjectId('633c583217bb780a9c10ebc4')}


## 3. Exploring data

In [20]:
import pandas as pd

# Load the CNN articles CSV into a DataFrame and preview its first rows.
df = pd.read_csv("../data/CNN_Articels_clean/CNN_Articels_clean.csv")
df.head()

Unnamed: 0,Index,Author,Date published,Category,Section,Url,Headline,Description,Keywords,Second headline,Article text
0,0,"Jacopo Prisco, CNN",2021-07-15 02:46:59,news,world,https://www.cnn.com/2021/07/14/world/tusimple-...,"There's a shortage of truckers, but TuSimple t...",The e-commerce boom has exacerbated a global t...,"world, There's a shortage of truckers, but TuS...","There's a shortage of truckers, but TuSimple t...","(CNN)Right now, there's a shortage of truck d..."
1,2,"Stephanie Bailey, CNN",2021-05-12 07:52:09,news,world,https://www.cnn.com/2021/05/12/world/ironhand-...,Bioservo's robotic 'Ironhand' could protect fa...,Working in a factory can mean doing the same t...,"world, Bioservo's robotic 'Ironhand' could pro...",A robotic 'Ironhand' could protect factory wor...,(CNN)Working in a factory or warehouse can me...
2,3,"Words by Stephanie Bailey, video by Zahra Jamshed",2021-06-16 02:51:30,news,asia,https://www.cnn.com/2021/06/15/asia/swarm-robo...,This swarm of robots gets smarter the more it ...,"In a Hong Kong warehouse, a swarm of autonomou...","asia, This swarm of robots gets smarter the mo...",This swarm of robots gets smarter the more it ...,"(CNN)In a Hong Kong warehouse, a swarm of aut..."
3,4,"Paul R. La Monica, CNN Business",2022-03-15 09:57:36,business,investing,https://www.cnn.com/2022/03/15/investing/brics...,Russia is no longer an option for investors. T...,"For many years, the world's most popular emerg...","investing, Russia is no longer an option for i...",Russia is no longer an option for investors. T...,"New York (CNN Business)For many years, the wor..."
4,7,Reuters,2022-03-15 11:27:02,business,business,https://www.cnn.com/2022/03/15/business/russia...,Russian energy investment ban part of new EU s...,The European Union formally approved on Tuesda...,"business, Russian energy investment ban part o...",EU bans investment in Russian energy in new sa...,The European Union formally approved on Tuesda...


In [26]:
# Boolean mask: True where the Keywords value is exactly "<Category>, <Headline>".
a = (df["Category"] + ", " + df["Headline"]).eq(df["Keywords"])

# Number of rows for which that holds.
int(a.sum())

1286

In [27]:
# Keywords of the rows that do NOT follow the "<Category>, <Headline>" pattern.
# `~` is the boolean-inversion operator for a mask; unary minus is not defined for
# NumPy boolean arrays and only works on a boolean Series through a pandas special case.
df.loc[~a, "Keywords"]

0       world, There's a shortage of truckers, but TuS...
1       world, Bioservo's robotic 'Ironhand' could pro...
2       asia, This swarm of robots gets smarter the mo...
3       investing, Russia is no longer an option for i...
5       media, Anti-war protester interrupts live Russ...
                              ...                        
4071    tennis, Australian Open: Australia's vaccine m...
4072    golf, Four golfers test positive ahead of Sout...
4073    tennis, Peng Shuai: 'Unanimous conclusion' tha...
4074    europe, This company is "zapping" cow dung wit...
4075    golf, Tiger Woods: Is this the end of his era?...
Name: Keywords, Length: 2790, dtype: object

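The `Keywords` column is little more than the category or section followed by the headline, so we'll build our own keywords by tokenizing it and dropping stop words and punctuation: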

In [54]:
from nltk.corpus import stopwords
# NLTKWordTokenizer must be imported explicitly: the cell instantiates it below and would
# otherwise fail with a NameError on a fresh kernel.
from nltk.tokenize import NLTKWordTokenizer, word_tokenize

tokenizer = NLTKWordTokenizer()

# English stop words, printed once so the size of the base list is visible.
stop_words = set(stopwords.words("english"))
print(len(stop_words))

# Also discard punctuation and possessive tokens emitted by the tokenizer.
stop_words.update([",", ".", ":", "?", "!", ";", "-", "'s", "'"])

# Lower-case each Keywords string, tokenize it, then keep only the tokens that are
# neither stop words nor punctuation.
keywords = df["Keywords"].str.lower().apply(tokenizer.tokenize)
keywords = keywords.apply(lambda x: [i for i in x if i not in stop_words])

df["new_keywords"] = keywords

179


In [55]:
# Number of articles per category.
df["Category"].value_counts()

sport            2176
news             1611
business          104
politics           75
entertainment      58
health             52
Name: Category, dtype: int64

In [56]:
# Number of articles per section.
df["Section"].value_counts()

sport             1088
europe             800
football           618
uk                 376
tennis             197
golf               173
us                 134
australia          114
motorsport         100
politics            75
entertainment       56
health              52
africa              49
opinions            45
world               44
perspectives        25
business            23
weather             18
tech                15
investing            9
asia                 9
success              9
economy              8
living               7
homes                6
china                5
middleeast           5
energy               3
americas             3
business-money       2
media                2
celebrities          1
movies               1
india                1
intl_world           1
business-food        1
cars                 1
Name: Section, dtype: int64

In [59]:
# Normalize the column names to lower snake_case ("Date published" -> "date_published").
df.columns = [name.lower().replace(" ", "_") for name in df.columns]
df.columns

Index(['index', 'author', 'date_published', 'category', 'section', 'url',
       'headline', 'description', 'keywords', 'second_headline',
       'article_text', 'new_keywords'],
      dtype='object')

## 4. Insertion

In [None]:
import pandas as pd

# Read the CSV again and preview it.
# NOTE(review): presumably reloaded so that df has its original column names again
# (the keyword cell below indexes df["Keywords"]) — confirm.
df = pd.read_csv("../data/CNN_Articels_clean/CNN_Articels_clean.csv")
df.head()

In [102]:
# No host is given, so MongoClient uses its default address (localhost:27017).
client = MongoClient()
# Handle on the "articles" database; the insertion cell writes one collection per category into it.
db = client["articles"]

In [None]:
from nltk.corpus import stopwords
# NLTKWordTokenizer must be imported explicitly: the cell instantiates it below and would
# otherwise fail with a NameError on a fresh kernel.
from nltk.tokenize import NLTKWordTokenizer, word_tokenize

tokenizer = NLTKWordTokenizer()

# English stop words, printed once so the size of the base list is visible.
stop_words = set(stopwords.words("english"))
print(len(stop_words))

# Also discard punctuation and possessive tokens emitted by the tokenizer.
stop_words.update([",", ".", ":", "?", "!", ";", "-", "'s", "'"])

# Lower-case each Keywords string, tokenize it, then keep only the tokens that are
# neither stop words nor punctuation.
keywords = df["Keywords"].str.lower().apply(tokenizer.tokenize)
keywords = keywords.apply(lambda x: [i for i in x if i not in stop_words])

df["new_keywords"] = keywords

In [None]:
# Rename every column to lower snake_case so the insertion cell can use names such as
# "date_published" and "article_text".
df.columns = df.columns.map(lambda name: name.replace(" ", "_").lower())
df.columns

In [112]:
import datetime
import hashlib

from tqdm import tqdm

# Number of articles per category written to MongoDB; later ones are held back in `remaining`.
MAX_INSERTED_PER_CATEGORY = 10

# How many articles of each category have been seen so far.
category_counter = {}

# Articles that were not inserted (kept with their category, raw string date, no hash/events).
remaining = []

records = df[["author", "date_published", "category", "section", "url", "headline", "new_keywords", "article_text"]].to_dict(orient="records")

# Empty every per-category collection first so re-running the cell does not duplicate documents.
for c in df["category"].unique():
    db[c].delete_many({})


for r in tqdm(records):
    # The category selects the target collection, so it is not stored inside the document.
    category = r.pop("category")
    r["keywords"] = r.pop("new_keywords")

    if category_counter.get(category, 0) >= MAX_INSERTED_PER_CATEGORY:
        # Quota reached: put the category back on the record and set it aside.
        r["category"] = category
        remaining.append(r)
    else:
        r["date_published"] = datetime.datetime.fromisoformat(r["date_published"])
        # The built-in hash() of a str is salted per interpreter process, so it cannot serve as a
        # stored fingerprint; SHA-256 yields the same digest for the same text on every run.
        r["hash"] = hashlib.sha256(str(r["article_text"]).encode("utf-8")).hexdigest()
        # Timezone-aware UTC timestamp (datetime.utcnow() is deprecated since Python 3.12).
        r["events"] = {"insertion": {"date": datetime.datetime.now(datetime.timezone.utc), "author": "paul_dechorgnat", "mode": "single"}}

        db[category].insert_one(r)

    category_counter[category] = category_counter.get(category, 0) + 1

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4076/4076 [00:00<00:00, 54437.48it/s]


In [113]:
import json

# Export up to the first ten held-back records. Slicing instead of indexing remaining[i]
# avoids an IndexError when fewer than ten records were held back.
first_remaining = remaining[:10]

# One JSON file per record ...
for i, record in enumerate(first_remaining):
    with open(f"../data/record_{i}.json", "w") as file:
        json.dump(record, file)

# ... plus a single file holding the same records as a list.
with open("../data/records.json", "w") as file:
    json.dump(first_remaining, file)

In [114]:
# List ../data to check that the JSON files were written.
! ls ../data

CNN_Articels_clean  record_2.json  record_5.json  record_8.json
record_0.json	    record_3.json  record_6.json  record_9.json
record_1.json	    record_4.json  record_7.json  records.json


In [115]:
# Print the number of documents stored in each collection of the database.
for name in db.list_collection_names():
    n_docs = db[name].count_documents({})
    print(f"{name}: {n_docs}")

sport: 10
entertainment: 10
politics: 10
news: 10
business: 10
health: 10
