# Imports

In [48]:
import numpy as np
import pandas as pd
from pymongo import MongoClient

# Conexão com o MongoDB

In [14]:
# Open a client with no arguments, so the driver's default host/port are used.
client = MongoClient()
# Resolve the database first, then the collection that holds the scraped articles.
news_db = client['portaisnoticias']
collec = news_db['tech']

# Converter a collection do MongoDB em um DataFrame

In [149]:
# Column-oriented accumulator: one list per field expected in the documents.
# The keys define the fixed schema (and column order) of the DataFrame built
# from this dict.
df_dict = {
    '_id': [],
    'spider': [],
    'url': [],
    'titulo': [],
    'qtd_comentarios': [],
    'autor': [],
    'revisor': [],
    'data_publicacao': [],
    'referencias': [],
    'tags': [],
    'conteudo_relacionado': [],
}

# Walk the first 50 documents of the collection and append exactly one value
# per column for each of them, so every list ends up with the same length.
for doc in collec.find({})[:50]:
    for column, values in df_dict.items():
        # dict.get returns None for fields the document lacks; fields that are
        # not part of the schema are ignored instead of raising KeyError.
        values.append(doc.get(column))
    

In [150]:
# Materialize the column lists as a DataFrame (dict keys become the columns).
mongo_df = pd.DataFrame(data=df_dict)

In [151]:
# Print the frame summary: row count, per-column non-null counts and dtypes.
mongo_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   _id                   50 non-null     object        
 1   spider                50 non-null     object        
 2   url                   50 non-null     object        
 3   titulo                50 non-null     object        
 4   qtd_comentarios       0 non-null      object        
 5   autor                 50 non-null     object        
 6   revisor               32 non-null     object        
 7   data_publicacao       50 non-null     datetime64[ns]
 8   referencias           35 non-null     object        
 9   tags                  50 non-null     object        
 10  conteudo_relacionado  45 non-null     object        
dtypes: datetime64[ns](1), object(10)
memory usage: 4.4+ KB


In [152]:
# Preview the first five rows to sanity-check the loaded fields.
mongo_df.head(n=5)

Unnamed: 0,_id,spider,url,titulo,qtd_comentarios,autor,revisor,data_publicacao,referencias,tags,conteudo_relacionado
0,5e5461eeb27fbae055097ce5,olhardigital,https://olhardigital.com.br/noticia/vazamento-...,Vazamento indica sistema multitarefa do iPad p...,,Luiz Nogueira,Cesar Schaeffer,2020-02-24 19:38:00,[https://www.cultofmac.com/686695/ios-14-multi...,"[iphone, apple, ipad, ios, iphone 11 pro max, ...",[https://olhardigital.com.br/noticia/apple-dev...
1,5e5461eeb27fbae055097ce6,olhardigital,https://olhardigital.com.br/carros-e-tecnologi...,Falta de baterias faz Audi interromper produçã...,,Luiz Nogueira,Cesar Schaeffer,2020-02-24 17:20:00,[https://www.bloomberg.com/news/articles/2020-...,"[carro elétrico, tesla, audi, suv, veículo elé...",[https://olhardigital.com.br/noticia/primeiro-...
2,5e5461eeb27fbae055097ce7,olhardigital,https://olhardigital.com.br/noticia/essential-...,Essential Phone ganhará um gostinho do Android 11,,Rafael Rigues,,2020-02-24 17:49:00,[https://www.androidauthority.com/essential-ph...,"[android, smartphone, android 11]",[https://olhardigital.com.br/games-e-consoles/...
3,5e5461eeb27fbae055097ce8,olhardigital,https://olhardigital.com.br/cinema-e-streaming...,Netflix libera lista de 'Top 10' para identifi...,,Cesar Schaeffer,,2020-02-24 18:00:00,,"[filmes, streaming, netflix, video, filmes onl...",[https://olhardigital.com.br/noticia/netflix-n...
4,5e5461eeb27fbae055097ce9,olhardigital,https://olhardigital.com.br/noticia/huawei-pre...,Huawei prepara lançamento de Smart Speaker na ...,,Rafael Rigues,,2020-02-24 16:25:00,[https://www.engadget.com/2020/02/24/huawei-so...,"[huawei, caixa de som, google home, alexa, cai...",[https://olhardigital.com.br/noticia/huawei-p4...


# Quantidade de notícias por site

In [130]:
# Number of articles (non-null titles) collected by each spider, i.e. per site.
titles_by_spider = mongo_df.groupby(by='spider')['titulo']
titles_by_spider.count()

spider
olhardigital    10
Name: titulo, dtype: int64

# Notícias por hora de publicação

In [154]:
# Wrap the publication timestamps in a DatetimeIndex to reach their components.
times = pd.DatetimeIndex(mongo_df['data_publicacao'])
# Group on the hour-of-day of each timestamp and count the titles in each hour.
publication_hour = times.hour
mongo_df.groupby([publication_hour])['titulo'].count()

data_publicacao
11     1
12     2
13     1
14     3
15     6
16    12
17     7
18     7
19    10
20     1
Name: titulo, dtype: int64

In [138]:
# For every distinct publication timestamp, count the non-null values of each
# remaining column.
rows_by_timestamp = mongo_df.groupby(by='data_publicacao')
rows_by_timestamp.count()

Unnamed: 0_level_0,_id,spider,url,titulo,qtd_comentarios,autor,revisor,referencias,tags,conteudo_relacionado
data_publicacao,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2020-02-24 16:25:00,1,1,1,1,0,1,0,1,1,1
2020-02-24 16:30:00,1,1,1,1,0,1,1,1,1,1
2020-02-24 16:40:00,1,1,1,1,0,1,0,1,1,1
2020-02-24 16:50:00,1,1,1,1,0,1,1,1,1,1
2020-02-24 17:02:00,1,1,1,1,0,1,0,1,1,0
2020-02-24 17:11:00,1,1,1,1,0,1,1,1,1,1
2020-02-24 17:20:00,1,1,1,1,0,1,1,1,1,1
2020-02-24 17:49:00,1,1,1,1,0,1,0,1,1,1
2020-02-24 18:00:00,1,1,1,1,0,1,0,0,1,1
2020-02-24 19:38:00,1,1,1,1,0,1,1,1,1,1


# Quantidade de notícias por autor e revisor

In [124]:
# Articles per author, counted through the non-null 'spider' values of each group.
spiders_by_author = mongo_df.groupby(by='autor')['spider']
spiders_by_author.count()

autor
Cesar Schaeffer    3
Luiz Nogueira      4
Rafael Rigues      3
Name: spider, dtype: int64

In [125]:
# Articles per reviewer. Rows with no reviewer do not form a group (groupby
# drops null keys by default), so only reviewed articles are counted here.
spiders_by_reviewer = mongo_df.groupby(by='revisor')['spider']
spiders_by_reviewer.count()

revisor
Cesar Schaeffer    5
Name: spider, dtype: int64

In [128]:
# Articles per (author, reviewer) pair. Pairs with a null reviewer are dropped
# by groupby's default handling of null keys, so authors whose articles were
# never reviewed do not appear in the result.
author_reviewer_keys = ['autor', 'revisor']
mongo_df.groupby(author_reviewer_keys)['spider'].count()

autor          revisor        
Luiz Nogueira  Cesar Schaeffer    4
Rafael Rigues  Cesar Schaeffer    1
Name: spider, dtype: int64