## Pr√© processamento dos dados

In [5]:
import glob

import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import numpy as np
# üîπ 1Ô∏è‚É£ Carregar Dados
user_infos = pd.concat([pd.read_csv(fpath) for fpath in glob.glob('../data/raw/files/treino/*.csv')])
news_item = pd.concat([pd.read_csv(fpath) for fpath in glob.glob('../data/raw/itens/itens/*.csv')])

# üîπ 2Ô∏è‚É£ Criar user_historys (Explode Intera√ß√µes)
user_historys = user_infos[[
    'userId',
    'history',
    'numberOfClicksHistory',
    'scrollPercentageHistory',
    'pageVisitsCountHistory'
]]

user_historys = user_historys.set_index('userId').apply(lambda row: row.str.split(','), axis=1)
user_historys = user_historys.apply(pd.Series.explode).reset_index()


# üîπ 1Ô∏è‚É£ Converter Tipos de Dados de Forma Eficiente
cols_int = ['numberOfClicksHistory', 'pageVisitsCountHistory']
cols_float = ['scrollPercentageHistory']

user_historys[cols_int] = user_historys[cols_int].apply(pd.to_numeric, errors='coerce').fillna(0).astype(int)
user_historys[cols_float] = user_historys[cols_float].apply(pd.to_numeric, errors='coerce').fillna(0).astype(float)

user_historys['history'] = user_historys['history'].str.strip()

# üîπ 2Ô∏è‚É£ Normalizar e Criar Score de Intera√ß√£o
scaler = MinMaxScaler()
interaction_cols = ['scrollPercentageHistory', 'numberOfClicksHistory', 'pageVisitsCountHistory']

user_historys[interaction_cols] = scaler.fit_transform(user_historys[interaction_cols])

# Criar a pontua√ß√£o final diretamente
weights = np.array([0.5, 0.3, 0.2])
user_historys['interaction_score'] = user_historys[interaction_cols].dot(weights)


In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# üîπ 2Ô∏è‚É£ Popularidade: Contar quantas vezes cada not√≠cia foi visitada
news_popularity = user_historys[['userId', 'history', 'pageVisitsCountHistory']]['history'].value_counts().rename('popularity_score')

# üîπ 3Ô∏è‚É£ Ajustar 'issued' para evitar erro de timezone
news_item['issued'] = pd.to_datetime(news_item['issued'], errors='coerce')
news_item['issued'] = news_item['issued'].dt.tz_localize(None)  # Remove timezone

# üîπ 4Ô∏è‚É£ Fun√ß√£o mais r√°pida para calcular rec√™ncia
def calc_recency_score(dates, alpha=0.1):
    """Calcula um score de rec√™ncia com base na diferen√ßa de dias at√© hoje"""
    max_days = (pd.Timestamp.today() - dates.min()).days
    return np.exp(-alpha * (pd.Timestamp.today() - dates).dt.days / max_days)

news_item['recency_score'] = calc_recency_score(news_item['issued']).fillna(0)

# üîπ 5Ô∏è‚É£ Normalizar os scores
scaler = MinMaxScaler()
news_item[['recency_score']] = scaler.fit_transform(news_item[['recency_score']])

# üîπ 6Ô∏è‚É£ Juntar Popularidade e Not√≠cias
news_item = news_item.set_index('page').join(news_popularity, on='page', how='left').fillna(0).reset_index()

# üîπ 7Ô∏è‚É£ Normalizar Popularidade
news_item[['popularity_score']] = scaler.fit_transform(news_item[['popularity_score']])

# Testando dados com kmeans


In [7]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# üîπ 1Ô∏è‚É£ Garantir que 'history' √© string
user_historys['history'] = user_historys['history'].astype(str)

# üîπ 2Ô∏è‚É£ Criar Representa√ß√£o Num√©rica das Not√≠cias (TF-IDF) com Limita√ß√£o
vectorizer = TfidfVectorizer(max_features=50_000)  # Reduz n√∫mero de colunas
user_news_matrix = vectorizer.fit_transform(user_historys.groupby('userId')['history'].apply(lambda x: ' '.join(x)))

# üîπ 3Ô∏è‚É£ Reduzir Dimensionalidade com SVD (100 componentes)
svd = TruncatedSVD(n_components=100, random_state=42)
news_embeddings = svd.fit_transform(user_news_matrix)  # Mant√©m formato esparso

# üîπ 4Ô∏è‚É£ Criar DataFrame com Embeddings e Interaction Score
user_embeddings = pd.DataFrame(news_embeddings, index=user_historys['userId'].unique())

# üîπ 5Ô∏è‚É£ Adicionar Interaction Score e Normalizar
user_embeddings['interaction_score'] = user_historys.groupby('userId')['interaction_score'].mean().values

# üîπ 6Ô∏è‚É£ Converter todos os nomes das colunas para string
user_embeddings.columns = user_embeddings.columns.astype(str)

# üîπ 7Ô∏è‚É£ Normalizar os Dados
scaler = MinMaxScaler()
user_embeddings.iloc[:, :] = scaler.fit_transform(user_embeddings)

# üîπ 8Ô∏è‚É£ Aplicar K-Means
kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)
user_embeddings['cluster'] = kmeans.fit_predict(user_embeddings)

# üîπ 9Ô∏è‚É£ Juntar os Clusters no DataFrame Original
user_historys = user_historys.merge(user_embeddings[['cluster']], left_on='userId', right_index=True, how='left')


In [8]:
def recomendar_noticias_por_cluster(user_id, user_historys, news_item):
    # Encontrar o cluster do usu√°rio
    user_cluster = user_historys[user_historys['userId'] == user_id]['cluster'].values[0]

    # Obter usu√°rios no mesmo cluster
    similar_users_cluster = user_historys[user_historys['cluster'] == user_cluster]['userId'].unique()

    # Obter not√≠cias consumidas pelos usu√°rios do cluster
    similar_users_cluster_news = user_historys[user_historys['userId'].isin(similar_users_cluster)]['history'].unique()

    # Valida√ß√£o
    noticias_futuras_validacao = [
        '9c764c3a-f9f8-4fb2-b2c4-6331eaeb3dd6',
        'b8eba39e-3905-424f-9f7f-966f07637244',
        '1603a1f9-09cb-47b6-ad1a-8f9a3c0bbfc0'
    ]

    print('Teste passou' if set(similar_users_cluster_news) & set(noticias_futuras_validacao) else 'Teste falhou')

    return similar_users_cluster ,similar_users_cluster_news
user_id = 'a120515626fe5d12b22b7d5a7c5008912cc69284aa26ccdff8edab753db8c7e7'  # Troque pelo ID real

# Teste com um usu√°rio real
cluster_users , cluster_news = recomendar_noticias_por_cluster(user_id, user_historys, news_item)
print(f"Not√≠cias recomendadas por cluster: {cluster_news}")


Teste passou
Not√≠cias recomendadas por cluster: ['c8aab885-433d-4e46-8066-479f40ba7fb2'
 '68d2039c-c9aa-456c-ac33-9b2e8677fba7'
 '13e423ce-1d69-4c78-bc18-e8c8f7271964' ...
 '59eb253d-bb44-4048-8c97-cca1cb2464b8'
 '7da17f35-ef13-44a3-abc6-bf096fe42532'
 '489989dd-63d0-41b3-bb92-2fe7b5dd965e']


In [9]:
news_item.set_index('page').loc[cluster_news].head()

Unnamed: 0_level_0,url,issued,modified,title,body,caption,recency_score,popularity_score
page,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
c8aab885-433d-4e46-8066-479f40ba7fb2,http://g1.globo.com/sc/santa-catarina/noticia/...,2022-03-19 21:03:21,2022-03-19 21:03:21+00:00,"Voc√™ viu? 'Musa das Estradas' faz v√≠deo de p√©,...",Caminhoneira Aline F√ºchter em p√© em casa\nRepr...,Caminhoneira Aline F√ºchter ficou em p√© em fren...,0.947771,0.013281
68d2039c-c9aa-456c-ac33-9b2e8677fba7,http://g1.globo.com/rj/rio-de-janeiro/noticia/...,2021-11-01 03:01:20,2021-11-01 13:20:44+00:00,'Mulher-Gato' foi proibida de entrar na Mar√© a...,"Pol√≠cia Civil do Rio prende Mulher-Gato, apont...","Luana Rabello, segundo a pol√≠cia, √© muito famo...",0.898899,0.005304
13e423ce-1d69-4c78-bc18-e8c8f7271964,http://g1.globo.com/sc/santa-catarina/noticia/...,2022-02-01 18:33:21,2022-02-04 20:23:50+00:00,Caminhoneira 'Musa das Estradas' mostra rosto ...,Caminhoneira 'Musa das Estradas' mostra rosto ...,"Aline F√ºchter chegou a Tubar√£o, onde mora, no ...",0.931578,0.009612
3325b5a1-979a-4cb3-82b6-63905c9edbe8,http://g1.globo.com/sp/itapetininga-regiao/not...,2022-08-14 20:17:10,2022-08-14 20:17:11+00:00,Agosto Lil√°s: Itapetininga promove palestras d...,Itapetininga promove palestras de conscientiza...,"Segunda prefeitura, durante m√™s de agosto, pal...",1.0,0.000957
fe856057-f97d-419f-ab1c-97c5c3e0719c,http://g1.globo.com/sp/itapetininga-regiao/not...,2022-08-14 11:39:11,2022-08-15 15:18:15+00:00,Designer de sobrancelhas viraliza na web ao fa...,Designer de sobrancelhas viraliza na web ao fa...,"V√≠deo publicado por Geizielle Ferreira Mendes,...",0.999646,0.118649


# Testando com Dados com Knn

> N√£o teve muito retorno mas poderia ser investida

In [14]:
from sklearn.neighbors import NearestNeighbors

# üîπ 5Ô∏è‚É£ Aplicar KNN (Buscar Usu√°rios Similares)
knn = NearestNeighbors(n_neighbors=10, metric='cosine')
knn.fit(user_historys.set_index('userId').loc[cluster_users][['interaction_score']])

# Exemplo: Encontrar usu√°rios similares a um usu√°rio espec√≠fico
user_id = 'a120515626fe5d12b22b7d5a7c5008912cc69284aa26ccdff8edab753db8c7e7'  # Troque pelo ID real de um usu√°rio
user_idx = user_historys[user_historys['userId'] == user_id].index[0]
distances, indices = knn.kneighbors([user_historys.loc[user_idx, ['interaction_score']]])

# Mostrar usu√°rios similares
similar_users = user_historys.iloc[indices[0]]['userId'].tolist()
print(f"Usu√°rios similares a {user_id}: {similar_users}")


Usu√°rios similares a a120515626fe5d12b22b7d5a7c5008912cc69284aa26ccdff8edab753db8c7e7: ['f98d1132f60d46883ce49583257104d15ce723b3bbda2147c1e31ac76f0bf069', '52f801c476a3db5973c60ffd0b9e76fea50de7ce331dc20f5f80ab0a6ddd354e', '52f801c476a3db5973c60ffd0b9e76fea50de7ce331dc20f5f80ab0a6ddd354e', '52f801c476a3db5973c60ffd0b9e76fea50de7ce331dc20f5f80ab0a6ddd354e', '52f801c476a3db5973c60ffd0b9e76fea50de7ce331dc20f5f80ab0a6ddd354e', '52f801c476a3db5973c60ffd0b9e76fea50de7ce331dc20f5f80ab0a6ddd354e', '52f801c476a3db5973c60ffd0b9e76fea50de7ce331dc20f5f80ab0a6ddd354e', '52f801c476a3db5973c60ffd0b9e76fea50de7ce331dc20f5f80ab0a6ddd354e', '52f801c476a3db5973c60ffd0b9e76fea50de7ce331dc20f5f80ab0a6ddd354e', '52f801c476a3db5973c60ffd0b9e76fea50de7ce331dc20f5f80ab0a6ddd354e']




In [15]:
def recomendar_noticias_por_similaridade(user_id, user_historys, news_item, knn):
    # Encontrar usu√°rios similares
    user_idx = user_historys[user_historys['userId'] == user_id].index[0]
    _, indices = knn.kneighbors([user_historys.loc[user_idx, ['interaction_score']]])

    # Obter not√≠cias consumidas por usu√°rios similares
    similar_users = user_historys.iloc[indices[0]]['userId'].tolist()
    similar_users_news = user_historys[user_historys['userId'].isin(similar_users)]['history'].unique()

    # Valida√ß√£o
    noticias_futuras_validacao = [
        '9c764c3a-f9f8-4fb2-b2c4-6331eaeb3dd6',
        'b8eba39e-3905-424f-9f7f-966f07637244',
        '1603a1f9-09cb-47b6-ad1a-8f9a3c0bbfc0'
    ]

    print('Teste passou' if set(similar_users_news) & set(noticias_futuras_validacao) else 'Teste falhou')

    return similar_users_news

# Teste com um usu√°rio real
user_id = 'a120515626fe5d12b22b7d5a7c5008912cc69284aa26ccdff8edab753db8c7e7'  # Substitua pelo ID real de um usu√°rio
similar_users_news = recomendar_noticias_por_similaridade(user_id, user_historys, news_item, knn)
print(f"Not√≠cias recomendadas por similaridade: {similar_users_news}")




Teste falhou
Not√≠cias recomendadas por similaridade: ['c8aab885-433d-4e46-8066-479f40ba7fb2'
 '68d2039c-c9aa-456c-ac33-9b2e8677fba7'
 '13e423ce-1d69-4c78-bc18-e8c8f7271964'
 'c3d1bd47-feb1-4c0a-9e78-36d20b3f0fc9'
 '286428b0-dd16-46e6-8189-2908a23967ea'
 '68bc8994-ebef-4e48-8478-e7fe1619ae58'
 '44fdcedf-e9ae-4748-8ab3-0cdb466672a6'
 '51219799-daab-48b2-b700-3a61833b3ea8'
 '3ce73782-d80e-4031-be53-5761b158cae7'
 'ecc37a22-b730-4e3a-bc87-c3ba3403acbc'
 '7594da99-d606-4338-a373-710a7dec776a'
 'bf257382-74fb-4392-ad6a-143240e39f81'
 '3d34afb1-b073-43e8-9691-8fb2e2459000'
 '4c46d054-1fe0-4d63-9122-fa130fd4f728'
 'aed49799-59f8-4f15-94be-566e753d9325'
 '4d89c4b6-6827-4935-9ba1-0502025af270'
 '66a9efac-fd43-4fd1-9824-c404b08efa5d'
 '9d598d19-d6be-4c7e-a963-b8fedfa8f24f'
 'd8b6f5a1-2f96-4d02-a78f-dbe1c87946f8'
 'a2ef8430-00b6-49de-852d-2c72596c5917'
 '557c0d37-0427-407d-a235-c78028d91220'
 '5af379e6-1bd1-4cf8-a23c-03266fb77b2c'
 'ad42c4b0-dfb5-49fb-87bf-7b5d055b6e8e'
 '6a64daa7-32ec-4d35-a7ec-

In [10]:
from sqlalchemy import create_engine

# üíæ Salvar no SQLite
engine = create_engine('sqlite:///../data/refined/datawarehouse.db', echo=False)

# Salvar os DataFrames no banco de dados
user_historys.to_sql('user_historys', con=engine, if_exists='replace', index=False)
news_item.to_sql('news_item', con=engine, if_exists='replace', index=False)

255603