In [3]:
from bs4 import BeautifulSoup
from datetime import datetime
import requests 
from newsdataapi import NewsDataApiClient
import time
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np
import psycopg2
from psycopg2 import sql
from collections import Counter
import configparser

In [21]:
# Parse the local '.env' file as INI-style configuration. ConfigParser.read()
# returns the list of files it managed to load, which is what the cell echoes;
# a missing file is skipped silently and yields an empty list instead.
config = configparser.ConfigParser()
config.read('.env')

['.env']

In [22]:
# Database password taken from the [DB] section of the parsed config.
# NOTE(review): crawl_and_process_categories looks the password up under a
# [POSTGRES] section - confirm which section the .env file actually defines.
db_password = config['DB']['password']

In [23]:
#initialise tokenizer and model for sentiment analysis
def instantiate_model():
    """Load the tokenizer and classifier used for sentiment analysis.

    The checkpoint is loaded with its own configuration. It was fine-tuned to
    predict a 1-5 star rating, so its classification head has five outputs
    (class index ``i`` means ``i + 1`` stars). Requesting a different
    ``num_labels`` would no longer match the saved classifier weights, so no
    override is passed here; callers are expected to collapse the five classes
    into whatever label set they need.

    Returns:
        tuple: ``(tokenizer, model)`` as returned by the two
        ``from_pretrained`` calls.
    """
    checkpoint = 'nlptown/bert-base-multilingual-uncased-sentiment'
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
    return tokenizer, model

In [25]:
#get news data from newsdataapi for a category
def get_api(category, api_key=None):
    """Query the NewsData API for English articles matching ``category``.

    The API key is no longer embedded in the notebook. It is taken from the
    ``api_key`` argument when given, otherwise from the ``api_key`` entry of a
    ``[NEWSDATA]`` section in the local ``.env`` file (the same INI file the
    rest of this notebook reads with ``configparser``).

    Args:
        category: Search term sent as the ``q`` parameter of the request.
        api_key: Optional NewsData API key; overrides the ``.env`` lookup.

    Returns:
        Whatever ``NewsDataApiClient.news_api`` returns on success, or
        ``None`` when the key is missing or the request raises (the error is
        printed, not re-raised).
    """
    try:
        if api_key is None:
            settings = configparser.ConfigParser()
            settings.read('.env')
            # fallback=None also covers a missing [NEWSDATA] section.
            api_key = settings.get('NEWSDATA', 'api_key', fallback=None)
        if not api_key:
            raise ValueError(
                "no NewsData api_key configured; pass api_key= or add an "
                "api_key entry to a [NEWSDATA] section in .env"
            )

        api = NewsDataApiClient(apikey = api_key)
        response = api.news_api(
                            q = category,
                            language = 'en',
                            image  =  False,
                            video = False,
                            size = 30
                            )

        print(f'News Api get request: {category} sucesss')
        return response
    except Exception as e:
        print(f'News Api get request failed: {e}')
        return None


In [26]:
#create dataframe from the api response for a category
def create_dataframe(category):
    """Fetch the articles for ``category`` and tabulate the fields of interest.

    Calls ``get_api(category)`` and builds a DataFrame from the ``'results'``
    entry of the response, restricted to a fixed set of columns.

    Returns:
        pandas.DataFrame on success; ``None`` when ``get_api`` returns a falsy
        response or when building the frame raises (the error is printed).
    """
    wanted_columns = ["article_id", "title", "description", "source", "link", "pubDate", "country"]
    try:
        payload = get_api(category)
        if not payload:
            return None
        return pd.DataFrame(payload['results'], columns = wanted_columns)
    except Exception as err:
        print(f'Dataframe transformation failed: {err}')
        return None

In [27]:
#add columns for wordCount and articleLength to dataframe
def add_count_sentiment(df):
    """Add ``wordCount`` and ``articleLength`` columns derived from ``description``.

    ``wordCount`` is the number of whitespace-separated tokens and
    ``articleLength`` the number of characters in each description. Entries
    that are not strings (``None``, NaN, ...) are counted as empty text, so
    both values are 0 instead of raising on ``.split()`` / ``len()``.

    Args:
        df: DataFrame with a ``description`` column. It is modified in place.

    Returns:
        The same DataFrame object, with the two columns added.
    """
    # Normalise once so both derived columns see the same text.
    descriptions = df['description'].map(lambda value: value if isinstance(value, str) else '')
    df['wordCount'] = descriptions.map(lambda text: len(text.split()))
    df['articleLength'] = descriptions.map(len)
    return df

In [28]:
#perform sentiment analysis on news using the tokeniser and model
def perform_sentiment_analysis(df, tokenizer, model):
    """Classify one piece of text as "negative", "neutral" or "positive".

    The predicted class index is interpreted relative to the number of classes
    the model outputs: indices below the middle class are "negative", the
    middle class is "neutral", indices above it are "positive". For a 3-class
    head this is the 0/1/2 mapping; for a 5-class star-rating head, 1-2 stars
    are negative, 3 stars neutral and 4-5 stars positive.

    Args:
        df: The text to classify. Despite the name this is the value handed
            to ``tokenizer.encode``, not a DataFrame.
        tokenizer: Object exposing ``encode(text, return_tensors='pt', ...)``.
        model: Callable returning an object with a ``logits`` tensor; one
            row of logits is expected because ``.item()`` is applied to the
            argmax.

    Returns:
        str: "negative", "neutral" or "positive".
    """
    with torch.no_grad():
        tokens = tokenizer.encode(df, return_tensors='pt', padding=True, truncation=True)
        results = model(tokens)
    logits = results.logits
    predicted_class = torch.argmax(logits, dim=1).item()

    # Middle of the ordinal class range, e.g. 1.0 for 3 classes, 2.0 for 5.
    midpoint = (logits.shape[-1] - 1) / 2
    if predicted_class < midpoint:
        sentiment = "negative"
    elif predicted_class > midpoint:
        sentiment = "positive"
    else:
        sentiment = "neutral"
    return sentiment

In [29]:
#Add sentiment labels to the DataFrame based on the api response description
def add_sentiment_label(df, tokenizer, model):
    """Add a ``sentiment`` column computed from each row's ``description``.

    Each description string is passed individually to
    ``perform_sentiment_analysis``. Rows whose description is missing, not a
    string, or blank are labelled "neutral" without calling the model, so one
    empty article does not abort the whole batch.

    Args:
        df: DataFrame with a ``description`` column. It is modified in place.
        tokenizer: Forwarded to ``perform_sentiment_analysis``.
        model: Forwarded to ``perform_sentiment_analysis``.

    Returns:
        The same DataFrame with the ``sentiment`` column added, or ``None``
        if labelling raised (the error is printed).
    """
    def label_for(text):
        if not isinstance(text, str) or not text.strip():
            return 'neutral'
        # Classify the row's own text, not the whole DataFrame.
        return perform_sentiment_analysis(text, tokenizer, model)

    try:
        df['sentiment'] = df['description'].apply(label_for)
        return df
    except Exception as e:
        print(f'Sentiment analysis failed: {e}')
        return None

In [30]:
#insert dataframes to sql database
def insert_to_database(df, table_name, connection=None):
    """Append ``df`` to the table ``table_name`` via ``DataFrame.to_sql``.

    Args:
        df: DataFrame to append (written without its index).
        table_name: Name of the destination table.
        connection: Database connection handed to ``to_sql``. When omitted,
            a module-level variable named ``connection`` is used if one
            exists, which is how this function located its connection before
            the parameter was added.

    Returns:
        ``df`` on success, ``None`` on failure (the error is printed).

    NOTE(review): pandas documents ``to_sql`` for SQLAlchemy connectables and
    ``sqlite3`` connections; a raw psycopg2 connection is not in that list, so
    confirm the connection type used against PostgreSQL.
    """
    try:
        if connection is None:
            connection = globals().get('connection')
        if connection is None:
            raise ValueError('no database connection available')

        df.to_sql(table_name, connection, if_exists = 'append', index = False)
        # Report the table name; interpolating df would dump the whole frame.
        print(f'{table_name} appended to news_analysis_db sucessfully')
        return df
    except Exception as e:
        print(f'{table_name} failed to append to news_analysis_db: {e}')
        return None

In [34]:
#crawl news data, perform sentiment analysis, insert dataframe into database
def crawl_and_process_categories(category_name):
    """Fetch, label and store the news articles for one category.

    Steps: validate the category, open a PostgreSQL connection, load the
    sentiment model, build the article DataFrame with
    ``create_dataframe(category_name)``, add sentiment labels, prepend the
    numeric ``categoryID``, add the count columns and append the result to a
    table named after the category. Any exception is printed rather than
    raised, and the connection is always closed.

    Args:
        category_name: One of the keys of ``category_id_mapping`` below.

    Returns:
        None.
    """
    # insert_to_database() resolves `connection` at module scope, so the
    # connection opened here is published under that name for the duration of
    # the call and reset afterwards.
    global connection
    connection = None

    #define category and table names
    category_id_mapping = {
                            'business': 101,
                            'crime': 102,
                            'education': 103,
                            'entertainment': 104,
                            'health': 105,
                            'science': 106
                        }

    try:
        # Reject unknown categories before doing any expensive work.
        if category_name not in category_id_mapping:
            raise ValueError(f'unknown category {category_name!r}')

        config = configparser.ConfigParser()
        config.read('.env')
        # The notebook reads the password from [DB] elsewhere; accept either
        # section, preferring [POSTGRES] when it is present.
        section = 'POSTGRES' if config.has_option('POSTGRES', 'password') else 'DB'

        # postgres db params
        db_params = {
            'database': 'news_sentiment_analysis_database',
            'host': 'localhost',
            'user': 'postgres',
            'password' : config[section]['password']
        }

        connection = psycopg2.connect(**db_params)

        # Instantiate tokenizer and model
        tokenizer, model = instantiate_model()

        # Fetch news data and create dataframe (create_dataframe calls the
        # API itself, so it takes the category, not a response).
        df = create_dataframe(category_name)
        if df is not None:

            # Perform sentiment analysis
            df = add_sentiment_label(df, tokenizer, model)

            if df is not None:
                #Add category ID
                category_id = category_id_mapping[category_name]
                df.insert(0, 'categoryID', category_id)

                #add count and insert df into database
                add_count_sentiment(df)
                insert_to_database(df, category_name)
        else:
            print(f'Data in {category_name} is None')
    except Exception as e:
        print(f'An error has occured: {e}')
    finally:
        # `connection` is still None if connect() itself failed.
        if connection is not None:
            connection.close()
            connection = None

In [None]:
def main():
    """Run the crawl / analyse / store pipeline once per news category.

    ``crawl_and_process_categories`` requires a category name, so each
    category is processed in turn.
    """
    # Same names as the category_id_mapping keys in
    # crawl_and_process_categories; keep the two in sync.
    categories = ('business', 'crime', 'education', 'entertainment', 'health', 'science')
    for category in categories:
        crawl_and_process_categories(category)

In [None]:
# Entry point: call main() only when this module is the program being run,
# not when it is imported from elsewhere.
if __name__ == "__main__":
    main()