In [None]:
# pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline
from transformers import TextClassificationPipeline
import torch        
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import requests
import json
from tqdm.notebook import tqdm
import pandas as pd
import os


# Section ids sent as the `section` query parameter when scraping.
sections_scrape = [
    'better-business',
    'business-to-business',
    'food',
    'global-development',
    'lifeandstyle',
    'money',
    'news',
    'politics',
    'society',
    'world',
]

# Location prefixes for the raw JSON dumps and for the processed outputs.
dataset_loc = 'Dataset_RAW'
dataset_procesed_loc = 'Dataset_preprocessed'


In [None]:
def get_total_pages(url,params):
    """Return the page count the API reports for a query.

    Args:
        url: Endpoint to request.
        params: Query-string parameters forwarded to ``requests.get``.

    Returns:
        The ``pages`` value found under ``response`` in the JSON payload.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, params=params)
    # Surface the HTTP failure itself instead of a KeyError / JSON decode
    # error further down when the API rejects the request.
    response.raise_for_status()
    data = json.loads(response.text)
    return data['response']['pages']


def get_data(pages_to_scrape,section):
    """Download up to ``pages_to_scrape`` pages of articles for one section.

    The first successful request goes to the search endpoint. Every later
    request asks ``content/<id>/next`` for the items that follow the last
    article received so far, re-sending the same query parameters.

    Args:
        pages_to_scrape: Maximum number of requests to issue.
        section: Section id sent as the ``section`` query parameter.

    Returns:
        A list holding the ``results`` entries of every successful response,
        in the order they were received.
    """
    search_url = "https://content.guardianapis.com/search"
    params = {
        'show-tags': 'all',
        'show-fields': 'all',
        'page-size': 200,
        # NOTE(review): API key is hard-coded; consider reading it from the
        # environment instead.
        'api-key': 'test',
        'from-date': '1990-01-01',
        'to-date': '2023-01-20',
        'show-references': 'all',
        'use-date': 'published',
        'section': str(section)
    }

    all_data = []
    last_page_id = ''

    for page in tqdm(range(1, pages_to_scrape + 1)):
        # Until a first page has been received there is no article id to
        # continue from, so keep using the search endpoint; building the
        # "next" URL with an empty id would produce a malformed path.
        if last_page_id:
            url = f"https://content.guardianapis.com/content/{last_page_id}/next?"
        else:
            url = search_url

        response = requests.get(url, params=params)

        if response.status_code != 200:
            print(f"Error on page {page}: {response.status_code} - {response.text}")
            continue

        data = json.loads(response.text)
        articles = data['response']['results']
        if not articles:
            # Nothing left to page through; indexing [-1] on an empty list
            # would raise IndexError.
            break

        last_page_id = articles[-1]['id']
        all_data.extend(articles)

    return all_data


def download_data(dataset_loc,sections_scrape):
    """Fetch every section's articles and store them as one JSON file each.

    A section whose output file already exists is skipped. Otherwise the
    page count is obtained with ``get_total_pages``, all pages are pulled
    with ``get_data`` and the articles are written under the ``results`` key.

    Args:
        dataset_loc: Location prefix of the raw JSON files.
        sections_scrape: Iterable of section ids to download.
    """
    search_url = "https://content.guardianapis.com/search"

    for section in sections_scrape:
        section_name = str(section)
        # The separator is a literal backslash; make_df reads the files back
        # using the very same pattern.
        raw_path = f'{dataset_loc}\\dataset_{section_name}.json'

        # Guard clause: nothing to do when the dump is already on disk.
        if os.path.isfile(raw_path):
            print(f'Raw Datset for section: {section} Exists.')
            continue

        query = {
            'show-tags': 'all',
            'show-fields': 'all',
            'page': 1,
            'page-size': 200,
            'api-key': 'test',
            'from-date': '1990-01-01',
            'to-date': '2023-01-20',
            'show-references': 'all',
            'use-date': 'published',
            'section': section_name,
        }

        total_pages = get_total_pages(search_url, query)
        print(f"Total pages: {total_pages}")

        # Every reported page is scraped.
        articles = get_data(total_pages, section)
        print(f"Total articles retrieved: {len(articles)}")

        with open(raw_path, 'w') as json_file:
            json.dump({'results': articles}, json_file)



def _pick_publication_date(article):
    """Return the first publication timestamp found for an article, else None.

    Lookup order: ``fields.firstPublicationDate``,
    ``fields.newspaperEditionDate``, then the top-level ``webPublicationDate``.
    """
    fields = article['fields']
    if 'firstPublicationDate' in fields:
        return fields['firstPublicationDate']
    if 'newspaperEditionDate' in fields:
        return fields['newspaperEditionDate']
    if 'webPublicationDate' in article:
        return article['webPublicationDate']
    return None


def make_df(dataset_loc,sections_scrape,dataset_procesed_loc,calculate_sentiment=True):
    """Turn each section's raw JSON dump into a DataFrame and process it.

    For every section the raw file is loaded, one row per article is built
    (title, body text, publication date, production office, language, tag
    ids), the frame is written to ``dataset_<section>_WIHTOUT_Sentiments.xlsx``
    and then handed to ``get_sentiments``.

    Paths are built with a literal backslash separator, matching the other
    functions in this file that read and write the same files.

    Args:
        dataset_loc: Location prefix of the raw JSON files.
        sections_scrape: Iterable of section ids to process.
        dataset_procesed_loc: Location prefix for the generated files.
        calculate_sentiment: Forwarded unchanged to ``get_sentiments``.

    Returns:
        None. Results are written to disk.
    """
    for section in sections_scrape:

        print("*"*100)
        print(f'Working on {section}')
        print("*"*100)

        with open(f'{dataset_loc}\\dataset_{str(section)}.json', 'r') as file:
            result_data = json.load(file)

        small_data = result_data['results']

        titles = [data['fields']['headline'] for data in small_data]
        body_text = [data['fields']['bodyText'] for data in small_data]

        # Exactly one entry per article, so this column always lines up with
        # the others. Stopping at the first article without a date would
        # leave the list short and make the DataFrame constructor fail.
        publication_date = []
        for data in small_data:
            stamp = _pick_publication_date(data)
            if stamp is None:
                # Dump the record so the article without a date can be inspected.
                print(json.dumps(data))
            publication_date.append(stamp)

        # Missing production office is recorded as None.
        production_office = [data['fields'].get('productionOffice') for data in small_data]

        lang = [data['fields']['lang'] for data in small_data]

        # One list of tag ids per article.
        article_tags = [[tags_data['id'] for tags_data in articles['tags']]
                        for articles in small_data]

        # Sanity output: all lengths are expected to be equal.
        print(f'Length of titles: {len(small_data)}')
        print(f'Length of titles: {len(titles)}')
        print(f'Length of body_text: {len(body_text)}')
        print(f'Length of publication_date: {len(publication_date)}')
        print(f'Length of production_office: {len(production_office)}')
        print(f'Length of lang: {len(lang)}')
        print(f'Length of article_tags: {len(article_tags)}')

        df = pd.DataFrame({
            'Title': titles,
            'BodyText': body_text,
            'PublicationDate': publication_date,
            'ProductionOffice': production_office,
            'Lang': lang,
            'ArticleTags': article_tags
        })

        # Escape every string cell to ASCII before the frame is written to
        # Excel. DataFrame.map is the newer name of applymap; use whichever
        # this pandas version provides.
        elementwise = df.map if hasattr(df, 'map') else df.applymap
        df = elementwise(lambda x: x.encode('unicode_escape').
                         decode('utf-8') if isinstance(x, str) else x)

        # Vectorised parse; missing dates become NaT and the result is a
        # datetime column even when the frame has no rows.
        df['PublicationDate'] = pd.to_datetime(df['PublicationDate'], format='%Y-%m-%dT%H:%M:%SZ')
        # Same day at midnight, kept as a datetime column.
        df['PublicationDate_dates'] = df['PublicationDate'].dt.normalize()

        df.to_excel(f'{dataset_procesed_loc}\\dataset_{section}_WIHTOUT_Sentiments.xlsx', index=False)

        df = get_sentiments(df,section,dataset_procesed_loc,calculate_sentiment)
 

def get_sentiments(df,section,dataset_procesed_loc,calculate_sentiment=True):
    """Label each article's sentiment and write a per-month summary.

    When ``calculate_sentiment`` is true, the ``yiyanghkust/finbert-tone``
    model labels the first 1500 characters of every ``BodyText`` value
    (falling back to the first 1000 characters if that call fails), the
    labels are stored in a ``sentiments`` column and the frame is written to
    ``dataset_<section>_with_sentiments.xlsx``. Otherwise that file is read
    back and ``df`` is ignored.

    In both cases the rows are grouped by month and label, and the counts,
    the most frequent label (column ``MedianSentiment``) and the percentage
    of each label are written to
    ``dataset_<section>_with_sentiments_groupby.csv``.

    Args:
        df: Frame with ``BodyText`` and ``PublicationDate_dates`` columns.
            A ``sentiments`` column is added to it when labels are computed.
        section: Section id used in the file names.
        dataset_procesed_loc: Location prefix for the files read and written.
        calculate_sentiment: Compute labels (true) or load saved ones (false).

    Returns:
        The article-level frame including ``sentiments`` and ``MonthYear``.
    """
    sentiments_path = f'{dataset_procesed_loc}\\dataset_{section}_with_sentiments.xlsx'

    if calculate_sentiment:
        finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
        tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

        # Use the first GPU when one is available, otherwise run on CPU
        # instead of failing on machines without CUDA.
        device = 0 if torch.cuda.is_available() else -1
        nlp = TextClassificationPipeline(model=finbert, tokenizer=tokenizer, device=device)

        sentiments_list = []

        for text in tqdm(df['BodyText']):
            try:
                sentiments_list.append(nlp(text[:1500])[0]['label'])
            except Exception:
                # Retry on a shorter prefix and let the tokenizer cut it to
                # 512 tokens so the retry cannot exceed the model input size.
                sentiments_list.append(
                    nlp(text[:1000], truncation=True, max_length=512)[0]['label'])

        df['sentiments'] = sentiments_list

        # NOTE(review): make_df applies this same escaping before calling
        # this function, so strings coming from there are escaped twice.
        # DataFrame.map is the newer name of applymap; use whichever exists.
        elementwise = df.map if hasattr(df, 'map') else df.applymap
        df = elementwise(lambda x: x.encode('unicode_escape').
                         decode('utf-8') if isinstance(x, str) else x)
        df.to_excel(sentiments_path, index=False)
    else:
        df = pd.read_excel(sentiments_path)

    # Make sure the date column is datetime-typed before using ``.dt``;
    # this is a no-op when it already is.
    df['PublicationDate_dates'] = pd.to_datetime(df['PublicationDate_dates'], errors='coerce')
    df['MonthYear'] = df['PublicationDate_dates'].dt.to_period('M')

    # One row per month, one count column per sentiment label.
    result = df.groupby(['MonthYear', 'sentiments']).size().unstack(fill_value=0).reset_index()

    # Labels that never occurred still get a zero-filled column.
    for label in ('Negative', 'Positive', 'Neutral'):
        if label not in result.columns:
            result[label] = 0

    counts = result[['Negative', 'Neutral', 'Positive']]

    # Despite the column name this is the label with the highest count.
    result['MedianSentiment'] = counts.idxmax(axis=1)

    # Share of each label per month, in percent.
    totals = counts.sum(axis=1)
    result['PercentageNegative'] = (result['Negative'] / totals) * 100
    result['PercentageNeutral'] = (result['Neutral'] / totals) * 100
    result['PercentagePositive'] = (result['Positive'] / totals) * 100

    result.to_csv(f'{dataset_procesed_loc}\\dataset_{section}_with_sentiments_groupby.csv', index=False)

    return df


def get_word_cloud(section,df):
    """Save one word-cloud image per publication month.

    The body text of all articles in a month is joined, unicode escapes are
    decoded, and the resulting cloud is saved as
    ``word_cloud/wordcloud_<month>.png``.

    Args:
        section: Not used by this function.
            NOTE(review): file names do not include the section, so running
            this for several sections overwrites earlier images.
        df: Frame with ``PublicationDate_dates`` and ``BodyText`` columns.
            It is modified in place: the date column is converted to
            datetime, ``BodyText`` is cast to str and ``MonthYear`` is added.
    """
    df['PublicationDate_dates'] = pd.to_datetime(df['PublicationDate_dates'], format='%Y-%m-%d', errors='coerce')
    df['MonthYear'] = df['PublicationDate_dates'].dt.to_period('M')
    df['BodyText'] = df['BodyText'].astype(str)

    # savefig does not create missing directories.
    os.makedirs('word_cloud', exist_ok=True)

    # Rows whose date could not be parsed have no month; skip them, since
    # comparing against a missing period never matches any row.
    for month in df['MonthYear'].dropna().unique():

        text = ' '.join(df.loc[df['MonthYear'] == month, 'BodyText']).encode('utf-8').decode('unicode_escape', errors='ignore')

        # Nothing to draw for a month whose joined text is blank.
        if not text.strip():
            continue

        wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)

        fig, ax = plt.subplots(figsize=(10, 5))
        ax.imshow(wordcloud, interpolation='bilinear')
        ax.set_title(f'Word Cloud for {month}')
        ax.axis('off')

        fig.savefig(f'word_cloud/wordcloud_{month}.png')
        plt.close(fig)  # Close the figure so it is not shown in the notebook


In [None]:
# Sections handled in this run.
sections_scrape = [
    'business-to-business',
    'food',
    'global-development',
    'lifeandstyle',
    'money',
    'news',
    'politics',
    'society',
    'world',
]

# Step 1: fetch the raw JSON for every section that is not on disk yet.
download_data(dataset_loc, sections_scrape)

# Step 2: build the per-section frames and compute the sentiments.
make_df(
    dataset_loc,
    sections_scrape,
    dataset_procesed_loc,
    calculate_sentiment=True,
)

## Adding data to SQLite

In [5]:
import pandas as pd

# NOTE(review): absolute, machine-specific path; this cell only runs where
# that exact file exists. Consider a path relative to the project instead.
df = pd.read_csv('/Users/rishabhshah/Desktop/aipi510proj/food_inflation_analysis/data/dataset_business_with_sentiments_groupby.csv')

# Remove the unnamed column if the file has one (presumably a saved index;
# confirm). Unlike `del`, this does not fail when the column is absent.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [6]:
import sqlite3

# Open the project database one directory above the working directory.
# NOTE(review): the connection is never closed in this cell; close it once
# no later cell needs `conn`.
conn = sqlite3.connect('../food_inflation_analysis.db')

# Store the frame as table `news_sentiments`, replacing an existing table
# of that name.
df.to_sql(name='news_sentiments', con=conn, if_exists='replace', index=False)

# Read the table back and preview the first five rows.
query = 'SELECT * FROM news_sentiments'
new_sentiments_data = pd.read_sql(query, conn)
print(new_sentiments_data.head())

  MonthYear  Negative  Neutral  Positive MedianSentiment
0   1991-12         1        0         0        Negative
1   1992-09         0        1         0         Neutral
2   1994-11         1        0         0        Negative
3   1996-02         1        0         0        Negative
4   1997-02         1        2         1         Neutral
