In [9]:
%load_ext autoreload
%autoreload 2

from dotenv import load_dotenv, find_dotenv
from openai import OpenAI

import pandas as pd
import numpy as np
import json
import yaml
import os

import sys 
sys.path.append("../src")

from utils import count_tokens, group_documents

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
# Load environment variables and config.
# load_dotenv() exports the variables found in the nearest .env file (located
# by find_dotenv()) into the process environment. The API key is NOT assigned
# to the OpenAI class here: the client built further down in this notebook is
# given OPENAI_API_KEY explicitly, and an attribute set on the class itself is
# never read by client instances.
_ = load_dotenv(find_dotenv())


# Project settings (input/output directories and file names used below).
# safe_load only builds plain Python objects from the YAML document.
with open('../config/config.yaml', 'r', encoding='utf-8') as file:
    config = yaml.safe_load(file)

In [12]:
def get_completion(prompt, model="gpt-3.5-turbo", temperature=0):
    """Send ``prompt`` as a single user message to the chat completions API.

    The request goes through the module-level ``client`` object.

    Args:
        prompt: Text placed in the ``content`` field of the one user message.
        model: Model name forwarded to the API call.
        temperature: Sampling temperature forwarded to the API call.

    Returns:
        Whatever ``client.chat.completions.create`` returns, unmodified.
        Callers in this notebook read the generated text from
        ``.choices[0].message.content``.
    """
    user_message = {"role": "user", "content": prompt}
    return client.chat.completions.create(
        messages=[user_message],
        model=model,
        temperature=temperature,
    )


def get_prompt(chunk):
    """Build the summarization prompt for one batch of speeches.

    Args:
        chunk: The speech text to summarize. It is interpolated as-is between
            triple backticks on the last line of the prompt.

    Returns:
        The complete prompt string, including the leading blank lines and the
        indentation of the template below.
    """
    return f"""

        Your task is to generate a summary for a set of 
         speeches of a European Member of Parliament.

        Summarize the text, delimited by triple 
        backticks, so that it becomes clear which are the topics 
        the European Member of Parliament
        cares the most.

        Makes sure the summary do not exceed the 500 words.

        Speeches: ```{chunk}```
        """

# Shared API client used by get_completion(). The key comes from the
# OPENAI_API_KEY environment variable; passing it explicitly is optional
# because that is already the client's default.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))


# Get Data

In [13]:
# NOTE: despite its name this holds the PARENT of the working directory
# (dirname of the absolute cwd). The data files are resolved one level
# further up, from `data_root`.
current_directory = os.path.dirname(os.path.abspath(os.getcwd()))
data_root = os.path.dirname(current_directory)

# Speeches Paths (directory and file names come from the YAML config)
speeches_cfg = config['meps_speeches_dataframe']
translations_cfg = config['meps_speeches_en_translation']
input_path = os.path.join(speeches_cfg['output_dir'], speeches_cfg['filename'])
input_path_tr = os.path.join(translations_cfg['output_dir'], translations_cfg['filename'])
input_meps_list = config['meps_list']['output_dir']

# Speeches Dataframe
speeches = pd.read_csv(os.path.join(data_root, input_path))
# The translations file is read as pipe-separated with the two column names
# supplied here (so its first line is treated as data, not as a header).
translations = pd.read_csv(
    os.path.join(data_root, input_path_tr),
    sep='|',
    names=['Url', 'Translation'],
)

# Join Speeches to translations and add metadata.
# Left join: every speech is kept; those without a matching Url get NaN in
# the Translation column.
df = speeches.merge(translations, on='Url', how='left')

In [14]:
# If language is English replace column Translation with the actual Content.
# Vectorised with Series.where: rows where the mask is True keep the value
# from Content, every other row takes the existing Translation value. This
# replaces the row-by-row Python loop that built a list of the same values.
is_english = df['Language'] == 'EN'
df['Translation'] = df['Content'].where(is_english, df['Translation'])

# Column names in capital letters.
# NOTE: after this rename the mixed-case column names used above no longer
# exist, so this cell can only be run once per freshly merged `df`.
df.columns = [x.upper() for x in df.columns]

df.head(2)

Unnamed: 0,MP,DATE,LANGUAGE,TITLE,URL,CONTENT,COUNTRY,POLITICALGROUP,ID,NATIONALPOLITICALGROUP,TRANSLATION
0,Daniel CASPARY,2023-11-21,DE,EU/New Zealand Free Trade Agreement (debate),https://www.europarl.europa.eu/doceo/document/...,"Daniel Caspary, Berichterstatter. – Herr Pr...",Germany,Group of the European People's Party (Christia...,28219,Christlich Demokratische Union Deutschlands,
1,Daniel CASPARY,2023-11-21,DE,EU/New Zealand Free Trade Agreement (debate),https://www.europarl.europa.eu/doceo/document/...,"Daniel Caspary (PPE), Frage nach dem Verfah...",Germany,Group of the European People's Party (Christia...,28219,Christlich Demokratische Union Deutschlands,


# Summarization with ChatGPT

In [15]:
# Keep only the rows whose COUNTRY column equals 'Italy'
ita = df.loc[df['COUNTRY'].eq('Italy')]

In [293]:
# # Italian Meps
# ita_meps = ita['MP'].unique()

# # Select Mep
# mep = np.random.choice(ita_meps)
# print(f"Selected Mep: {mep}")


In [17]:
# Italian Meps
target_meps = ita['MP'].unique()

# NOTE(review): the loop has always started at index 2, i.e. it skips the
# first two MEPs, with no explanation -- presumably a leftover from resuming
# an interrupted run. Behaviour kept; set to 0 to process every MEP.
FIRST_MEP_INDEX = 2
# Size limit handed to group_documents for each chunk.
# Presumably a token budget -- confirm against utils.group_documents.
CHUNK_SIZE = 3000
verbose = False

# The output directory is the same for every MEP: resolve and create it once.
outpath = os.path.join(os.path.dirname(current_directory),
                       config['meps_speeches_summaries']['output_dir'])
os.makedirs(outpath, exist_ok=True)


def _summarize(text):
    """Return the model's summary of ``text``, or None when the request fails.

    Both the API call and the extraction of the message content are guarded,
    so one failing request does not abort the whole run. Only ``Exception``
    is caught, which leaves KeyboardInterrupt free to stop the loop.
    """
    try:
        response = get_completion(get_prompt(text))
        return response.choices[0].message.content
    except Exception as exc:
        print(f"Could not process summarization task: {exc}")
        return None


# Subset for specific Mep

for mep in target_meps[FIRST_MEP_INDEX:]:

    sub = df[df['MP'] == mep].sort_values(by='DATE')
    print(f"Processing {mep}")
    print(f"Found a total of {len(sub)} speeches")

    # Get list of speeches. Speeches with no translation are NaN (a float,
    # not text) after the left join, so they are dropped before chunking.
    # A dedicated name is used so the `speeches` DataFrame loaded earlier in
    # the notebook is not overwritten.
    mep_texts = sub['TRANSLATION'].dropna().tolist()
    print(f'Found a total of {len(mep_texts)} speeches with an English text')

    if not mep_texts:
        print(f"No text to summarize for {mep}, skipping")
        continue

    chunks = group_documents(mep_texts, CHUNK_SIZE)

    # One summary per chunk; failed chunks are reported and left out.
    summaries = []
    for chunk in chunks:
        summary = _summarize(chunk)
        if summary is None:
            continue
        summaries.append(summary)

        if verbose:
            print(summary)
            print("-----------")

    if not summaries:
        print(f"No summary could be generated for {mep}, skipping")
        continue

    if len(summaries) > 1:
        # Second pass: summarize the concatenation of the chunk summaries
        # (not the last raw chunk) to obtain a single overall summary.
        tot_contents = " ".join(summaries)
        print("Total token of concatenate summaries", count_tokens(tot_contents))

        final_summary = _summarize(tot_contents)
        if final_summary is None:
            print(f"Final summary failed for {mep}; saving chunk summaries only")
        else:
            print("Generate final summary from concatenated contents")
    else:
        final_summary = summaries[0]

    print(f"Completing and saving for {mep}")

    # create final output and save as json
    filename = f'speeches_summary_{mep}.json'
    out_dict = {'mep': mep,
                'summaries': summaries,
                'final_summary': final_summary,
                }

    with open(os.path.join(outpath, filename), 'w', encoding='utf-8') as json_file:
        json.dump(out_dict, json_file)

    

In [2]:
# Create a single Json with all summaries
# NOTE: `current_directory` is assigned in the "Get Data" cell; evaluating
# this cell on a fresh kernel before that cell has run raises NameError.
# TODO: merging the per-MEP JSON files is not implemented yet -- this cell
# currently only displays the value of `current_directory`.

current_directory

NameError: name 'current_directory' is not defined