In [35]:
import os
import pandas as pd
import openai
from docx import Document
from pptx import Presentation
import time
from transformers import GPT2Tokenizer

In [43]:
os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY #input your key here

In [44]:
import statsmodels.api as sm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [45]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

def chunk_text(text, max_chunk_size):
    #TODO: improve chunking logic
    words = word_tokenize(text)
    chunks = [words[i:i + max_chunk_size] for i in range(0, len(words), max_chunk_size)]
    return ["".join(chunk) for chunk in chunks]

def chat(input_value, max_tokens, engine = 'text-davinci-003'):
#Note: The openai-python library support for Azure OpenAI is in preview. 
    openai.api_type = "azure"
    openai.api_base = "https://eastus.api.cognitive.microsoft.com/"
    openai.api_version = "2022-12-01"
    openai.api_key = os.getenv("OPENAI_API_KEY")

    response = openai.Completion.create(
      engine=engine,
      prompt=f"{input_value}\n\n",
      temperature=1,
      max_tokens=max_tokens,
      top_p=0.5,
      frequency_penalty=0,
      presence_penalty=0,
      stop=None)
    return response.choices[0].text


def word_tokenize(text):
    ids = tokenizer(text)['input_ids']
    return [tokenizer.decode(x) for x in ids]

## Functions for extracting useful information from data

In [112]:
def get_slope(data):
    summaries = {}
    for col in data.columns:
        model = sm.OLS(data[col], sm.tools.add_constant(data.index)).fit()
        summaries[col] = model.summary()
    return summaries

In [113]:
def get_outliers(data):
    summary = data.describe()
    # Calculate Z-score for each data point
    zscore = (data - summary.loc['mean'])/summary.loc['std']
    # Identify data points with Z-score > 3
    outlier_dict = {}
    for col in data.columns:
        outliers = data[col][data[col] > 3]
        outlier_dict[col] = outliers
    return outlier_dict

In [None]:
def geo_analysis(data, data_explanation = ""):
    prompt = 'Given the columns of a dataframe below, return the columns that are related to geography as a python list'
    geo_columns = chat(prompt + str(list(data.columns)), max_tokens = 1000)
    geo_columns = eval(geo_columns)
    results = {}
    for col in geo_columns:
        agg_data = data.groupby(col).mean(numeric_only=True)
        if agg_data.shape[1] == 0:
            continue
        agg_data = agg_data.sort_values(agg_data.columns[0], ascending=False)
        agg_data = agg_data.to_dict()
        results[col] = chat(data_explanation + "Summarize the key insights from the following data. The data is sorted from highest value to lower." + str(agg_data), 1000)
    return results    
    

In [227]:
# sepearate analysis if we're looking at geo vs nongeo data - we would likely want to expand the classification at some point
def is_geo_data(data):
    prompt = "Below is the first few rows of a dataframe. Return 1 if the data is geography-related, and 0 otherwise"
    return chat(prompt + str(data.head()), 10).replace('\n', '')

## Reading in data (i downloaded this manually)

In [114]:
mortality = pd.read_csv("/Users/amitmisra/Downloads/mortality.csv")

In [150]:
# this are examples of views that are showed - not sure if we have the 
mortality_by_country = mortality[mortality.Period == 2019][['ParentLocation', 'Location', 'FactValueNumeric']]
mortality_over_time = mortality.groupby('Period').FactValueNumeric.mean().reset_index()

In [220]:
def non_geo_analysis(data, data_explanation=""):
    slope_results = get_slope(data)
    slope_gpt = {}
    for key in slope_results:
        slope_gpt[key] = chat(data_explanation + f"Given the following model results for a linear regression model for column {key}, describe the key insights in layment's terms" + str(slope_results[key])[:3500], 500)
    outlier_results = get_outliers(data)
    outlier_gpt = {}
    for key in outlier_results:
        outlier_gpt[key] = chat(data_explanation + f"Given the following outliers for column {key}, return the main outliers with their corresponding {key} values:" + str(outlier_results[key])[:3500], 500)
    return slope_gpt, outlier_gpt

In [221]:
def return_insights(data, data_explanation = ""):
    if is_geo_data(data) == "1":
        gpt_results = geo_analysis(data, data_explanation)
    else:
        gpt_results = non_geo_analysis(data, data_explanation)
    final_result = chat(data_explanation + "Identify the key insights from the following analysis of the data and explain in simple terms" + str(gpt_results)[:3500], 500)
    return final_result


In [222]:
print(return_insights(mortality_over_time))


The key insight from this analysis is that the average period of time has decreased from 1990 to 2019. In 1990, the average period of time was 13.37 years, but in 2019, it was only 5.62 years, representing a decrease of over 50%. This suggests that the average period of time has been decreasing over the past few decades.


In [223]:
print(return_insights(mortality_over_time, "This is data on mortality rates for 5-14 year olds over time. FactValueNumeric is the mortality rate. "))


The key insights from the data are that mortality rates for 5-14 year olds have decreased significantly over time. In 1990, the mortality rate was 13.37 per 1000 people, and by 2019, it had decreased to 5.62 per 1000 people. This indicates that mortality rates for this age group have decreased by more than 50% over the past 30 years.


In [224]:
print(return_insights(mortality_by_country))


The key insights from this analysis are that countries in Africa tend to have higher FactValueNumeric values than those in other regions. This means that African countries generally have higher values for the data being analyzed than other countries in the world. Additionally, the highest FactValueNumeric value was found in Africa at 13.57, while the lowest value was found in Luxembourg at 0.38.


In [225]:
print(return_insights(mortality_by_country, "This is data on mortality rates for 5-14 year olds by country. FactValueNumeric is the mortality rate. "))


The key insights from this analysis are that mortality rates for 5-14 year olds vary widely by country and region. Africa has the highest mortality rate at 13.57, while Europe has the lowest mortality rate at 1.30. Countries in Africa and the Middle East have higher mortality rates than those in Europe and North America. This suggests that there are disparities in access to healthcare and other resources that could be contributing to higher mortality rates in certain countries and regions.
