In [13]:
import pandas as pd
import openai
from src import wiki, news, twitter
import os

import warnings
from pprint import pprint

warnings.filterwarnings("ignore")

In [14]:
# Azure OpenAI
# The endpoint URL and key are read from the environment so that credentials
# are never saved inside the notebook. Set these before starting the kernel:
#   OPENAI_API_BASE - your Azure OpenAI endpoint URL
#   OPENAI_API_KEY  - your Azure OpenAI API key
# Unset variables fall back to "" (the notebook's previous placeholder value).
openai.api_type = "azure"
openai.api_version = "2022-12-01"
openai.api_base = os.environ.get("OPENAI_API_BASE", "")
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

# Read Data

In [31]:
# Load the source passages (first CSV column is the index) and keep only the
# first ten rows for this run.
df = pd.read_csv("data/fifa_data.csv", index_col=0).head(10)
df.head()

Unnamed: 0,source,text,words
0,wiki,The 2022 FIFA World Cup was an international f...,354
1,wiki,The FIFA World Cup is a professional football ...,129
2,wiki,"Unlike previous FIFA World Cups, which are typ...",276
3,wiki,"In April 2022, FIFA announced the prizes for a...",54
4,wiki,The tournament featured new substitution rules...,99


# Generate Questions 

In [16]:
def get_questions(context):
    """Generate a numbered list of questions answerable from ``context``.

    Args:
        context: The passage of text to write questions about.

    Returns:
        The raw completion text. The prompt already ends with ``1.``, so the
        returned text continues right after that marker; ``"\\n\\n"`` is passed
        as the stop sequence. Returns ``""`` if the request raises.
    """
    try:
        response = openai.Completion.create(
            engine="text-davinci-003-base",
            prompt=f"Write all possible questions that can be answered from the text below. \n\nText: {context}\n\nQuestions:\n1.",
            temperature=0,
            max_tokens=400,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            stop=["\n\n"]
        )
        return response['choices'][0]['text']
    except Exception as e:
        # Best effort: one failed request must not abort the whole batch.
        # Catch Exception (not a bare except) so KeyboardInterrupt/SystemExit
        # still stop the run, and report the error instead of hiding it.
        print(e)
        return ""


# The prompt ends with "1.", so each completion starts after that marker;
# put the marker back so every row holds a complete numbered list.
df["questions"] = "1." + df["text"].apply(get_questions)

In [17]:
# Print the fifth source passage in full.
print(df.iloc[4]["text"])

The tournament featured new substitution rules whereby teams could make up to five substitutions in normal time, and an additional substitution in extra time. In addition, it was the first World Cup to feature concussion substitutions, whereby each team was permitted to use a maximum of one concussion substitute during a match. A concussion substitution did not count towards a team's quota of regular substitutions. Iranian goalkeeper Alireza Beiranvand suffered a concussion in his country's opening match against England and was replaced by Hossein Hosseini. This was the first use of a dedicated concussion substitute during a World Cup.


In [18]:
# Example: the questions generated for the fifth passage

print(df.iloc[4]["questions"])

1. How many substitutions were teams allowed to make in normal time during the tournament?
2. How many additional substitutions were teams allowed to make in extra time?
3. How many concussion substitutions were teams permitted to use during a match?
4. Who was the first goalkeeper to be replaced by a concussion substitute during a World Cup?
5. What match was the first use of a dedicated concussion substitute during a World Cup?


# Get Answers

Generate an answer to each question, using the source text as context.

In [19]:
def get_answers(row):
    """Ask the completion model to answer a row's questions from its text.

    Args:
        row: Object exposing ``text`` (the context passage) and ``questions``
            (the numbered question list) as attributes, e.g. a DataFrame row.

    Returns:
        The raw completion text, which continues after the ``1.`` that ends
        the prompt. If anything raises, the exception is printed and ``""``
        is returned.
    """
    try:
        create_completion = openai.Completion.create
        answer_prompt = f"Write answers to the questions below based on the context given in the text below. \n\nText: {row.text}\n\nQuestions:\n{row.questions}\n\nAnswers:\n1."
        completion = create_completion(
            engine="text-davinci-003-base",
            prompt=answer_prompt,
            temperature=0,
            max_tokens=500,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
        )
        first_choice = completion['choices'][0]
        return first_choice['text']
    except Exception as err:
        # Best effort: report the failure and let the caller carry on.
        print(err)
        return ""


# The prompt ends with "1.", so each completion starts after that marker;
# put the marker back so every row holds a complete numbered list.
df['answers'] = "1." + df.apply(get_answers, axis=1)

# A failed generation returns "", which the prefix turns into a bare "1." -
# never NaN - so dropna() alone would keep those rows. Blank them out first
# so that rows without usable questions or answers are really dropped.
for qa_column in ('questions', 'answers'):
    df[qa_column] = df[qa_column].where(df[qa_column].str.strip() != "1.")

# reset_index(drop=True) discards the old index whatever its name is.
df = df.dropna().reset_index(drop=True)

In [20]:
# Questions for the fifth passage:

print(df.iloc[4]["questions"])

1. How many substitutions were teams allowed to make in normal time during the tournament?
2. How many additional substitutions were teams allowed to make in extra time?
3. How many concussion substitutions were teams permitted to use during a match?
4. Who was the first goalkeeper to be replaced by a concussion substitute during a World Cup?
5. What match was the first use of a dedicated concussion substitute during a World Cup?


In [21]:
# Answers for the fifth passage:

print(df.iloc[4]["answers"])

1. Teams were allowed to make up to five substitutions in normal time.
2. Teams were allowed to make an additional substitution in extra time.
3. Teams were permitted to use a maximum of one concussion substitute during a match.
4. Iranian goalkeeper Alireza Beiranvand was the first goalkeeper to be replaced by a concussion substitute during a World Cup.
5. The first use of a dedicated concussion substitute during a World Cup was in Iran's opening match against England.


# Prepare Q&A

Transform the questions and answers from numbered lists into Python lists.

In [22]:
import re

# A candidate list marker: digits at the start of the text or right after
# whitespace, followed by a dot and a space (e.g. "1. ", "\n12. ").
_ITEM_MARKER = re.compile(r'(?<!\S)(\d+)\. ')


def split_text(text):
    """Split a numbered list ("1. foo\\n2. bar") into its items.

    A "<number>. " candidate is treated as an item marker only if it starts a
    line or carries the next expected item number. Numbers that merely end a
    sentence in the middle of a line ("... held in 1930. It was ...") are
    therefore left inside the item instead of splitting it in two.
    A mid-line sentence that happens to end with exactly the next item number
    is still indistinguishable from a marker and will split.

    Args:
        text: The numbered list as a single string.

    Returns:
        List of item strings without their markers, stripped of surrounding
        whitespace; empty items are omitted. Text in front of the first
        marker, if any, is returned as the first item.
    """
    pieces = []
    cursor = 0      # where the text of the current item begins
    expected = 1    # number the next in-sequence marker should carry
    for marker in _ITEM_MARKER.finditer(text):
        number = int(marker.group(1))
        line_begin = text.rfind("\n", 0, marker.start()) + 1
        starts_line = not text[line_begin:marker.start()].strip()
        if not starts_line and number != expected:
            continue  # a number ending a sentence, not a list marker
        pieces.append(text[cursor:marker.start()])
        cursor = marker.end()
        expected = number + 1
    pieces.append(text[cursor:])
    return [piece.strip() for piece in pieces if piece.strip()]

# Turn both numbered-list strings into Python lists of items.
for qa_column in ("questions", "answers"):
    df[qa_column] = df[qa_column].apply(split_text)

Transform the dataframe into long format, where each row is one question–answer pair.

In [23]:
# Question list for the fifth passage
df["questions"].iloc[4]

['How many substitutions were teams allowed to make in normal time during the tournament?',
 'How many additional substitutions were teams allowed to make in extra time?',
 'How many concussion substitutions were teams permitted to use during a match?',
 'Who was the first goalkeeper to be replaced by a concussion substitute during a World Cup?',
 'What match was the first use of a dedicated concussion substitute during a World Cup?']

In [24]:
# Keep only rows whose question list and answer list have the same length,
# so that questions and answers can be paired up one-to-one.
lengths_agree = df.apply(
    lambda row: len(row["questions"]) == len(row["answers"]),
    axis=1,
)
df = df.loc[lengths_agree]
df.head()

Unnamed: 0,source,text,words,questions,answers
0,wiki,The 2022 FIFA World Cup was an international f...,354,[What was the 22nd edition of the FIFA World C...,"[The 2022 FIFA World Cup, From 20 November to ..."
1,wiki,The FIFA World Cup is a professional football ...,129,"[When was the first FIFA World Cup held?, How ...",[The first FIFA World Cup was held in 1930 in ...
2,wiki,"Unlike previous FIFA World Cups, which are typ...",276,[What months was the 2022 World Cup played in?...,[The 2022 World Cup was played in November and...
3,wiki,"In April 2022, FIFA announced the prizes for a...",54,[When did FIFA announce the prizes for all par...,[FIFA announced the prizes for all participati...
4,wiki,The tournament featured new substitution rules...,99,[How many substitutions were teams allowed to ...,[Teams were allowed to make up to five substit...


In [25]:
# Show up to eleven rows of the filtered frame.
df.head(n=11)

Unnamed: 0,source,text,words,questions,answers
0,wiki,The 2022 FIFA World Cup was an international f...,354,[What was the 22nd edition of the FIFA World C...,"[The 2022 FIFA World Cup, From 20 November to ..."
1,wiki,The FIFA World Cup is a professional football ...,129,"[When was the first FIFA World Cup held?, How ...",[The first FIFA World Cup was held in 1930 in ...
2,wiki,"Unlike previous FIFA World Cups, which are typ...",276,[What months was the 2022 World Cup played in?...,[The 2022 World Cup was played in November and...
3,wiki,"In April 2022, FIFA announced the prizes for a...",54,[When did FIFA announce the prizes for all par...,[FIFA announced the prizes for all participati...
4,wiki,The tournament featured new substitution rules...,99,[How many substitutions were teams allowed to ...,[Teams were allowed to make up to five substit...
5,wiki,The bidding procedure to host the 2018 and 202...,191,[How many bids were made for the 2018 FIFA Wor...,[11 bids were made for the 2018 FIFA World Cup...
6,wiki,There have been allegations of bribery and cor...,668,[What allegations of corruption have been made...,[Allegations of bribery and corruption in the ...
7,wiki,"At an estimated cost of over $220 billion, it ...",58,[What is the estimated cost of the most expens...,[The estimated cost of the most expensive Worl...
8,wiki,The first five proposed venues for the World C...,459,[What were the terms of reference for the desi...,[The terms of reference for the design of the ...
9,wiki,Team base camps \nBase camps were used by the ...,102,"[What is the purpose of team base camps?, How ...",[The purpose of team base camps is to stay and...


In [26]:
# Flatten to one row per question/answer pair. The pairs are collected in a
# plain list and the frame is built once; concatenating inside the loop
# re-copies the accumulated frame on every iteration.
qa_pairs = [
    (question, answer)
    for questions, answers in zip(df["questions"], df["answers"])
    for question, answer in zip(questions, answers)
]

# Naming the columns here keeps them defined even when there are no pairs.
df_long = pd.DataFrame(qa_pairs, columns=["prompt", "completion"])

In [27]:
# Last five question/answer pairs
df_long.tail(5)

Unnamed: 0,prompt,completion
68,When did FIFA announce the hotels and training...,FIFA announced the hotels and training sites f...
69,How many teams were within a 10 km radius of e...,24 of the 32 teams were within a 10 km radius ...
70,Was the 2022 World Cup the most compact since ...,"Yes, the 2022 World Cup was the most compact s..."
71,Did players need to take flights to matches fo...,"No, players did not need to take flights to ma..."
72,Could players remain at the same training base...,"Yes, players could remain at the same training..."


Number of questions & answers in the document

In [28]:
# Row count of the long-format frame
df_long.shape[0]

73

# Save generated Q&A

In [None]:
# Write one JSON record per line (JSON Lines).
# NOTE(review): the input is read from "data/" but this writes to
# "fifa-data/" - confirm that the different directory is intended.
output_path = 'fifa-data/fifa_qa_gen.jsonl'

# to_json does not create missing directories, so make sure the target exists.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_long.to_json(output_path, orient="records", lines=True)