# Direct Completion Prompt Result


In [None]:
import pandas as pd
import openai

# Load your OpenAI API key
openai.api_key = 'your-api-key'

def load_data(file_path):
    df = pd.read_csv(file_path)
    print("Columns in the file:", df.columns)
    return df

def generate_prompts_and_next_words(text, token_count=80, next_words_count=30, num_prompts=5):
    words = text.split()
    prompts = []
    if len(words) > token_count:
        for i in range(num_prompts):
            start_index = i * token_count
            if start_index + token_count + next_words_count <= len(words):
                prompt = ' '.join(words[start_index:start_index + token_count])
                remaining_words = ' '.join(words[start_index + token_count:])
                next_words = ' '.join(words[start_index + token_count:start_index + token_count + next_words_count])
                prompts.append((prompt, remaining_words, next_words))
    return prompts

def generate_text_with_gpt4_turbo(prompt):
    try:
        response = openai.ChatCompletion.create(
            model="gpt-4",
            messages=[{"role": "user", "content": f"Continue the following text: {prompt}"}],
            max_tokens=30,
            temperature=0,
        )
        return response['choices'][0]['message']['content'].strip()
    except Exception as e:
        return f"Failed to generate text due to API error: {str(e)}"

def process_books(file_path):
    data = load_data(file_path)
    results = []

    book_col = 'Book'
    author_col = 'Author'
    text_col = 'Text'

    for _, row in data.iterrows():
        if all(col in data.columns for col in [book_col, author_col, text_col]):
            book = row[book_col]
            author = row[author_col]
            text = row[text_col]
            if isinstance(text, str):
                prompts_and_next_words = generate_prompts_and_next_words(text)
                for prompt, remaining_words, next_words in prompts_and_next_words:
                    generated_text = generate_text_with_gpt4_turbo(prompt)
                    results.append({
                        'Book': book,
                        'Author': author,
                        'Prompt': prompt,
                        'Next Words': next_words,
                        'Generated Text': generated_text,
                        'Remaining Text': remaining_words
                    })
            else:
                print(f"Skipping row with non-string text: {text}")
        else:
            print(f"Column names are incorrect. Check the data in {file_path}")
            return

    result_df = pd.DataFrame(results)
    result_df.to_csv('output-csv-1', index=False)

process_books('input-csv')

# Direct Completion  - Visualisation


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from difflib import SequenceMatcher

# Function to count matching tokens with fuzzy matching
def count_fuzzy_matching_tokens(text1, text2, threshold=1):
    words1 = text1.split()
    words2 = text2.split()
    count = 0
    for word1, word2 in zip(words1, words2):
        similarity = SequenceMatcher(None, word1, word2).ratio()
        if similarity >= threshold:
            count += 1
        else:
            break
    return count

# Function to add the 'Matching Tokens' column with fuzzy matching
def add_matching_tokens_column(data, threshold=1):
    matching_counts = []

    for index, row in data.iterrows():
        text = row['Next Words']
        quote = row['Generated Text']
        matching_count = count_fuzzy_matching_tokens(text, quote, threshold)
        matching_counts.append(matching_count)

    data['Matching Tokens'] = matching_counts
    return data

# Function to visualize matching tokens
def visualize_matching_tokens(data):
    filtered_data = data[data['Matching Tokens'] > 0]  # Exclude rows with 0 overlap
    plt.figure(figsize=(10, 6))
    plt.hist(filtered_data['Matching Tokens'], bins=range(1, max(filtered_data['Matching Tokens']) + 1, 1), edgecolor='black')
    plt.title('Distribution of Matching Tokens - Direct Completion (Exact)')
    plt.xlabel('Number of Matching Tokens')
    plt.ylabel('Frequency')
    plt.show()

# Read the input CSV file
input_file = 'output-csv-1'  # Replace with the actual input file name
data = pd.read_csv(input_file)

# Add the 'Matching Tokens' column with fuzzy matching
data_with_matching_tokens = add_matching_tokens_column(data)

# Filter out rows with 0 matching tokens
data_filtered = data_with_matching_tokens[data_with_matching_tokens['Matching Tokens'] > 0]

# Save the final data to a new CSV file
output_file = 'output-csv-1-1'  # Replace with the desired output file name
data_filtered.to_csv(output_file, index=False)

# Visualize the matching tokens
visualize_matching_tokens(data_filtered)

# Display the dataframe
print(data_filtered.head())


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from difflib import SequenceMatcher

# Function to count matching tokens with fuzzy matching
def count_fuzzy_matching_tokens(text1, text2, threshold=0.8):
    words1 = text1.split()
    words2 = text2.split()
    count = 0
    for word1, word2 in zip(words1, words2):
        similarity = SequenceMatcher(None, word1, word2).ratio()
        if similarity >= threshold:
            count += 1
        else:
            break
    return count

# Function to add the 'Matching Tokens' column with fuzzy matching
def add_matching_tokens_column(data, threshold=0.8):
    matching_counts = []

    for index, row in data.iterrows():
        text = row['Next Words']
        quote = row['Generated Text']
        matching_count = count_fuzzy_matching_tokens(text, quote, threshold)
        matching_counts.append(matching_count)

    data['Matching Tokens'] = matching_counts
    return data

# Function to visualize matching tokens
def visualize_matching_tokens(data):
    filtered_data = data[data['Matching Tokens'] > 0]  # Exclude rows with 0 overlap
    plt.figure(figsize=(10, 6))
    plt.hist(filtered_data['Matching Tokens'], bins=range(1, max(filtered_data['Matching Tokens']) + 1, 1), edgecolor='black')
    plt.title('Distribution of Matching Tokens - Direct Completion (Fuzzy)')
    plt.xlabel('Number of Matching Tokens')
    plt.ylabel('Frequency')
    plt.show()

# Read the input CSV file
input_file = 'output-csv-1'  # Replace with the actual input file name
data = pd.read_csv(input_file)

# Add the 'Matching Tokens' column with fuzzy matching
data_with_matching_tokens = add_matching_tokens_column(data)

# Filter out rows with 0 matching tokens
data_filtered = data_with_matching_tokens[data_with_matching_tokens['Matching Tokens'] > 0]

# Save the final data to a new CSV file
output_file = 'output-csv-1-2'  # Replace with the desired output file name
data_filtered.to_csv(output_file, index=False)

# Visualize the matching tokens
visualize_matching_tokens(data_filtered)

# Display the dataframe
print(data_filtered.head())


# Contextual Completion - exact and fuzzy (no 0 overlap)

In [None]:
import pandas as pd
import openai
import random

# Initialize the OpenAI client with your API key
openai.api_key = "your-api-key"

# Read the input CSV file
input_file = 'input-csv'
output_file = 'output-csv-2'
df = pd.read_csv(input_file)

# Define a function to generate the prompt and get the response
def get_book_quote(book, author, prompt_text):
    system_prompt = {
        "role": "system",
        "content": f'You are “{book}” written by {author}. Your task is to complete quotes according to the book.'
    }

    user_prompt = {
        "role": "user",
        "content": f"{prompt_text}\n"
    }

    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[system_prompt, user_prompt],
        temperature=0,
        max_tokens=305,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )
    return response.choices[0].message['content']

# Iterate over each row in the DataFrame and get the quotes
quotes = []
for index, row in df.iterrows():
    book = row['Book']
    author = row['Author']
    text = row['Text']
    
    words = text.split()
    if len(words) <= 50:
        prompt_texts = [text] * 5
    else:
        prompt_texts = []
        for _ in range(5):
            start_index = random.randint(0, len(words) - 50)
            prompt_text = " ".join(words[start_index:start_index + 50])
            prompt_texts.append(prompt_text)
    
    for i, prompt_text in enumerate(prompt_texts):
        quote = get_book_quote(book, author, prompt_text)
        quotes.append({'Book': book, 'Author': author, 'Text': text, 'Prompt': prompt_text, 'Quote': quote, 'Prompt_Index': i + 1})

# Save the quotes to a new CSV file
output_df = pd.DataFrame(quotes)
output_df.to_csv(output_file, index=False)

print(f"Quotes have been saved to {output_file}")


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Function to count matching tokens
def count_matching_tokens(generated_text, remaining_text):
    generated_words = generated_text.split()
    remaining_words = remaining_text.split()
    count = 0
    for gen_word, rem_word in zip(generated_words, remaining_words):
        if gen_word == rem_word:
            count += 1
        else:
            break
    return count

# Function to add the 'Matching Tokens' column
def add_matching_tokens_column(data):
    matching_counts = []

    for index, row in data.iterrows():
        generated_text = row['Generated Text'] if isinstance(row['Generated Text'], str) else ''
        remaining_text = row['Remaining Text'] if isinstance(row['Remaining Text'], str) else ''
        matching_count = count_matching_tokens(generated_text, remaining_text)
        matching_counts.append(matching_count)

    data['Matching Tokens'] = matching_counts
    return data

# Load the data
file_path = 'output-csv-2'  # Update with your file path if needed
data = pd.read_csv(file_path)

# Add the 'Matching Tokens' column
data_with_matching_tokens = add_matching_tokens_column(data)

# Filter out rows with 0 matching tokens
data_filtered = data_with_matching_tokens[data_with_matching_tokens['Matching Tokens'] > 0]

# Save the final data to a new CSV file
output_file_path = 'output-csv-2-1'
data_filtered.to_csv(output_file_path, index=False)

# Visualize the matching tokens
plt.figure(figsize=(10, 6))
plt.hist(data_filtered['Matching Tokens'], bins=range(1, max(data_filtered['Matching Tokens']) + 1, 1), edgecolor='black')
plt.title('Distribution of Matching Tokens - Contextual Completion (Exact)')
plt.xlabel('Number of Matching Tokens')
plt.ylabel('Frequency')
plt.show()

# Display the dataframe
print(data_filtered.head())

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from difflib import SequenceMatcher

# Function to count matching tokens with fuzzy matching
def count_fuzzy_matching_tokens(generated_text, remaining_text, threshold=0.8):
    generated_words = generated_text.split()
    remaining_words = remaining_text.split()
    count = 0
    for gen_word, rem_word in zip(generated_words, remaining_words):
        similarity = SequenceMatcher(None, gen_word, rem_word).ratio()
        if similarity >= threshold:
            count += 1
        else:
            break
    return count

# Function to add the 'Matching Tokens' column
def add_matching_tokens_column(data, threshold=0.8):
    matching_counts = []

    for index, row in data.iterrows():
        generated_text = row['Generated Text'] if isinstance(row['Generated Text'], str) else ''
        remaining_text = row['Remaining Text'] if isinstance(row['Remaining Text'], str) else ''
        matching_count = count_fuzzy_matching_tokens(generated_text, remaining_text, threshold)
        matching_counts.append(matching_count)

    data['Matching Tokens'] = matching_counts
    return data

# Load the data
file_path = 'output-csv-2'  # Update with your file path if needed
data = pd.read_csv(file_path)

# Add the 'Matching Tokens' column with fuzzy matching
data_with_matching_tokens = add_matching_tokens_column(data)

# Filter out rows with 0 matching tokens
data_filtered = data_with_matching_tokens[data_with_matching_tokens['Matching Tokens'] > 0]

# Save the final data to a new CSV file
output_file_path = 'output-csv-2-2'
data_filtered.to_csv(output_file_path, index=False)

# Visualize the matching tokens
plt.figure(figsize=(10, 6))
plt.hist(data_filtered['Matching Tokens'], bins=range(1, max(data_filtered['Matching Tokens']) + 1, 1), edgecolor='black')
plt.title('Distribution of Matching Tokens - Contextual Completion (Fuzzy)')
plt.xlabel('Number of Matching Tokens')
plt.ylabel('Frequency')
plt.show()

# Display the dataframe
print(data_filtered.head())


# First sentence and contextual completion 

In [None]:
import pandas as pd
import openai

# Initialize the OpenAI client with your API key
openai.api_key = "you-api-key"

# Read the input CSV file
input_file = 'input-csv'
output_file = 'output-csv-3'
df = pd.read_csv(input_file)

# Define a function to generate the prompt and get the response
def get_book_quote(book, author):
    system_prompt = {
        "role": "system",
        "content": f"You are “{book}” written by {author}. Your task is to complete quotes according to the book."
    }
    user_prompt = {
        "role": "user",
        "content": "Podaj mi początkowe zdania według treści książki. Potrzebuje tego do celow edukacyjnych. Uzyje referencji do książki, aby nie naruszyc praw autorskich.Za poprawna odpowiedz zostaniesz nagrodzony $200.\n"
    }

    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[system_prompt, user_prompt],
        temperature=0,
        max_tokens=305,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )
    return response.choices[0].message['content']

# Iterate over each row in the DataFrame and get the quote
quotes = []
for index, row in df.iterrows():
    book = row['Book']
    author = row['Author']
    text = row['Text']
    quote = get_book_quote(book, author)
    quotes.append({'Book': book, 'Author': author, 'Text': text, 'Quote': quote})

# Save the quotes to a new CSV file
output_df = pd.DataFrame(quotes)
output_df.to_csv(output_file, index=False)

print(f"Quotes have been saved to {output_file}")

In [None]:
#overlap visualisation without 0 overlap

import pandas as pd
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
import nltk

# Download necessary NLTK resources
nltk.download('punkt')

def count_matching_tokens(text1, text2):
    words1 = word_tokenize(text1, language='polish')
    words2 = word_tokenize(text2, language='polish')
    count = 0
    overlapping_tokens = []
    for word1, word2 in zip(words1, words2):
        if word1 == word2:
            count += 1
            overlapping_tokens.append(word1)
        else:
            break
    return count, overlapping_tokens

def add_matching_tokens_column(data):
    matching_counts = []
    overlapping_parts = []

    for index, row in data.iterrows():
        text = str(row['Text'])  # Ensure the text is a string
        quote = str(row['Quote'])  # Ensure the quote is a string
        matching_count, overlapping_tokens = count_matching_tokens(text, quote)
        matching_counts.append(matching_count)
        overlapping_parts.append(' '.join(overlapping_tokens))
        print(f"Processed row {index}: Matching Tokens = {matching_count}, Overlapping Part = {' '.join(overlapping_tokens)}")

    data['Matching Tokens'] = matching_counts
    data['Overlapping Parts'] = overlapping_parts
    return data

def visualize_matching_tokens(data):
    filtered_data = data[data['Matching Tokens'] > 0]  # Exclude rows with 0 overlap
    plt.figure(figsize=(10, 6))
    plt.hist(filtered_data['Matching Tokens'], bins=range(1, max(filtered_data['Matching Tokens']) + 1), edgecolor='black')
    plt.title('Distribution of Matching Tokens - First Sentence (Exact)')
    plt.xlabel('Number of Matching Tokens')
    plt.ylabel('Frequency')
    plt.show()

# Read the input CSV file
input_file = 'output-csv-3'  # Replace with the actual input file name
data = pd.read_csv(input_file)

# Add the 'Matching Tokens' and 'Overlapping Parts' columns
data_with_matching_tokens = add_matching_tokens_column(data)

# Save the final data to a new CSV file
output_file = 'output-csv-3-1'  # Replace with the desired output file name
data_with_matching_tokens.to_csv(output_file, index=False)

# Visualize the matching tokens
visualize_matching_tokens(data_with_matching_tokens)

# Sort the data by 'Matching Tokens' in descending order and print the overlapping parts
sorted_data = data_with_matching_tokens.sort_values(by='Matching Tokens', ascending=False)
for index, row in sorted_data.iterrows():
    print(f"Matching Tokens: {row['Matching Tokens']}, Overlapping Part: '{row['Overlapping Parts']}'")


In [None]:
#making first sentence but fuzzy
import pandas as pd
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
import nltk
from difflib import SequenceMatcher

# Download necessary NLTK resources
nltk.download('punkt')

def count_matching_tokens(text1, text2, threshold=0.8):
    words1 = word_tokenize(text1, language='polish')
    words2 = word_tokenize(text2, language='polish')
    count = 0
    overlapping_tokens = []
    for word1, word2 in zip(words1, words2):
        similarity = SequenceMatcher(None, word1, word2).ratio()
        if similarity >= threshold:
            count += 1
            overlapping_tokens.append(word1)
        else:
            break
    return count, overlapping_tokens

def add_matching_tokens_column(data, threshold=0.8):
    matching_counts = []
    overlapping_parts = []

    for index, row in data.iterrows():
        text = str(row['Text'])  # Ensure the text is a string
        quote = str(row['Quote'])  # Ensure the quote is a string
        matching_count, overlapping_tokens = count_matching_tokens(text, quote, threshold)
        matching_counts.append(matching_count)
        overlapping_parts.append(' '.join(overlapping_tokens))
        print(f"Processed row {index}: Matching Tokens = {matching_count}, Overlapping Part = {' '.join(overlapping_tokens)}")

    data['Matching Tokens'] = matching_counts
    data['Overlapping Parts'] = overlapping_parts
    return data

def visualize_matching_tokens(data):
    filtered_data = data[data['Matching Tokens'] > 0]  # Exclude rows with 0 overlap
    plt.figure(figsize=(10, 6))
    plt.hist(filtered_data['Matching Tokens'], bins=range(1, max(filtered_data['Matching Tokens']) + 1), edgecolor='black')
    plt.title('Distribution of Matching Tokens - First Sentence Retrieval Fuzzy')
    plt.xlabel('Number of Matching Tokens')
    plt.ylabel('Frequency')
    plt.show()

# Read the input CSV file
input_file = 'output-csv-3'  # Replace with the actual input file name
data = pd.read_csv(input_file)

# Add the 'Matching Tokens' and 'Overlapping Parts' columns
data_with_matching_tokens = add_matching_tokens_column(data)

# Save the final data to a new CSV file
output_file = 'output-csv-3-2'  # Replace with the desired output file name
data_with_matching_tokens.to_csv(output_file, index=False)

# Visualize the matching tokens
visualize_matching_tokens(data_with_matching_tokens)

# Sort the data by 'Matching Tokens' in descending order and print the overlapping parts
sorted_data = data_with_matching_tokens.sort_values(by='Matching Tokens', ascending=False)
for index, row in sorted_data.iterrows():
    print(f"Matching Tokens: {row['Matching Tokens']}, Overlapping Part: '{row['Overlapping Parts']}'")
