# ETL pipeline using AI

In this part of the project, the main goal is to use OpenAI's ChatGPT to generate movie reviews and recommendations based on the data retrieved in the previous steps.

## 1 Imports

In [None]:
from time import sleep
import tkinter as tk
import tkinter.filedialog
import json
import pandas as pd
import openai

## 2 Extract

Extract movie titles from the CSV file.

In [None]:
# Load the scraped documentary dataset produced by the previous step
source_csv = '../files/scraped_documentary_movies.csv'
documentary_df = pd.read_csv(source_csv)

# Keep the 'movie' column as a plain Python list of titles
titles = documentary_df.loc[:, 'movie'].tolist()

## 3 Transform

1. Get credentials from config file with Tkinter and JSON libraries
2. Integrate openAI API key
3. Create function to generate movie reviews and recommendations based on titles
4. Generate reviews and recommendations and store them into different variables
5. Do some data cleaning with both reviews and recommendations

In [None]:
# Get rid of the root window popup
root = tk.Tk()
root.withdraw()

# Select credentials file
filepath = tk.filedialog.askopenfilename()
if not filepath:
    # The dialog returns an empty value when it is cancelled; fail with a
    # clear message instead of an opaque error from open('').
    raise FileNotFoundError('No credentials file was selected.')

# Read the credentials file. The context manager closes the handle even if
# parsing fails, and binary mode lets json detect the file's UTF encoding
# instead of relying on the platform's locale default.
with open(filepath, 'rb') as file:
    credentials = json.load(file)
openai_api_key = credentials['openai_api_key']

# Connect openAI API
openai.api_key = openai_api_key

# Function to generate movie reviews and recommendations
def generate_ai_movie_content(title, content='review', *, model='gpt-3.5-turbo-16k-0613'):
    """Ask the OpenAI chat API for a review of, or recommendations for, a movie.

    Args:
        title: Movie title inserted into the prompt.
        content: 'review' to request a review (200 words max, no rating) or
            'recommendation' to request three recommended movie titles.
        model: Name of the chat model passed to the API.

    Returns:
        The text of the first completion choice, with leading and trailing
        double quotes stripped.

    Raises:
        ValueError: If ``content`` is neither 'review' nor 'recommendation'.
    """
    # Validate before calling the API: an unsupported value would otherwise be
    # sent as a None message and only fail remotely.
    if content == 'review':
        prompt = f"Create a movie review for {title}. Don't include rating (200 words max)"
    elif content == 'recommendation':
        prompt = f'List three movies you would recommend to people who loved {title}. Provide only the recommended movie titles.'
    else:
        raise ValueError(
            f"content must be 'review' or 'recommendation', got {content!r}"
        )

    # NOTE(review): openai.ChatCompletion is the pre-1.0 SDK interface;
    # confirm the installed openai version still provides it.
    completion = openai.ChatCompletion.create(
        model=model,
        messages=[
            {
                'role': 'system',
                'content': 'You are a film critic.'
            },
            {
                'role': 'user',
                'content': prompt
            }
        ]
    )
    return completion.choices[0].message.content.strip('"')

In [None]:
# Accumulators filled by the generation loop, one entry per processed title.
# They are two independent lists (not aliases of the same object).
reviews, recommendations = [], []

In [None]:
def _clean_recommendation_titles(raw_text):
    """Split the model's answer into one cleaned movie title per non-empty line."""
    import re  # local import keeps this cell independent of the imports cell

    cleaned_titles = []
    for line in raw_text.split('\n'):
        # Drop a leading list marker such as "1. " or "2) " first, so digits
        # that belong to the title itself (e.g. "13th", "Apollo 11") survive.
        line = re.sub(r'^\s*\d+[.)]\s+', '', line)
        cleaned = ''.join(c for c in line if c.isalnum() or c == ' ').strip()
        # Skip blank lines so they do not become empty recommendations
        if cleaned:
            cleaned_titles.append(cleaned)
    return cleaned_titles


# Generate reviews and recommendations while doing some data cleaning.
# Start after the titles that already have generated content: a fresh run
# covers every title, and re-running this cell after a failure resumes where it
# stopped, keeping row i of the results aligned with titles[i].
for title in titles[len(reviews):]:
    review = generate_ai_movie_content(title, content='review')
    raw_recommendations = generate_ai_movie_content(title, content='recommendation')
    # Append only after both requests succeeded so the two lists never end up
    # with different lengths.
    # NOTE(review): each review is stored as a one-element list, as before;
    # confirm whether downstream consumers rely on that shape.
    reviews.append([review])
    recommendations.append(_clean_recommendation_titles(raw_recommendations))
    # Sleep to avoid going over the openai rate per minute (rpm=3 for gpt-3.5)
    sleep(40)

## 4 Load

1. Create new dataframe for reviews and recommendations
2. Combine dataframes
3. Create new csv file

In [None]:
# Wrap the generated content in its own dataframe, one column per content type
generated_content = {'review': reviews, 'recommendations': recommendations}
generated_movie_content_df = pd.DataFrame(generated_content)

# Put the generated columns next to the scraped ones (rows are matched on the
# index) and write the result to a new CSV file without the index column
output_path = '../files/documentary_movies_dataset_ai_enriched.csv'
combined_dfs = pd.concat(
    objs=[documentary_df, generated_movie_content_df],
    axis='columns',
)
combined_dfs.to_csv(output_path, index=False)