# Movie Embeddings

Use ML.NET to apply Principal Component Analysis (PCA) to reduce the dimensionality of OpenAI Ada embedding vectors, then plot the results in a 2-D scatter plot.

## Install packages

In [1]:
#r "nuget: Microsoft.ML, 3.0.0-preview.23266.6"
#r "nuget: Azure.AI.OpenAI, 1.0.0-beta.7"
#r "nuget: Plotly.NET, 4.2.0"
#r "nuget: Plotly.NET.Interactive, 4.2.0"
#r "nuget: Plotly.NET.CSharp, 0.11.1"

Loading extensions from `/home/vscode/.nuget/packages/plotly.net.interactive/4.2.0/interactive-extensions/dotnet/Plotly.NET.Interactive.dll`

In [2]:
using Microsoft.ML;
using Microsoft.ML.Data;
using Azure.AI.OpenAI;
using Plotly.NET.CSharp;

## Configure OpenAI Client

In [3]:
// Reads a required environment variable and fails fast, naming the variable,
// when it is missing or blank. Without this check a missing value is only
// noticed later, when it is passed to a constructor or an API call.
string RequireEnvironmentVariable(string name)
{
    var value = Environment.GetEnvironmentVariable(name);
    if (string.IsNullOrWhiteSpace(value))
    {
        throw new InvalidOperationException(
            $"Environment variable '{name}' is not set. Set it before running this notebook.");
    }

    return value;
}

// Azure OpenAI connection settings: service endpoint URL, API key, and the
// deployment id passed to the embeddings call.
var AOAI_ENDPOINT = RequireEnvironmentVariable("AOAI_ENDPOINT");
var AOAI_KEY = RequireEnvironmentVariable("AOAI_KEY");
var AOAI_DEPLOYMENTID = RequireEnvironmentVariable("AOAI_DEPLOYMENTID");

In [4]:
// Build the Azure OpenAI client from the endpoint URL and API key read from
// the environment.
Uri endpoint = new(AOAI_ENDPOINT);
Azure.AzureKeyCredential credentials = new(AOAI_KEY);
OpenAIClient openAIClient = new(endpoint, credentials);

## Define data types

In [5]:
/// <summary>
/// One row of input data: a movie title paired with its embedding vector.
/// </summary>
public class MLEmbedding
{
    /// <summary>The movie title the embedding was generated for.</summary>
    public string Title { get; set; }

    /// <summary>
    /// The embedding values for <see cref="Title"/>, declared as a
    /// fixed-size vector of 1536 floats.
    /// </summary>
    [VectorType(1536)]
    public float[] Embedding { get; set; }
}

## Define movie titles

In [6]:
// Titles to embed and plot. "Frozen" is currently commented out, so nine
// titles are active.
string[] movieTitles =
{
    "The Lion King",
    "Inception",
    // "Frozen",
    "Titanic",
    "Shrek",
    "The Shawshank Redemption",
    "Toy Story",
    "The Dark Knight",
    "Jurassic Park",
    "Pulp Fiction",
};

## Generate embeddings

In [7]:
// Request one embedding per movie title and pair it with the title.
//
// The query is materialized with ToArray() on purpose: Select is lazily
// evaluated, so without it every enumeration of movieEmbeddings (displaying
// it, or each pass a consumer makes over it) would call the embeddings
// endpoint again for every title.
var movieEmbeddings =
    movieTitles
        .Select(title =>
        {
            var embeddingOptions = new EmbeddingsOptions(title);

            // The explicit Embeddings type unwraps the service response value.
            Embeddings embeddingResponse = openAIClient.GetEmbeddings(AOAI_DEPLOYMENTID, embeddingOptions);

            // One input string was sent, so the first data item is read.
            var embedding = embeddingResponse.Data[0].Embedding.ToArray();
            return new MLEmbedding { Title = title, Embedding = embedding };
        })
        .ToArray();

In [8]:
// Trailing expression (no semicolon): the notebook renders each title with a preview of its embedding.
movieEmbeddings

index,value
,
,
,
,
,
,
,
,
,
0,"Submission#5+MLEmbeddingTitleThe Lion KingEmbedding[ -0.0072870334, -0.02391955, -0.009901325, -0.021671748, -0.010072354, 0.020792173, -0.01705398, -0.027462283, -0.012643889, -0.015136017, 0.022514673, 0.007372548, 0.024872422, -0.020877687, -0.0049628792, 0.0039244923, 0.0327153, 0.0015575805, 0.027291253, 0.004813229 ... (1516 more) ]"

Unnamed: 0,Unnamed: 1
Title,The Lion King
Embedding,"[ -0.0072870334, -0.02391955, -0.009901325, -0.021671748, -0.010072354, 0.020792173, -0.01705398, -0.027462283, -0.012643889, -0.015136017, 0.022514673, 0.007372548, 0.024872422, -0.020877687, -0.0049628792, 0.0039244923, 0.0327153, 0.0015575805, 0.027291253, 0.004813229 ... (1516 more) ]"

Unnamed: 0,Unnamed: 1
Title,Inception
Embedding,"[ 0.0097372085, -0.014701405, -0.0112139415, -0.028585332, -0.01925027, 0.013897113, -0.030589469, -0.026172455, -0.0014190482, -0.012117121, 0.028189778, 0.022230105, 0.012710452, -0.009631727, 0.013593854, 0.0075418865, 0.03425493, 0.0048719, 0.010963424, -0.013738891 ... (1516 more) ]"

Unnamed: 0,Unnamed: 1
Title,Titanic
Embedding,"[ -0.015436796, -0.040282685, 0.003478291, -0.03432181, -0.010611962, 0.00022699941, -0.026422983, 0.015597179, 0.0073976335, -0.020208169, 0.0040496527, 0.026449714, 0.019780483, -0.013485478, -9.621894E-05, 0.0019780484, 0.017708879, 0.0055565783, 0.026422983, -0.033332787 ... (1516 more) ]"

Unnamed: 0,Unnamed: 1
Title,Shrek
Embedding,"[ 0.007319695, -0.057858665, -0.01167532, -0.018146355, -0.0067955223, -0.0027862266, -0.0191947, -0.026558075, -0.00029991718, -0.031899642, 0.022127569, 0.006224549, 0.02840516, -0.0018985654, 0.0102775255, 0.011438194, 0.031050982, -0.00024395084, 0.012417897, -0.02085458 ... (1516 more) ]"

Unnamed: 0,Unnamed: 1
Title,The Shawshank Redemption
Embedding,"[ -0.0013066616, -0.04665496, -0.0036220287, -0.029472824, -0.019280242, 0.011694777, -0.009211809, -0.02209841, -0.023091597, -0.009360787, 0.017927025, 0.0044693416, 0.0127872825, 0.009050416, -0.005490462, 0.012638304, 0.03344557, -0.017629068, 0.012092051, -0.029224526 ... (1516 more) ]"

Unnamed: 0,Unnamed: 1
Title,Toy Story
Embedding,"[ -0.009728087, -0.028136225, -0.008030655, -0.03114531, -0.017012902, 0.0014498901, -0.015984153, -0.0046647238, 0.0014756088, -0.027441822, 0.012062057, -0.015971296, 0.011168333, -0.011116896, 0.006719003, 0.013990957, 0.02051064, -0.0061210436, 0.03659767, -0.025075704 ... (1516 more) ]"

Unnamed: 0,Unnamed: 1
Title,The Dark Knight
Embedding,"[ -0.011200612, -0.02613683, -0.012652658, -0.031771265, -0.0158484, 0.0031259325, -0.01756107, -0.02554112, -0.015935276, -0.017871337, 0.030430915, 0.00040334614, 0.015711883, -0.003844199, -0.007886969, -0.007986254, 0.02653397, 0.004138952, 0.023121042, -0.012758149 ... (1516 more) ]"

Unnamed: 0,Unnamed: 1
Title,Jurassic Park
Embedding,"[ -0.00052101345, -0.04108107, -0.005039389, -0.017897967, -0.026068226, 0.007334082, -0.019366061, -0.02364268, -0.0054606674, -0.007959617, 0.004267044, -0.001073144, 0.018242652, -0.007021314, 0.014412843, 0.013353263, 0.037021473, 0.0029425689, 0.02184267, -0.0045415135 ... (1516 more) ]"

Unnamed: 0,Unnamed: 1
Title,Pulp Fiction
Embedding,"[ -0.0002570929, -0.028310657, -0.0059965346, -0.026362676, -0.015661767, 0.008084121, -0.011129465, -0.012135922, -0.014428046, -0.030492395, 0.016986394, 0.013895597, 0.010025609, -0.009162004, -0.008304892, -0.00028976216, 0.04095955, 0.0033862402, 0.023193961, -0.022466714 ... (1516 more) ]"


## Initialize MLContext

In [9]:
// Entry point for the ML.NET data-loading and transform operations used in later cells.
MLContext ctx = new();

## Load movie title embedding data into IDataView

In [10]:
// Expose the title/embedding rows to ML.NET as an IDataView.
IDataView dv = ctx.Data.LoadFromEnumerable<MLEmbedding>(movieEmbeddings);

## Inspect IDataView Schema

In [11]:
// Trailing expression (no semicolon) so the notebook can display the schema of the loaded data.
dv.Schema

## Define Principal Component Analysis pipeline

In [12]:
// PCA estimator: reads the "Embedding" column and writes its projection onto
// two principal components (rank 2) to a new "PCA" column.
var pipeline = ctx.Transforms.ProjectToPrincipalComponents(
    outputColumnName: "PCA",
    inputColumnName: "Embedding",
    rank: 2);


## Run PCA on movie embedding data

In [13]:
// Fit the PCA estimator on the embeddings, then apply the fitted transformer
// to the same data to get the projected rows.
var pcaModel = pipeline.Fit(dv);
var transformedDv = pcaModel.Transform(dv);

## Get the titles and computed principal components

In [14]:
// Read the movie titles and their projected components back out of the
// transformed data, one entry per row.
IEnumerable<string> titles = transformedDv.GetColumn<string>("Title");
IEnumerable<float[]> pc = transformedDv.GetColumn<float[]>("PCA");

## Plot movies

In [15]:
// Split each projected row into plot coordinates: the first component is the
// x value, the second is the y value.
var x = pc.Select(components => components[0]);
var y = pc.Select(components => components[1]);

In [16]:
// Point labels: one movie title per plotted point.
var labels = titles.ToList();

// Scatter plot of the two principal components, with each title drawn below
// its point. No trailing semicolon, so the notebook renders the chart.
Chart.Point<float, float, string>(
        x: x,
        y: y,
        MultiText: labels,
        TextPosition: Plotly.NET.StyleParam.TextPosition.BottomCenter)
    .WithXAxisStyle<int, int, string>(Title: Plotly.NET.Title.init("Component 1"))
    .WithYAxisStyle<int, int, string>(Title: Plotly.NET.Title.init("Component 2"))