# MLFLow Setup

In [2]:
import mlflow 
import mlflow.sklearn

experiment_name = "experiment_few_shot"

if not mlflow.get_experiment_by_name(experiment_name):
    mlflow.create_experiment(name=experiment_name) 
    
mlflow.start_run() 
mlflow.autolog()

# Importing libraries

In [4]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import json
import re 
from random import shuffle
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

2022/12/14 22:57:41 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


In [5]:
video_dataset = pd.read_csv('USvideos.csv')

In [6]:
all_categories = {
    "1":"Film & Animation",
    "2":"Autos & Vehicles",
    "10":"Music",
    "15":"Pets & Animals",
    "17":"Sports",
    "18":"Short Movies",
    "19":"Travel & Events",
    "20":"Gaming",
    "21":"Videoblogging",
    "22":"People & Blogs",
    "23":"Comedy",
    "24":"Entertainment",
    "25":"News & Politics",
    "26":"How to & Style",
    "27":"Education",
    "28":"Science & Technology",
    "29":"Nonprofits & Activism",
    "30":"Movies",
    "31":"Anime/Animation",
    "32":"Action/Adventure",
    "33":"Classics",
    "34":"Comedy",
    "35":"Documentary",
    "36":"Drama",
    "37":"Family",
    "38":"Foreign",
    "39":"Horror",
    "40":"Sci-Fi/Fantasy",
    "41":"Thriller",
    "42":"Shorts",
    "43":"Shows",
    "44":"Trailers"
  }

In [7]:
video_dataset["cateogory_name"] = ""
 
for index, row in tqdm(video_dataset.iterrows(), total=video_dataset.shape[0]):
    cat_id = row['category_id']
    cat_name = all_categories[str(cat_id)]
    video_dataset.at[index,'cateogory_name'] = cat_name

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 40949/40949 [00:06<00:00, 5923.00it/s]


In [8]:
relevant_columns = ["video_id", "title", "description", "category_id","cateogory_name"]
video_dataset = video_dataset[relevant_columns]

In [10]:
RANDOM_SEED = 40
df_train, df_test = train_test_split(video_dataset.head(500), 
                                     test_size=0.2, 
                                     random_state=RANDOM_SEED)

# Few Shot Learning with SetFit

In [11]:
from setfit import SetFitClassifier

2022/12/14 22:59:01 INFO mlflow.tracking.fluent: Autologging successfully enabled for tensorflow.


In [12]:
# takes a sentence-transformers model
clf = SetFitClassifier("paraphrase-MiniLM-L3-v2")

In [13]:
docs = df_train['title'].to_list()
labels = df_train['cateogory_name'].to_list()

In [14]:
clf.fit(docs, labels)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/250 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [15]:
df_test['predicted'] = clf.predict(df_test['title'].to_list())

In [16]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

cm = confusion_matrix(df_test['cateogory_name'],
                      df_test['predicted'])

disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()

<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x2df7a6790>

In [17]:
from sklearn.metrics import confusion_matrix, classification_report

print(classification_report(df_test["cateogory_name"], 
                            df_test["predicted"]))

                      precision    recall  f1-score   support

    Autos & Vehicles       1.00      0.50      0.67         2
              Comedy       0.80      0.67      0.73         6
           Education       1.00      0.25      0.40         4
       Entertainment       0.53      0.71      0.61        24
    Film & Animation       1.00      0.75      0.86         4
      How to & Style       0.67      0.50      0.57         8
               Music       0.86      0.90      0.88        20
     News & Politics       0.67      1.00      0.80         2
      People & Blogs       0.50      0.40      0.44        10
      Pets & Animals       1.00      1.00      1.00         4
Science & Technology       1.00      0.80      0.89         5
              Sports       0.73      0.80      0.76        10
     Travel & Events       1.00      1.00      1.00         1

            accuracy                           0.71       100
           macro avg       0.83      0.71      0.74       100
      

In [18]:
mlflow.end_run()