In [1]:
!pip install python-pptx python-docx SpeechRecognition



In [2]:
!pip install pymongo PyPDF2 moviepy requests keybert transformers git+https://github.com/openai/whisper.git

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-ksj8g9uf
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-ksj8g9uf
  Resolved https://github.com/openai/whisper.git to commit ba3f3cd54b0e5b8ce1ab3de13e32122d0d5f98ab
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [3]:
# Download and install the spaCy model package en_core_web_lg.
# As the command's own output notes, the runtime may need a restart before the
# freshly installed package can be loaded.
# NOTE(review): presumably consumed by the annotation module — confirm.
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [4]:
# Mount Google Drive at /content/drive. Later cells read the project modules
# and the database host file from paths beneath this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
import sys

# Folder on the mounted Drive that is added to the module search path so that
# Python modules stored there can be imported by name.
MODULES_DIR = "/content/drive/MyDrive/diploma/modules/"
sys.path.append(MODULES_DIR)

In [6]:
import extractor
import annotation
from digital_footprint import DigitalFootprintType, DigitalFootprintDataType, create_df_from_object
from extractor import Extractor
from pipeline import Pipeline
from storage import MongoDBStorage
from service import MongoDBSearchService

In [7]:
# Configuration
# Relative working directory: it is created below when missing and handed to
# Extractor; the run log shows intermediate files being written to it and
# removed again afterwards.
base_folder = "temp/"

In [8]:
import os

# Make sure the working directory exists.
# makedirs(..., exist_ok=True) replaces the exists()+mkdir() pair: it is safe to
# re-run, has no check-then-create race, creates missing parent directories if
# base_folder is ever changed to a nested path, and raises (rather than silently
# continuing) when the path exists but is a regular file.
os.makedirs(base_folder, exist_ok=True)

In [9]:
# Extractor instance constructed with the working directory.
df_extractor = Extractor(base_folder)
# Alternative kept for reference:
# df_extractor = Extractor(base_folder, speech_recognition_model="medium")

# Registry: digital-footprint type value -> callable registered for that type.
# The document and video entries are module-level functions of `extractor`;
# the audio entry is a bound method of the instance created above.
extraction_methods = dict([
    (DigitalFootprintType.DOCUMENT.value, extractor.extract_text_from_document),
    (DigitalFootprintType.VIDEO.value, extractor.extract_audio_from_video),
    (DigitalFootprintType.AUDIO.value, df_extractor.extract_text_from_audio),
])

In [10]:
import nltk
# Fetch the NLTK 'stopwords' and 'punkt' data packages; per the recorded output
# they are left untouched when already up to date.
# NOTE(review): presumably required by the annotation module — confirm.
nltk.download('stopwords')
# Last expression of the cell: its return value is what the cell displays.
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
annotator = annotation.Annotator()

# Registry: data type value -> list of annotator callables for that data type.
# Only the text data type is registered here.
text_data_type = DigitalFootprintDataType.TEXT.value
text_annotation_steps = [
    annotator.get_named_entities_from_text,
    annotator.get_keywords_from_text,
    annotator.get_topics_from_text,
    annotator.get_summary_from_text,
]
annotation_methods = {text_data_type: text_annotation_steps}


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



In [12]:
# The MongoDB host URL is kept in a file on Drive rather than in the notebook.
DB_HOST_FILE = "/content/drive/MyDrive/diploma/modules/db_host"

with open(DB_HOST_FILE, "r", encoding="utf-8") as file:
    # strip() instead of rstrip(): stray leading whitespace or blank lines
    # would corrupt the URL just as much as a trailing newline.
    host = file.read().strip()

# Fail here with a clear message instead of handing an empty host to the
# storage layer and getting an obscure connection error later.
if not host:
    raise ValueError(f"Database host file {DB_HOST_FILE} is empty")

storage = MongoDBStorage(parameters={"host_url": host})


In [13]:
# Build the pipeline from the extractor, the two method registries, the storage
# backend and a search service that wraps the same storage.
search_service = MongoDBSearchService(storage)
my_pipeline = Pipeline(
    df_extractor,
    extraction_methods,
    annotation_methods,
    storage,
    search_service,
)

In [17]:
# Convert everything storage.find_not_annotated() yields into digital-footprint
# objects, logging the stored id of each one on the way.
df_list = []
not_annotated_df = storage.find_not_annotated()
for stored_record in not_annotated_df:
    footprint = create_df_from_object(stored_record)
    print(f"ID of DF is {stored_record['_id']}")
    df_list.append(footprint)
print(f"Length of df to annotate: {len(df_list)}")

ID of DF is 661931f5d172007aa8b18204
ID of DF is 661931fcd172007aa8b18205
ID of DF is 661931ffd172007aa8b18206
Length of df to annotate: 3


In [18]:
def _redact_secrets(value, secret_keys=("auth_data", "token", "access_token", "refresh_token"), _depth=0):
    """Return a display-safe copy of ``value`` with credential fields masked.

    Dicts are copied with every key listed in ``secret_keys`` replaced by the
    string ``"<redacted>"``; lists and tuples are processed element by element
    (and returned as lists); objects exposing ``__dict__`` are rendered as
    ``{TypeName: redacted attributes}``. Anything else is returned unchanged.
    Nesting deeper than 8 levels is replaced by ``"<...>"`` so that cyclic
    structures cannot recurse forever. The input itself is never modified.
    """
    if _depth > 8:
        return "<...>"
    if isinstance(value, dict):
        return {
            key: "<redacted>" if key in secret_keys
            else _redact_secrets(item, secret_keys, _depth + 1)
            for key, item in value.items()
        }
    if isinstance(value, (list, tuple)):
        return [_redact_secrets(item, secret_keys, _depth + 1) for item in value]
    if hasattr(value, "__dict__") and not isinstance(value, type):
        return {type(value).__name__: _redact_secrets(vars(value), secret_keys, _depth + 1)}
    return value


# Run extraction + annotation for every collected footprint; results are saved
# as they are produced (save_immediately=True).
annotated_df = my_pipeline.process_data(df_list, save_immediately=True)

# The returned records carry auth_data (an OAuth bearer token). Printing them
# verbatim stores the token in the saved notebook output, so mask it first.
print(_redact_secrets(annotated_df))

Extracting token from digital footprint object...
Downloading content of digital footprint by link https://www.googleapis.com/drive/v3/files/15jaagrIA5i8vq19TSsbO21acoTWablnh?alt=media
MoviePy - Writing audio in temp/f2f4aec8d63b4c88aa742b98e31db6b1.wav




MoviePy - Done.
Audio extraction for video - temp/f2f4aec8d63b4c88aa742b98e31db6b1 completed in 0 hours, 0 minutes, 24 seconds
Text extraction for video - temp/f2f4aec8d63b4c88aa742b98e31db6b1 completed in 0 hours, 9 minutes, 43 seconds
Named Entity Recognition for https://www.googleapis.com/drive/v3/files/15jaagrIA5i8vq19TSsbO21acoTWablnh?alt=media completed in 0 hours, 0 minutes, 1 seconds




Keywords extraction for https://www.googleapis.com/drive/v3/files/15jaagrIA5i8vq19TSsbO21acoTWablnh?alt=media completed in 0 hours, 0 minutes, 32 seconds
Topic generation for https://www.googleapis.com/drive/v3/files/15jaagrIA5i8vq19TSsbO21acoTWablnh?alt=media completed in 0 hours, 0 minutes, 0 seconds
Summary generation for https://www.googleapis.com/drive/v3/files/15jaagrIA5i8vq19TSsbO21acoTWablnh?alt=media completed in 0 hours, 2 minutes, 55 seconds
Removing temp/f2f4aec8d63b4c88aa742b98e31db6b1
Removing temp/a0147382d57b49f39fababbf65b9541b
Removing temp/f2f4aec8d63b4c88aa742b98e31db6b1.wav
Removing temp/6b8f78b73842466b9e1b994c4162ef94
Extracting token from digital footprint object...
Downloading content of digital footprint by link https://www.googleapis.com/drive/v3/files/1PvfrmqfsIvHCUXCDmS8ZODBLWmfNdVwR?alt=media
MoviePy - Writing audio in temp/bf9df346834741dc9421fddbd492f208.wav




MoviePy - Done.
Audio extraction for video - temp/bf9df346834741dc9421fddbd492f208 completed in 0 hours, 0 minutes, 18 seconds
Text extraction for video - temp/bf9df346834741dc9421fddbd492f208 completed in 0 hours, 6 minutes, 17 seconds
Named Entity Recognition for https://www.googleapis.com/drive/v3/files/1PvfrmqfsIvHCUXCDmS8ZODBLWmfNdVwR?alt=media completed in 0 hours, 0 minutes, 0 seconds




Keywords extraction for https://www.googleapis.com/drive/v3/files/1PvfrmqfsIvHCUXCDmS8ZODBLWmfNdVwR?alt=media completed in 0 hours, 0 minutes, 18 seconds
Topic generation for https://www.googleapis.com/drive/v3/files/1PvfrmqfsIvHCUXCDmS8ZODBLWmfNdVwR?alt=media completed in 0 hours, 0 minutes, 0 seconds
Summary generation for https://www.googleapis.com/drive/v3/files/1PvfrmqfsIvHCUXCDmS8ZODBLWmfNdVwR?alt=media completed in 0 hours, 1 minutes, 36 seconds
Removing temp/bf9df346834741dc9421fddbd492f208.wav
Removing temp/bf9df346834741dc9421fddbd492f208
Extracting token from digital footprint object...
Downloading content of digital footprint by link https://www.googleapis.com/drive/v3/files/1IoP0fsZHebx1cCErycY3IxcNL5cN3LCO?alt=media
MoviePy - Writing audio in temp/c06d92e642914aac935b529090ef28cd.wav




MoviePy - Done.
Audio extraction for video - temp/c06d92e642914aac935b529090ef28cd completed in 0 hours, 0 minutes, 18 seconds
Text extraction for video - temp/c06d92e642914aac935b529090ef28cd completed in 0 hours, 7 minutes, 54 seconds
Named Entity Recognition for https://www.googleapis.com/drive/v3/files/1IoP0fsZHebx1cCErycY3IxcNL5cN3LCO?alt=media completed in 0 hours, 0 minutes, 0 seconds




Keywords extraction for https://www.googleapis.com/drive/v3/files/1IoP0fsZHebx1cCErycY3IxcNL5cN3LCO?alt=media completed in 0 hours, 0 minutes, 25 seconds
Topic generation for https://www.googleapis.com/drive/v3/files/1IoP0fsZHebx1cCErycY3IxcNL5cN3LCO?alt=media completed in 0 hours, 0 minutes, 0 seconds
Summary generation for https://www.googleapis.com/drive/v3/files/1IoP0fsZHebx1cCErycY3IxcNL5cN3LCO?alt=media completed in 0 hours, 2 minutes, 4 seconds
Removing temp/c06d92e642914aac935b529090ef28cd.wav
Removing temp/c06d92e642914aac935b529090ef28cd
[{'link': 'https://www.googleapis.com/drive/v3/files/15jaagrIA5i8vq19TSsbO21acoTWablnh?alt=media', 'type': 'video', 'created_at': datetime.datetime(2024, 4, 12, 16, 6, 46), 'annotated': True, '_id': '661931f5d172007aa8b18204', 'auth_data': {'type': 'TOKEN', 'token_type': 'Bearer', 'token': '<redacted>' … [output truncated; bearer token removed from the saved output]

In [None]:
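# Manual save step, left disabled: the process_data call above already runs with save_immediately=True.
# my_pipeline.save_annotated_df(annotated_df)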


In [None]:
# Scratch cell (not executed in the saved run): a bare literal with no effect on the pipeline.
1

In [None]:
# Scratch cell (not executed in the saved run): a bare literal with no effect on the pipeline.
1