In [1]:
### Load the required libraries

import warnings
warnings.filterwarnings('ignore')

import sqlite3

import numpy as np
import pandas as pd

import requests
import re
from bs4 import BeautifulSoup

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from tqdm import tqdm, tqdm_notebook
from sentence_transformers import SentenceTransformer, util

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

## **Step 1 - Reading the Tables from Database file**

In [2]:
# Read the code below and write your observation in the next cell

conn = sqlite3.connect('eng_subtitles_database.db')
cursor = conn.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
print(cursor.fetchall())

<sqlite3.Cursor at 0x221b2de23b0>

[('zipfiles',)]


## **Step 2 - Reading the columns of Table**

In [3]:
cursor.execute("PRAGMA table_info('zipfiles')")
cols = cursor.fetchall()
for col in cols:
    print(col[1])

<sqlite3.Cursor at 0x221b2de23b0>

num
name
content


## **Step 3 - Loading the Database Table inside a Pandas DataFrame**

In [4]:
df1 = pd.read_sql_query("""SELECT * FROM zipfiles""", conn)
df1.head()

Unnamed: 0,num,name,content
0,9180533,the.message.(1976).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x1c\xa9\x...
1,9180583,here.comes.the.grump.s01.e09.joltin.jack.in.bo...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x17\xb9\x...
2,9180592,yumis.cells.s02.e13.episode.2.13.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00L\xb9\x99V...
3,9180594,yumis.cells.s02.e14.episode.2.14.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00U\xa9\x99V...
4,9180600,broker.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x001\xa9\x99V...


In [5]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82498 entries, 0 to 82497
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   num      82498 non-null  int64 
 1   name     82498 non-null  object
 2   content  82498 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.9+ MB


## **Step 4 - Printing `content` of 0th Row**

In [6]:
b_data = df1.iloc[0, 2]

# here 2 represent the index of content column
# 0 represents the row number

In [7]:
print(b_data)

b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x1c\xa9\x99V\x9fx\x96\xf0\x8c\x9e\x00\x00\x86\x9b\x01\x00;\x00\x00\x00The.Message.1976.REMASTERED.1080p.BluRay.x264-PiGNUS.EN.srt\xad\xbdm\x93\xdc\xc6\x91.\xfa\x9d\x11\xfc\x0f-}\xe1=\x11-\x9d\x06P\x85\x17\x9d\x8d\xd5%%[\xa4-Y>&u\x15>\xdf\xd0\xd3\x98\x19x\xfae\x0cts<\xfe\xf57\x9f\'\xb3\n\xd9\xa4\xbc\xbb\xf7\xc6Fl\xacELW\xa2\xaa\x90\x95\x95\xafO\x16/_l6\xdf\xe0\xff\xea\xf5f\xb3Y}\xf5\xd5\xbf\xaf\xf4AQ\xae7Mx\xf9\xe2\xd7\xfe|s\xbf\xea\x8f\xcf\xab\x8f\xe3n8\xadN\xc7\xfdx\x1cVO\xe3\xf9~\xf5\xf3\xe3p\xfc\xea\xfd/o>\xbc\xfb\xf0\xe3\xef\xde\xbf|\xf1\xfbi\x18Vo\xa6\xd3\xd3<L\xab\xe1\x1f\xe7\xe18\x8f\xa7\xe37\xab\xd3\xbc\xdb~-\xc3\x1e\xfe\xa7<|\xf9\xe2\xe5\x8bR_[~S\xd6\xeb\xa2k\xf3k\xe5A\xb7\xeeb\xf5\xf2\xc5\xbb\xe3\xea|?\xac\x8e\xfdaX\x9dnW?\x9cvk>8\x9c\xe6\xf3\xean\xeao\xc6\xd3ev\x8f~\x1a\xa6\x9b\xf1\xf6\xb2\xff\x1a\xe4\xabD\xbe*d\x11\xa5#_U\xeb\xaa\xd9`\xa6\xa7\xc3\xea\xa7\xcb}\x7f8\xf4F\xf9\xa7a\x9e\x87\xe3\x9d\xcc\\\xdf\x07B!\x13\xaa\xd61n<!\xd9\xaf\xd0\

## **Step 5 - Unzipping the content of 385th row and decoding using `latin-1`**

In [8]:
import zipfile
import io

# Assuming 'content' is the binary data from your database
binary_data = df1.iloc[385, 2]

# Decompress the binary data using the zipfile module
with io.BytesIO(binary_data) as f:
    with zipfile.ZipFile(f, 'r') as zip_file:
        # Reading only one file in the ZIP archive
        subtitle_content = zip_file.read(zip_file.namelist()[0])

# Now 'subtitle_content' should contain the extracted subtitle content
print(subtitle_content.decode('latin-1'))  # Assuming the content is latin-1 encoded text

1
00:00:06,000 --> 00:00:12,074
Watch any video online with Open-SUBTITLES
Free Browser extension: osdb.link/ext

2
00:00:15,370 --> 00:00:16,506
You lose everything, my girl.

3
00:00:16,530 --> 00:00:19,360
So you've said - four times.

4
00:00:20,330 --> 00:00:22,120
I definitely had
it on yesterday.

5
00:00:22,465 --> 00:00:25,785
Your gloves, your keys, that
handkerchief I embroidered for you

6
00:00:25,809 --> 00:00:26,168
Everything!

7
00:00:26,192 --> 00:00:27,280
Five times.

8
00:00:31,610 --> 00:00:32,920
Miss Scarlet?
- Yes.

9
00:00:36,390 --> 00:00:37,390
I'm Miss Scarlet.

10
00:00:37,872 --> 00:00:40,880
May I inquire if
you've lost something?

11
00:00:41,350 --> 00:00:42,530
Some jewellery perhaps?

12
00:00:42,870 --> 00:00:45,130
Yes, my mother's wedding ring.

13
00:00:45,220 --> 00:00:45,840
Have you found it?

14
00:00:45,950 --> 00:00:47,656
Does your ring have
an inscription?

15
00:00:48,650 -->

## **Step 6 - Applying the above Function on the Entire Data**

In [9]:
tqdm.pandas()

In [10]:
import zipfile
import io

count = 0

def decode_method(binary_data):
    global count
    # Decompress the binary data using the zipfile module
    # print(count, end=" ")
    count += 1
    with io.BytesIO(binary_data) as f:
        with zipfile.ZipFile(f, 'r') as zip_file:
            # Assuming there's only one file in the ZIP archive
            subtitle_content = zip_file.read(zip_file.namelist()[0])
    
    # Now 'subtitle_content' should contain the extracted subtitle content
    return subtitle_content.decode('latin-1')  # Assuming the content is UTF-8 encoded text

In [11]:
df1['file_content'] = df1['content'].progress_apply(decode_method)

100%|██████████████████████████████████████████████████████████| 82498/82498 [02:08<00:00, 641.58it/s]


In [12]:
df1.head()

Unnamed: 0,num,name,content,file_content
0,9180533,the.message.(1976).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x1c\xa9\x...,"1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch an..."
1,9180583,here.comes.the.grump.s01.e09.joltin.jack.in.bo...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x17\xb9\x...,"1\r\n00:00:29,359 --> 00:00:32,048\r\nAh! Ther..."
2,9180592,yumis.cells.s02.e13.episode.2.13.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00L\xb9\x99V...,"1\r\n00:00:53,200 --> 00:00:56,030\r\n<i>Yumi'..."
3,9180594,yumis.cells.s02.e14.episode.2.14.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00U\xa9\x99V...,"1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch an..."
4,9180600,broker.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x001\xa9\x99V...,"ï»¿1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch..."


In [13]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82498 entries, 0 to 82497
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   num           82498 non-null  int64 
 1   name          82498 non-null  object
 2   content       82498 non-null  object
 3   file_content  82498 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.5+ MB


In [14]:
df = df1.sample(frac = 0.3, random_state = 42)

In [15]:
df

Unnamed: 0,num,name,content,file_content
17262,9251120,maybe.this.time.(2014).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x89\x9a\x...,"ï»¿1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch..."
7294,9211589,down.the.shore.s01.e10.and.justice.for.all.(19...,b'PK\x03\x04\x14\x00\x00\x00\x08\x007\x8f\x99V...,"1\r\n00:00:09,275 --> 00:00:11,876\r\n¶ Oh, I ..."
47707,9380845,uncontrollably.fond.s01.e07.heartache.(2016).e...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x8f\x19\x...,"1\r\n00:00:07,140 --> 00:00:14,220\r\n<i>Timin..."
29914,9301436,screen.two.s13.e04.the.precious.blood.(1996).e...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00[\xaa\x99V...,"1\r\n00:00:06,133 --> 00:00:08,900\r\n[etherea..."
54266,9408707,battlebots.(2015).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\xf4<\x9aV...,"ï»¿1\r\n00:00:01,480 --> 00:00:03,570\r\n[Chri..."
...,...,...,...,...
67460,9458807,kevin.can.wait.s01.e13.ring.worm.(2017).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\xebP\x9aV...,ï»¿[Script Info]\r\nTitle: Default file\r\nScr...
15296,9244890,bia.s01.e29.episode.1.29.(2019).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00:\x99\x99V...,"ï»¿1\r\n00:00:03,440 --> 00:00:06,160\r\n-Wher..."
40242,9345965,heroes.s02.e11.chapter.eleven.powerless.(2007)...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x85\r\x9a...,"ï»¿1\r\n00:00:01,101 --> 00:00:02,865\r\n<i>Pr..."
56391,9417351,hot.in.cleveland.s05.e09.bad.george.clooney.(2...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x891\x9aV...,"ï»¿1\r\n00:00:01,768 --> 00:00:03,168\r\n<i>- ..."


In [16]:
df.iloc[0,3]

'ï»¿1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch any video online with Open-SUBTITLES\r\nFree Browser extension: osdb.link/ext\r\n\r\n2\r\n00:00:37,328 --> 00:00:39,706\r\n<i>It could\'ve been\r\njust another summer.</i>\r\n\r\n3\r\n00:00:40,790 --> 00:00:43,042\r\n<i>But as I set foot on the sand,</i>\r\n\r\n4\r\n00:00:43,209 --> 00:00:46,212\r\n<i>that summer\r\nsuddenly felt different.</i>\r\n\r\n5\r\n00:00:55,221 --> 00:00:56,973\r\n<i>Like it was going to be the summer</i>\r\n\r\n6\r\n00:00:57,098 --> 00:00:59,142\r\n<i>that would change my life.</i>\r\n\r\n7\r\n00:00:59,350 --> 00:01:01,770\r\n<i>The summer of freedom.</i>\r\n\r\n8\r\n00:01:02,562 --> 00:01:05,607\r\n<i>The summer of\r\nendless possibilities.</i>\r\n\r\n9\r\n00:01:06,274 --> 00:01:09,402\r\n<i>The summer of 2007.</i>\r\n\r\n10\r\n00:01:16,493 --> 00:01:18,036\r\nOoh, aah!\r\n\r\n11\r\n00:01:24,459 --> 00:01:26,169\r\nOoh, oh!\r\n\r\n12\r\n00:01:26,377 --> 00:01:28,254\r\n<i>â\x99ª Oh, oh, ooh â\x99ª</i>\r\n\r\n13\

In [17]:
# Function to remove special characters, noise, and line numbers from the text data
def clean_text(text):
    # remove timestamps from subtitle documents
    cleaned_text = re.sub(r'\d+:\d+:\d+,\d+ --> \d+:\d+:\d+,\d+', '', text)
    # Remove line numbers
    cleaned_text = re.sub(r'\d+\s*', '', text)
    # Remove HTML tags
    cleaned_text = BeautifulSoup(cleaned_text, "html.parser").get_text(separator=" ")
    # Remove special characters, punctuation, and symbols
    cleaned_text = re.sub(r'[^\w\s]', '', cleaned_text)
    cleaned_text = re.sub(r'[ï]', '', cleaned_text)
    cleaned_text = re.sub(r'[âª]', '', cleaned_text)
    # Remove extra whitespace
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
    # change text to lower case
    # cleaned_text = cleaned_text.lower()
    return cleaned_text.strip()

df['file_content'] = df['file_content'].progress_apply(clean_text)

100%|███████████████████████████████████████████████████████████| 24749/24749 [07:56<00:00, 51.94it/s]


In [18]:
df.iloc[0,3]

'Watch any video online with OpenSUBTITLES Free Browser extension osdblinkext It couldve been just another summer But as I set foot on the sand that summer suddenly felt different Like it was going to be the summer that would change my life The summer of freedom The summer of endless possibilities The summer of Ooh aah Ooh oh Oh oh ooh That was the summer of you and me Youre quite the dancer Why did you stop Come on Keep dancing Whatever Im kidding Dont get mad Huh What Hey Im just going to get my towel What Stop that You thought I was gonna kiss you No Excuse me I wanna kiss you but not just yet What do you mean not yet Only when youre my girl What do you mean your girl My girlfriend Miss As if You wish And dont call me miss Dont pretend to be a gentleman when youre clearly not So what should I call you Rude Snob Bitch And you Douche Handsome Conceited Just like you Huh Jerk Exactly your type Leave me alone Steph Aha Steph Ill just call you Tep Remove the S and the F By the way Im Ton

In [19]:
df.head()

Unnamed: 0,num,name,content,file_content
17262,9251120,maybe.this.time.(2014).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x89\x9a\x...,Watch any video online with OpenSUBTITLES Free...
7294,9211589,down.the.shore.s01.e10.and.justice.for.all.(19...,b'PK\x03\x04\x14\x00\x00\x00\x08\x007\x8f\x99V...,Oh I know that its getting late but I dont wan...
47707,9380845,uncontrollably.fond.s01.e07.heartache.(2016).e...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x8f\x19\x...,Timing and Subtitles by The Uncontrollable Lov...
29914,9301436,screen.two.s13.e04.the.precious.blood.(1996).e...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00[\xaa\x99V...,ethereal music apiOpenSubtitlesorg is deprecat...
54266,9408707,battlebots.(2015).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\xf4<\x9aV...,Chris Oh no not the Minibots yelling Oh You le...


In [20]:
df = df.drop('content',axis=1)

In [21]:
df.head()

Unnamed: 0,num,name,file_content
17262,9251120,maybe.this.time.(2014).eng.1cd,Watch any video online with OpenSUBTITLES Free...
7294,9211589,down.the.shore.s01.e10.and.justice.for.all.(19...,Oh I know that its getting late but I dont wan...
47707,9380845,uncontrollably.fond.s01.e07.heartache.(2016).e...,Timing and Subtitles by The Uncontrollable Lov...
29914,9301436,screen.two.s13.e04.the.precious.blood.(1996).e...,ethereal music apiOpenSubtitlesorg is deprecat...
54266,9408707,battlebots.(2015).eng.1cd,Chris Oh no not the Minibots yelling Oh You le...


In [22]:
def chunk_text(text, chunk_size=500, overlap_size=50):
    if chunk_size <= 0 or overlap_size < 0:
        raise ValueError("Chunk size must be positive, and overlap size must be non-negative.")

    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunk = text[start:end]
        chunks.append(chunk)
        start += chunk_size - overlap_size

    return chunks

df['chunk_content'] = df['file_content'].progress_apply(chunk_text)

100%|█████████████████████████████████████████████████████████| 24749/24749 [00:09<00:00, 2522.01it/s]


In [23]:
df.head()

Unnamed: 0,num,name,file_content,chunk_content
17262,9251120,maybe.this.time.(2014).eng.1cd,Watch any video online with OpenSUBTITLES Free...,[Watch any video online with OpenSUBTITLES Fre...
7294,9211589,down.the.shore.s01.e10.and.justice.for.all.(19...,Oh I know that its getting late but I dont wan...,[Oh I know that its getting late but I dont wa...
47707,9380845,uncontrollably.fond.s01.e07.heartache.(2016).e...,Timing and Subtitles by The Uncontrollable Lov...,[Timing and Subtitles by The Uncontrollable Lo...
29914,9301436,screen.two.s13.e04.the.precious.blood.(1996).e...,ethereal music apiOpenSubtitlesorg is deprecat...,[ethereal music apiOpenSubtitlesorg is depreca...
54266,9408707,battlebots.(2015).eng.1cd,Chris Oh no not the Minibots yelling Oh You le...,[Chris Oh no not the Minibots yelling Oh You l...


In [24]:
df.iloc[0,3]

['Watch any video online with OpenSUBTITLES Free Browser extension osdblinkext It couldve been just another summer But as I set foot on the sand that summer suddenly felt different Like it was going to be the summer that would change my life The summer of freedom The summer of endless possibilities The summer of Ooh aah Ooh oh Oh oh ooh That was the summer of you and me Youre quite the dancer Why did you stop Come on Keep dancing Whatever Im kidding Dont get mad Huh What Hey Im just going to get m',
 'g Dont get mad Huh What Hey Im just going to get my towel What Stop that You thought I was gonna kiss you No Excuse me I wanna kiss you but not just yet What do you mean not yet Only when youre my girl What do you mean your girl My girlfriend Miss As if You wish And dont call me miss Dont pretend to be a gentleman when youre clearly not So what should I call you Rude Snob Bitch And you Douche Handsome Conceited Just like you Huh Jerk Exactly your type Leave me alone Steph Aha Steph Ill ju

In [25]:
df.to_csv('search_engine.csv')

In [26]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
df['doc_vec_pretrained_bert'] = df.chunk_content.progress_apply(model.encode)

  1%|▎                                                         | 132/24749 [20:45<67:42:55,  9.90s/it]

In [None]:
df.head()

In [None]:
df['doc_vector_pretrained_bert'][0].shape

In [None]:
len(df['doc_vector_pretrained_bert'][0])

In [None]:
df.to_csv('se.csv')