Code based on https://blog.devgenius.io/big-data-processing-with-hadoop-and-spark-in-python-on-colab-bff24d85782f

# Java Installation

In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
#install java
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [3]:
#create java home variable
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

# HADOOP

In [4]:
#download HADOOP
!wget https://archive.apache.org/dist/hadoop/common/hadoop-3.3.0/hadoop-3.3.0.tar.gz

--2024-03-21 01:52:42--  https://archive.apache.org/dist/hadoop/common/hadoop-3.3.0/hadoop-3.3.0.tar.gz
Resolving archive.apache.org (archive.apache.org)... 65.108.204.189, 2a01:4f9:1a:a084::2
Connecting to archive.apache.org (archive.apache.org)|65.108.204.189|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 500749234 (478M) [application/x-gzip]
Saving to: ‘hadoop-3.3.0.tar.gz’


2024-03-21 01:53:05 (21.8 MB/s) - ‘hadoop-3.3.0.tar.gz’ saved [500749234/500749234]



In [5]:
#decompress the Hadoop tar file
!tar -xzvf hadoop-3.3.0.tar.gz

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
hadoop-3.3.0/share/doc/hadoop/hadoop-project-dist/hadoop-common/build/source/hadoop-common-project/hadoop-common/target/api/org/apache/hadoop/fs/FSDataOutputStream.html
hadoop-3.3.0/share/doc/hadoop/hadoop-project-dist/hadoop-common/build/source/hadoop-common-project/hadoop-common/target/api/org/apache/hadoop/fs/TrashPolicyDefault.Emptier.html
hadoop-3.3.0/share/doc/hadoop/hadoop-project-dist/hadoop-common/build/source/hadoop-common-project/hadoop-common/target/api/org/apache/hadoop/fs/HarFileSystem.html
hadoop-3.3.0/share/doc/hadoop/hadoop-project-dist/hadoop-common/build/source/hadoop-common-project/hadoop-common/target/api/org/apache/hadoop/fs/PathExistsException.html
hadoop-3.3.0/share/doc/hadoop/hadoop-project-dist/hadoop-common/build/source/hadoop-common-project/hadoop-common/target/api/org/apache/hadoop/fs/XAttrSetFlag.html
hadoop-3.3.0/share/doc/hadoop/hadoop-project-dist/hadoop-common/build/source/hadoop-common-p

In [6]:
#copy Hadoop directory to user/local
!cp -r hadoop-3.3.0/ /usr/local/

In [7]:
#find the default Java path
!readlink -f /usr/bin/java | sed "s:bin/java::"

/usr/lib/jvm/java-11-openjdk-amd64/


In [8]:
#run Hadoop from /usr/local
!/usr/local/hadoop-3.3.0/bin/hadoop

Usage: hadoop [OPTIONS] SUBCOMMAND [SUBCOMMAND OPTIONS]
 or    hadoop [OPTIONS] CLASSNAME [CLASSNAME OPTIONS]
  where CLASSNAME is a user-provided Java class

  OPTIONS is none or any of:

buildpaths                       attempt to add class files from build tree
--config dir                     Hadoop config directory
--debug                          turn on shell script debug mode
--help                           usage information
hostnames list[,of,host,names]   hosts to use in slave mode
hosts filename                   list of hosts to use in slave mode
loglevel level                   set the log4j level for this command
workers                          turn on worker mode

  SUBCOMMAND is one of:


    Admin Commands:

daemonlog     get/set the log level for each daemon

    Client Commands:

archive       create a Hadoop archive
checknative   check native Hadoop and compression libraries availability
classpath     prints the class path needed to get the Hadoop jar and the requ

In [9]:
#create input folder (test example)
!mkdir ~/testin

In [10]:
#copy example files to the input folder
!cp /usr/local/hadoop-3.3.0/etc/hadoop/*.xml ~/testin

In [11]:
#check that files have been successfully copied (10 files should appear)
!ls ~/testin

capacity-scheduler.xml	hadoop-policy.xml  hdfs-site.xml    kms-acls.xml  mapred-site.xml
core-site.xml		hdfs-rbf-site.xml  httpfs-site.xml  kms-site.xml  yarn-site.xml


In [12]:
#run the mapreduce example (for sanity check)
!/usr/local/hadoop-3.3.0/bin/hadoop jar /usr/local/hadoop-3.3.0/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.3.0.jar grep ~/testin ~/testout 'allowed[.]*'

2024-03-21 01:53:45,143 INFO impl.MetricsConfig: Loaded properties from hadoop-metrics2.properties
2024-03-21 01:53:45,270 INFO impl.MetricsSystemImpl: Scheduled Metric snapshot period at 10 second(s).
2024-03-21 01:53:45,270 INFO impl.MetricsSystemImpl: JobTracker metrics system started
2024-03-21 01:53:45,516 INFO input.FileInputFormat: Total input files to process : 10
2024-03-21 01:53:45,575 INFO mapreduce.JobSubmitter: number of splits:10
2024-03-21 01:53:45,829 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_local173320543_0001
2024-03-21 01:53:45,829 INFO mapreduce.JobSubmitter: Executing with tokens: []
2024-03-21 01:53:46,042 INFO mapreduce.Job: The url to track the job: http://localhost:8080/
2024-03-21 01:53:46,043 INFO mapreduce.Job: Running job: job_local173320543_0001
2024-03-21 01:53:46,054 INFO mapred.LocalJobRunner: OutputCommitter set in config null
2024-03-21 01:53:46,067 INFO output.FileOutputCommitter: File Output Committer Algorithm version is 2
2024-0

In [13]:
#remove the testout content to reuse the folder for other excercises
!rm -r ~/testout

# Word Frequencies

In [14]:
#get the data in CSV files (preprocessed news data)
!wget https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/test.csv

--2024-03-21 01:53:50--  https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/test.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1857427 (1.8M) [text/plain]
Saving to: ‘test.csv’


2024-03-21 01:53:50 (29.6 MB/s) - ‘test.csv’ saved [1857427/1857427]



In [15]:
#get the data in CSV files (preprocessed training news data)
!wget https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/train.csv

--2024-03-21 01:53:51--  https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/train.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 29470338 (28M) [text/plain]
Saving to: ‘train.csv’


2024-03-21 01:53:51 (166 MB/s) - ‘train.csv’ saved [29470338/29470338]



# Crawl Wikipedia

In [16]:
import requests
from bs4 import BeautifulSoup
import re
import csv
import nltk
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer
from collections import deque

# Function to check if word contains only English letters
def is_english_word(word):
    return re.match("^[a-zA-Z]+$", word)

# Function to process text
def process_text(text):
    line = text.strip()
    line = re.sub(r'[^\w\s]', '', line)
    line = line.lower()
    for x in line:
        if x in punctuations:
            line = line.replace(x, "\t")
    words = line.split()
    return [lancaster.stem(word) for word in words if word not in stop_words and is_english_word(word)]

# Function to crawl Wikipedia
def crawl_wikipedia(seed_urls, max_pages, output_file, retrieved_urls):
    queue = deque(seed_urls)
    visited=set()
    # Open CSV file for writing
    with open(output_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)

        while queue and len(visited) < max_pages:
            if (len(visited) % 50 == 0):
                print(len(visited))
            url = queue.popleft()
            if url in visited or not is_wikipedia_url(url):
                continue

            response = requests.get(url)
            soup = BeautifulSoup(response.content, 'html.parser')
            words = process_text(soup.get_text())
            urlIdx = len(retrieved_urls)+1
            for word in words:
                writer.writerow([word, urlIdx])  # Writing data to CSV
            retrieved_urls.append(url)
            visited.add(url)

            # Extract and enqueue hyperlinks
            for link in soup.find_all('a', href=True):
                full_url = 'https://en.wikipedia.org' + link['href']
                if is_wikipedia_url(full_url) and full_url not in visited:
                    queue.append(full_url)

# Example usage
nltk.download('stopwords', quiet=True)
stop_words = set(stopwords.words('english'))
punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
lancaster = LancasterStemmer()
def is_wikipedia_url(url):
    return url.startswith('https://en.wikipedia.org/wiki/')
retrieved_urls = []
seed_urls = ['https://en.wikipedia.org/wiki/Main_Page']
crawl_wikipedia(seed_urls, max_pages=100, output_file='wikipedia_crawl.csv', retrieved_urls=retrieved_urls)

0
50


In [17]:
#For you to do: Upload the files (left menu) for mapper and reducer.
#Then run these two lines to change their modes to execution
!chmod u+rwx /content/mapper.py
!chmod u+rwx /content/reducer.py

In [18]:
#Run hadoop to execute the mapper and reducer using the text.csv file and
!/usr/local/hadoop-3.3.0/bin/hadoop jar /usr/local/hadoop-3.3.0/share/hadoop/tools/lib/hadoop-streaming-3.3.0.jar -input /content/train.csv -output ~/testout -file /content/mapper.py  -file /content/reducer.py  -mapper 'python mapper.py'  -reducer 'python reducer.py'

2024-03-21 01:55:08,789 WARN streaming.StreamJob: -file option is deprecated, please use generic option -files instead.
packageJobJar: [/content/mapper.py, /content/reducer.py] [] /tmp/streamjob7039751902503697169.jar tmpDir=null
2024-03-21 01:55:09,602 INFO impl.MetricsConfig: Loaded properties from hadoop-metrics2.properties
2024-03-21 01:55:09,717 INFO impl.MetricsSystemImpl: Scheduled Metric snapshot period at 10 second(s).
2024-03-21 01:55:09,717 INFO impl.MetricsSystemImpl: JobTracker metrics system started
2024-03-21 01:55:09,735 WARN impl.MetricsSystemImpl: JobTracker metrics system already initialized!
2024-03-21 01:55:09,934 INFO mapred.FileInputFormat: Total input files to process : 1
2024-03-21 01:55:09,964 INFO mapreduce.JobSubmitter: number of splits:1
2024-03-21 01:55:10,184 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_local1269713467_0001
2024-03-21 01:55:10,184 INFO mapreduce.JobSubmitter: Executing with tokens: []
2024-03-21 01:55:10,531 INFO mapred.Loc

## TF-IDF

In [19]:
import pandas as pd
import numpy as np
import scipy as sp
import re
import os
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import LancasterStemmer
from collections import Counter
from concurrent.futures import ThreadPoolExecutor, as_completed
import json

nltk.download('stopwords')
nltk.download('punkt')
lancaster = LancasterStemmer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [20]:
vocab_size = 200
# read, parse
df_vocab_raw = pd.read_csv('~/testout/part-00000', delimiter='\t', header=None)
df_vocab_raw.columns = ['word', 'posting']
# aggregate
df_vocab_raw.iloc[:,1] = df_vocab_raw.iloc[:,1].apply(json.loads)
df_vocab_raw['freq'] = df_vocab_raw['posting'].apply(lambda x: sum(x.values()))
# sort, trim
df_vocab = df_vocab_raw.sort_values(by='freq', ascending=False)
df_vocab = df_vocab.reset_index(drop=True)
df_vocab = df_vocab.iloc[:vocab_size]
df_vocab['rank'] = range(vocab_size)
df_vocab = df_vocab.set_index('word')
# idf
docs = set()
for row in df_vocab.loc[:,'posting']:
  docs.update(row.keys())
M = len(docs)
def cal_idf(posting):
  doc_freq = len(posting)
  return np.log((M+1) / doc_freq)
df_vocab['idf'] = df_vocab['posting'].apply(cal_idf)

In [21]:
df_vocab

Unnamed: 0_level_0,posting,freq,rank,idf
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
39s,"{'103572': 1, '49173': 2, '110895': 1, '58047'...",13574,0,2.124885
new,"{'67548': 1, '82786': 1, '29673': 1, '82194': ...",11821,1,2.203052
us,"{'61968': 1, '90683': 1, '108473': 1, '75034':...",10539,2,2.312900
reut,"{'72923': 1, '105999': 1, '72924': 1, '41489':...",10100,3,2.293386
ap,"{'50766': 1, '116782': 1, '28760': 1, '37608':...",8546,4,2.468028
...,...,...,...,...
los,"{'54139': 1, '104674': 1, '68277': 1, '46943':...",1086,195,4.558147
giv,"{'80365': 1, '40869': 1, '62460': 1, '53334': ...",1082,196,4.532710
ath,"{'3883': 2, '1549': 1, '8338': 1, '1563': 1, '...",1077,197,4.621160
mus,"{'89277': 1, '105532': 1, '13076': 1, '92033':...",1064,198,4.661601


In [22]:
stop_words = set(stopwords.words('english'))
def txt2Vec(text):
  text = text.strip()
  # Remove punctuation and numbers
  text = re.sub(r'[^\w\s]', '', text)
  text = text.lower()
  for word in text:
    if word in punctuations:
      text = text.replace(word, "\t")
  words = text.split()
  words = [lancaster.stem(word) for word in words]
  words_freq = Counter(words)
  # Create a bit vector for this document
  freq_vector = np.zeros(len(df_vocab), dtype=np.uint8)
  k = 100
  for word in words:
    if word in df_vocab.index:
      idx = df_vocab.loc[word, 'rank']
      freq_vector[idx] = words_freq[word]
  return freq_vector

In [23]:
# read
df_train = pd.read_csv('train.csv', header=None)
df_train.columns = ['label', 'title', 'txt']
null_row = pd.DataFrame({'label':[-1], 'title':['NULL_XYZ'], 'txt':['NULL_XYZ']}, index=[0])
df_train = pd.concat([null_row, df_train[:]])
df_train = df_train.reset_index(drop=True)

# count
num_docs = len(df_train)
output = np.zeros((num_docs, vocab_size))
for word, row in df_vocab.iterrows():
  idx = row['rank']
  for doc, count in row['posting'].items():
    docid = int(doc)
    count = int(count)
    output[docid,idx] += count
df_train['vec'] = list(output)

In [24]:
df_train

Unnamed: 0,label,title,txt,vec
0,-1,NULL_XYZ,NULL_XYZ,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli...","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...
119996,1,Pakistan's Musharraf Says Won't Quit as Army C...,KARACHI (Reuters) - Pakistani President Perve...,"[0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
119997,2,Renteria signing a top-shelf deal,Red Sox general manager Theo Epstein acknowled...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
119998,2,Saban not going to Dolphins yet,The Miami Dolphins will put their courtship of...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
119999,2,Today's NFL games,PITTSBURGH at NY GIANTS Time: 1:30 p.m. Line: ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


# Vector Space Model

In [25]:
queries = ['olympic gold athens', 'reuters stocks friday', 'investment market prices']
def Query_VSM(q_vec, d_vecs, df, n):
  similarity = np.dot(d_vecs, q_vec)
  if n > 0:
    n_indices = np.argsort(similarity)[-n:][::-1]
  else:
    n_indices = np.argsort(similarity)[:-n][::-1]
  for rank, i in enumerate(n_indices):
    print(f'Rank{rank} Score{similarity[i]}')
    print(f'{df.iloc[i, 2]}')
    print()
  print()
  return n_indices

collection = np.vstack(df_train['vec'])
# length normalization
b = 0
doc_len = np.sum(collection, axis=1)
AVG_LEN = np.mean(doc_len)
normalizer = 1 - b + b * doc_len / AVG_LEN
normalizer = normalizer.reshape(-1,1)
#idf
idf = np.array(df_vocab['idf'])
idf = idf.reshape(1,-1)
# bm25
k = 1.5
docs_bm25 = (k + 1) * collection / (collection + k * normalizer) * idf


for q_txt in queries:
  q_vec = txt2Vec(q_txt)
  print(f'Query: {q_txt}')
  Query_VSM(q_vec, docs_bm25, df_train, 10)

Query: olympic gold athens
Rank0 Score12.8950279131081
ATHENS (Reuters) - Greek Olympic 200 metres champion Costas Kenteris and his fellow Greek Olympic silver medallist Katerina Thanou say they are withdrawing from the Athens Games. 

Rank1 Score12.8950279131081
ATHENS  Four-time judo world champion Noriko Anno broke her Olympic jinx by winning gold in the women #39;s 78-kilogram event Thursday at the Athens Games but defending Olympic champion Kosei Inoue was dealt shock defeats to miss out on a medal in the ...

Rank2 Score12.8950279131081
ATHENS - Jamaican Veronica Campbell became the first Jamaican and Caribbean woman ever to win an Olympic sprint gold when she captured the 200 metres at the 2004 Athens Olympics yesterday.

Rank3 Score11.00701649265441
ATHENS : New Zealand #39;s Sarah Ulmer smashed her second world record in as many days on the way to claiming the 3000-metre individual pursuit cycling gold medal at the Athens Olympics.

Rank4 Score11.00701649265441
 ATHENS (Reuter

# Probabilistic Language Model

In [26]:
collection = np.vstack(df_train['vec'], dtype=np.float64)
C = np.sum(collection)
lamb = 0.1
bg = np.array(df_vocab['freq']).reshape(1,-1) / C

def Query_LM(q_vec, d_vecs, df, n):
  d_length = np.sum(d_vecs, axis = 1).reshape(-1,1)
  d_normalized = np.divide(d_vecs, d_length, out=np.zeros_like(d_vecs), where=d_length!=0)
  d_normalized /= bg
  d_lm = np.log(1 + (1-lamb)/lamb * d_normalized)
  likelihood = np.dot(d_lm, q_vec)
  if n > 0:
    n_indices = np.argsort(likelihood)[-n:][::-1]
  else:
    n_indices = np.argsort(likelihood)[:-n][::-1]
  for rank, i in enumerate(n_indices):
    print(f'Rank{rank} Score{likelihood[i]}')
    print(f'{df.iloc[i, 2]}')
    print()
  print()
  return n_indices

for q_txt in queries:
  q_vec = txt2Vec(q_txt)
  print(f'Query: {q_txt}')
  Query_LM(q_vec, collection, df_train, 10)

Query: olympic gold athens
Rank0 Score14.831376291949706
There will be many lingering memories from these Athens Olympics, not least of which will be every detail of the in-venue catering, a tawdry little collection of the stewed, the strange and 

Rank1 Score14.831376291949706
AUSTRALIANS went into a television-buying frenzy the run-up to the Athens Olympics, suggesting that as a nation we could easily have scored a gold medal for TV purchasing.

Rank2 Score14.831376291949706
ATHENS  As the delay-plagued build-up to the Olympics illustrated, the Greeks aren #39;t exactly known for organization and punctuality. But now that stadiums, new metro lines, beautified city squares and a vast ...

Rank3 Score14.831376291949706
Costs for the Athens Olympics are climbing again, expected to top \$8.5 billion US because of the massive security and overruns in the last-minute scramble to get venues ready, a government official said yesterday. 

Rank4 Score14.831376291949706
Waking up on the morning

# Cross Validation

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from tqdm import tqdm

## Approach: Use documents as queries, get the accuracy of retrieving themselves as the highest rank. Idea: A document should have the highest relevance against itself. Procedure: For each document, get the highest relevant document, check if is itself, accumulate accuracy/

In [28]:
collection = np.vstack(df_train['vec'])
doc_len = np.sum(collection, axis=1)
AVG_LEN = np.mean(doc_len)

def chunk_to_sparse(chunk_size, X):
  X = X.reshape(-1, vocab_size)
  num_chunks = np.ceil(X.shape[0] / chunk_size).astype(int)
  X_chunks = [X[i * chunk_size:(i+1)*chunk_size, :] for i in range(num_chunks)]
  X_sparse = [sp.sparse.csr_matrix(X_chunk) for X_chunk in X_chunks]
  return X_sparse

class BM25VSM(BaseEstimator):
  def __init__(self, D, y, k=1.5):
    self.k = k
    self.b = 0.75
    self.chunk_size = 4096
    self.D = D
    doc_len = np.sum(self.D, axis=1)
    doc_len = doc_len.reshape(-1, 1)
    self.normalizer = 1 - self.b + self.b * doc_len / AVG_LEN
    self.normalizer = self.normalizer.reshape(-1, 1)
    self.idf = np.array(df_vocab['idf']).reshape(1, vocab_size)
    #X_transformed = (self.k + 1) * X / (X + self.k * normalizer) * idf
    self.y = y
  def fit(self, X, y):
    self.D = self.D.reshape(-1,vocab_size)
    D_bm25 = (self.k + 1) * self.D / (self.D + self.k * self.normalizer) * self.idf
    self.D_chunks = chunk_to_sparse(self.chunk_size, D_bm25)
    return self
  def predict(self, Q):
    Q = Q.reshape(-1, vocab_size)
    Q_chunks = chunk_to_sparse(self.chunk_size, Q)
    y_h = []
    for Q_chunk in tqdm(Q_chunks):
      chunk_result = np.zeros((self.chunk_size, len(self.D_chunks) * self.chunk_size))
      for i,D_chunk in enumerate(self.D_chunks):
          chunk_result[:Q_chunk.shape[0],i*self.chunk_size:i*self.chunk_size+D_chunk.shape[0]] += Q_chunk.dot(D_chunk.T).toarray()
      y_h.append(np.argmax(chunk_result, axis=1))
    y_h = np.hstack(y_h)
    y_h = y_h[:Q.shape[0]]
    return y_h
  def score(self, X, y):
    y_h = self.predict(X)
    y_diff = np.count_nonzero(y_h - y)
    return 1-y_diff / y.shape[0]
X = collection
y = np.arange(X.shape[0])
vsm = BM25VSM(X, y)

# Result, metric is accuracy

In [29]:
X = collection
y = np.arange(X.shape[0])
vsm = BM25VSM(X, y, 1.0)
vsm = vsm.fit(X, y)
#scores = cross_val_score(vsm, X, y, cv=5)
#print(scores)
vsm.score(X, y)

100%|██████████| 30/30 [07:02<00:00, 14.09s/it]


0.5737202189981749

In [30]:
param_grid = {'k': np.linspace(0.5,1.5,5)}

# Create the GridSearchCV object
grid_search = GridSearchCV(vsm, param_grid, cv=5)

# Fit the grid search to the data
grid_search.fit(X, y)

100%|██████████| 6/6 [01:18<00:00, 13.13s/it]
100%|██████████| 6/6 [01:19<00:00, 13.27s/it]
100%|██████████| 6/6 [01:19<00:00, 13.18s/it]
100%|██████████| 6/6 [01:17<00:00, 12.99s/it]
100%|██████████| 6/6 [01:19<00:00, 13.29s/it]
100%|██████████| 6/6 [01:18<00:00, 13.09s/it]
100%|██████████| 6/6 [01:27<00:00, 14.59s/it]
100%|██████████| 6/6 [01:18<00:00, 13.09s/it]
100%|██████████| 6/6 [01:18<00:00, 13.11s/it]
100%|██████████| 6/6 [01:16<00:00, 12.75s/it]
100%|██████████| 6/6 [01:19<00:00, 13.24s/it]
100%|██████████| 6/6 [01:16<00:00, 12.80s/it]
100%|██████████| 6/6 [01:18<00:00, 13.03s/it]
100%|██████████| 6/6 [01:28<00:00, 14.82s/it]
100%|██████████| 6/6 [01:19<00:00, 13.26s/it]
100%|██████████| 6/6 [01:17<00:00, 12.90s/it]
100%|██████████| 6/6 [01:18<00:00, 13.01s/it]
100%|██████████| 6/6 [01:20<00:00, 13.34s/it]
100%|██████████| 6/6 [01:19<00:00, 13.25s/it]
100%|██████████| 6/6 [01:18<00:00, 13.02s/it]
100%|██████████| 6/6 [01:21<00:00, 13.67s/it]
100%|██████████| 6/6 [01:30<00:00,

In [32]:
grid_search.best_params_

{'k': 0.5}