In [1]:
import requests
from bs4 import BeautifulSoup

## Get Orgs

In [2]:
links_page = requests.get("https://en.wikipedia.org/wiki/List_of_environmental_organizations")
page_soup = BeautifulSoup(links_page.text)
link_anchors = page_soup.find("div", id="mw-content-text", _class="").find_all("li")
links = [li.find('a')["href"] for li in link_anchors if li.find('a') is not None and "href" in li.find('a').attrs.keys()] 

In [3]:
def remove_after(main_string, sub_string):
    """
    Removes everything in the main_string after the first occurrence of sub_string.

    Parameters:
    main_string (str): The string to process.
    sub_string (str): The substring after which everything will be removed.

    Returns:
    str: The processed string.
    """
    index = main_string.find(sub_string)
    if index != -1:
        return main_string[:index + len(sub_string)]
    return main_string

In [4]:
corpus = []
for link in links:
	if "https:" in link or "File:" in link or "Category:" in link:
		continue
	res = requests.get(f"https://en.wikipedia.org{link}")
	soup = BeautifulSoup(res.text)
	text = soup.find("div", id="mw-content-text").text
	if not text:
		continue
	text = remove_after(text, "References[edit]")
	corpus.append(text)


## Get People

In [5]:
res = requests.get("https://en.wikipedia.org/wiki/Category:American_environmentalists")
american_soup = BeautifulSoup(res.text)
links = american_soup.find("div", id="mw-pages").find_all("a")
links = [link["href"] for link in links]
links

['/wiki/Wikipedia:FAQ/Categorization#Why_might_a_category_list_not_be_up_to_date?',
 '/w/index.php?title=Category:American_environmentalists&pagefrom=Dubois%2C+Mark%0AMark+Dubois#mw-pages',
 '/wiki/Edward_Abbey',
 '/wiki/Wendy_Abrams',
 '/wiki/Sarkis_Acopian',
 '/wiki/Ansel_Adams',
 '/wiki/John_H._Adams_(environmentalist)',
 '/wiki/Gerald_W._Adelmann',
 '/wiki/John_Africa',
 '/wiki/Saul_Alinsky',
 '/wiki/Michael_Allen_(California_politician)',
 '/wiki/Robert_Porter_Allen',
 '/wiki/Susana_Almanza',
 '/wiki/Annie_Aghnaqa_(Akeya)_Alowa',
 '/wiki/Lisa_Alvarez-Cohen',
 '/wiki/Elmer_L._Andersen',
 '/wiki/Gillian_Anderson',
 '/wiki/Harold_C._Anderson',
 '/wiki/Marisa_Anderson',
 '/wiki/Terry_L._Anderson',
 '/wiki/Mary_Appelhof',
 '/wiki/John_S._Apperson',
 '/wiki/David_Archambault_II',
 '/wiki/Roma_Armbrust',
 '/wiki/Bob_Armstrong_(politician)',
 '/wiki/Stanley_Aronowitz',
 '/wiki/Tre_Arrow',
 '/wiki/Shahzeen_Attari',
 '/wiki/Jonathan_Aurthur',
 '/wiki/Damali_ayo',
 '/wiki/Ed_Ayres_(environme

In [6]:
people = []
for link in links:
	if "wiki" not in link or "FAQ" in link or "File:" in link or "Category:" in link:
		continue
	res = requests.get(f"https://en.wikipedia.org{link}")
	soup = BeautifulSoup(res.text)
	bio = soup.find("div", id="mw-content-text").text
	bio = remove_after(bio, "References[edit]")
	people.append(bio)

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

vectorizer = TfidfVectorizer(stop_words="english")
docs_tfidf = vectorizer.fit_transform(corpus)

def get_tf_idf_query_similarity(vectorizer, docs_tfidf, query):
    """
    vectorizer: TfIdfVectorizer model
    docs_tfidf: tfidf vectors for all docs
    query: query doc

    return: cosine similarity between query and all docs
    """
    query_tfidf = vectorizer.transform([query])
    cosineSimilarities = cosine_similarity(query_tfidf, docs_tfidf).flatten()
    return cosineSimilarities

In [8]:
import numpy as np

In [9]:
from scipy.sparse import csr_matrix
import pickle

In [10]:
def compress_array(arr):
    """
    Compress a NumPy array that mostly contains zeros using a sparse matrix representation.
    
    Parameters:
        arr (np.ndarray): The input NumPy array.
    
    Returns:
        bytes: The serialized sparse matrix.
    """
    # Convert the dense array to a CSR (Compressed Sparse Row) matrix
    sparse_matrix = csr_matrix(arr)
    
    # Serialize the sparse matrix using pickle
    compressed_data = pickle.dumps(sparse_matrix)
    
    return compressed_data
comp = compress_array(docs_tfidf[0].toarray())
print(len(comp))

3631


In [11]:
max([len(compress_array(a.toarray())) for a in docs_tfidf])

82615

In [12]:
max([len(a) for a in corpus])

207175

In [13]:
def decompress_array(compressed_data):
    """
    Decompress a serialized sparse matrix back to a dense NumPy array.
    
    Parameters:
        compressed_data (bytes): The serialized sparse matrix.
    
    Returns:
        np.ndarray: The decompressed dense NumPy array.
    """
    # Deserialize the sparse matrix using pickle
    sparse_matrix = pickle.loads(compressed_data)
    
    # Convert the sparse matrix back to a dense NumPy array
    dense_array = sparse_matrix.toarray()
    
    return dense_array
decompress_array(comp).shape

(1, 55342)

In [18]:
query = "Hi there! I'm [Your Name], a passionate advocate for global peace and the complete abolition of nuclear weapons. With a background in international relations and a deep commitment to humanitarian values, I have dedicated my career to raising awareness about the catastrophic consequences of nuclear warfare and advocating for a nuclear-free world."
similarity = get_tf_idf_query_similarity(vectorizer, docs_tfidf, query)
index = np.where(similarity == max(similarity))
print(max(similarity))
print(query)
print(corpus[index[0][0]])

0.36889810501472864
Hi there! I'm [Your Name], a passionate advocate for global peace and the complete abolition of nuclear weapons. With a background in international relations and a deep commitment to humanitarian values, I have dedicated my career to raising awareness about the catastrophic consequences of nuclear warfare and advocating for a nuclear-free world.
Social movement


169,000 people attended an anti-nuclear protest in Bonn, West Germany, on 14 October 1979, following the Three Mile Island accident.[1]
Anti-nuclear demonstration in Colmar, north-eastern France, on 3 October 2009
Anti-Nuclear Power Plant Rally following the Fukushima Daiichi nuclear disaster on 19 September 2011 at Meiji Shrine complex in Tokyo, Japan
Anti-nuclear movement
By country

Australia
Austria
Canada
France
Germany
India
Ireland
Japan
Kazakhstan
New Zealand
Philippines
Poland
Russia
South Africa
South Korea
Spain
Sweden
Switzerland
Taiwan
Turkey
United Kingdom
United States
Protests


Lists

Anti-

In [15]:
type(docs_tfidf)

scipy.sparse._csr.csr_matrix

In [17]:
docs_tfidf.shape

(738, 55342)

In [19]:
from scipy.sparse import hstack

In [36]:
def train(corpus: list[str]) -> tuple[csr_matrix, csr_matrix]:
    """
    Fits the vectorizer on the matrix, returning both the vectorizer and tf-idf

    :param corpus: A list of the documents (orgs) to fit against
    :returns: The fitted vectorizer and csr matrix
    """
    vectorizer = TfidfVectorizer(stop_words="english")
    tfidf = vectorizer.fit_transform(corpus)
    features = vectorizer.get_feature_names_out()
    return tfidf, features

d, f = train(corpus)


In [37]:
f.shape

(55342,)

In [31]:
hstack(d, f)

ValueError: blocks must be 2-D

In [38]:
d.shape

(738, 55342)

In [42]:
trans = TfidfVectorizer(stop_words="english").transform([query])
similarity = cosine_similarity(trans, docs_tfidf).flatten()

index = np.where(similarity == max(similarity))
print(max(similarity))
print(query)
print(corpus[index[0][0]])

TypeError: TfidfVectorizer.transform() takes 2 positional arguments but 3 were given

In [45]:
new_vec = TfidfVectorizer(stop_words="english", vocabulary=f)

In [47]:
new_vec.idf_ = vectorizer.idf_

In [48]:
new_vec.idf_

array([4.96624059, 1.82455441, 6.91215074, ..., 6.91215074, 6.91215074,
       6.50668563])

In [53]:
f

array(['00', '000', '0000', ..., '馬傑偉', '高等法院', '한국어'], dtype=object)

In [51]:
d.shape

(738, 55342)