In [14]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import pandas as pd

In [41]:
data[data['name']== "Barack Obama"]

Unnamed: 0,URI,name,text,cleaned_text
35811,<http://dbpedia.org/resource/Barack_Obama>,Barack Obama,barack hussein obama ii brk husen bm born augu...,barack hussein obama ii brk husen bm born augu...


In [10]:
# Download necessary NLTK resources

nltk.download('punkt')        # For tokenizers
nltk.download('stopwords')    # For stopwords
nltk.download('wordnet')      # For lemmatization
nltk.download('averaged_perceptron_tagger')  # For part-of-speech tagging

nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/maryamsadeghi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/maryamsadeghi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/maryamsadeghi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/maryamsadeghi/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/maryamsadeghi/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

### 1. Preprocessing the Data 

In [11]:
# Preprocess function
def preprocess_text(text):
    # Tokenization
    tokens = nltk.word_tokenize(text.lower())
    
    # Remove stopwords and punctuation
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words and word not in string.punctuation]
    
    # Stemming
    ps = PorterStemmer()
    tokens = [ps.stem(word) for word in tokens]
    

    return " ".join(tokens)
    

In [12]:
sample_text = "This is a test function for preprocessing."
try:
    result = preprocess_text(sample_text)
    print("Preprocessed text:", result)
  
except Exception as e:
    print("Error in preprocess_text:", str(e))

Preprocessed text: test function preprocess


In [15]:
# Load data
data = pd.read_csv("people_wiki.csv")  # Adjust to your data file
# Fill missing values with an empty string
data['text'] = data['text'].fillna('')

In [16]:


# Apply preprocessing
data['cleaned_text'] = data['text'].apply(preprocess_text)

# Save cleaned data
data.to_csv("cleaned_wikipedia_data.csv", index=False)

In [18]:
sample_text = "This is a test sentencee for preprocessing."
try:
    result = preprocess_text(sample_text)
    print("Preprocessed text:", result)
except Exception as e:
    print("Error in preprocess_text:", str(e))

Preprocessed text: test sentence preprocess


## 2. Training the Model 
 In this step, we vectorize the preprocessed text using TF-IDF(numerical representations). and then train a nearest-neighbor model to perform document retrieval.

In [23]:
!pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.3.2-cp38-cp38-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting scipy>=1.5.0 (from scikit-learn)
  Downloading scipy-1.10.1-cp38-cp38-macosx_12_0_arm64.whl.metadata (53 kB)
Collecting threadpoolctl>=2.0.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.3.2-cp38-cp38-macosx_12_0_arm64.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading scipy-1.10.1-cp38-cp38-macosx_12_0_arm64.whl (28.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m28.8/28.8 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scipy, scikit-learn
Successfully installed scikit-learn-1.3.2 scipy-1.10.1 threadpoolctl-3.5.0


In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import pandas as pd
import pickle
import sys

In [25]:
# Load cleaned data
data = pd.read_csv("cleaned_wikipedia_data.csv")
# TF-IDF Vectorization
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(data['cleaned_text'])

In [26]:
# Save the TF-IDF vectorizer
with open("tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(tfidf, f)

In [27]:
# Train Nearest Neighbor Model
model = NearestNeighbors(n_neighbors=5, metric='cosine')
model.fit(tfidf_matrix)
#n_neighbors=5: We specify that we want to retrieve the 5 most similar documents for any given query.
#metric='cosine': We use cosine similarity as the distance metric for measuring the similarity between documents.

In [28]:
# Save the model
with open("nearest_neighbor_model.pkl", "wb") as f:
    pickle.dump(model, f)

## 3. Retrieving Documents 
This step allows us to retrieve the most similar documents based on a user’s query.

In [30]:
# Load the trained model and TF-IDF vectorizer
with open("nearest_neighbor_model.pkl", "rb") as f:
    model = pickle.load(f)

with open("tfidf_vectorizer.pkl", "rb") as f:
    tfidf = pickle.load(f)

In [33]:
# Retrieve similar documents
def retrieve_similar_documents(query):
    # Preprocess query text (use the same preprocessing function as before)
    query_tfidf = tfidf.transform([query])
    
    # Get similar documents
    distances, indices = model.kneighbors(query_tfidf)
    
    return indices

In [34]:
# Get query from command line
query = sys.argv[1]
indices = retrieve_similar_documents(query)

In [36]:
data[data['name']]

Unnamed: 0,URI,name,text,cleaned_text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...,digbi morrel born 10 octob 1979 former austral...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...,alfr j lewi aka sandi lewi graduat univers chi...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...,harpdog brown singer harmonica player activ ca...
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...,franz rottenstein born waidmannsfeld lower aus...
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...,henri krvit born 30 decemb 1974 tallinn better...
...,...,...,...,...
42781,<http://dbpedia.org/resource/Motoaki_Takenouchi>,Motoaki Takenouchi,motoaki takenouchi born july 8 1967 saitama pr...,motoaki takenouchi born juli 8 1967 saitama pr...
42782,<http://dbpedia.org/resource/Alan_Judge_(footb...,"Alan Judge (footballer, born 1960)",alan graham judge born 14 may 1960 is a retire...,alan graham judg born 14 may 1960 retir profes...
42783,<http://dbpedia.org/resource/Eduardo_Lara>,Eduardo Lara,eduardo lara lozano born 4 september 1959 in c...,eduardo lara lozano born 4 septemb 1959 cali c...
42784,<http://dbpedia.org/resource/Tatiana_Faberg%C3...,Tatiana Faberg%C3%A9,tatiana faberg is an author and faberg scholar...,tatiana faberg author faberg scholar switzerla...


In [40]:
# Load the original data to display document name or content
data = pd.read_csv("cleaned_wikipedia_data.csv")

# Print the results
print(f"Top 5 documents similar to the query '{query}':")
for idx in indices[0]:
    print(f"name: {data.iloc[idx]['name']}")

Top 5 documents similar to the query '--f=/Users/maryamsadeghi/Library/Jupyter/runtime/kernel-v3e362612b1c7d06b8ff88d1113df22d7896e6557c.json':
name: Andrew Morton (computer programmer)
name: Stephen Tweedie
name: Peter Reardon
name: George H. Goble
name: Jake Woods


# Apply nearest neighbors for retrieval of Wikipedia articles

### build the Knn model

In [63]:
knn_model=turicreate.nearest_neighbors.create(people,features=['tfidf'], label='name')

### Use model for retrieval... for example, who is closest to Obama?

In [64]:
knn_model.query(obama)

query_label,reference_label,distance,rank
0,Barack Obama,0.0,1
0,Joe Biden,0.7941176470588236,2
0,Joe Lieberman,0.7946859903381642,3
0,Kelly Ayotte,0.8119891008174387,4
0,Bill Clinton,0.8138528138528138,5


### Who is closest to David Beckham?

In [65]:
knn_model.query(beckham)

query_label,reference_label,distance,rank
0,David Beckham,0.0,1
0,Steven Gerrard,0.7416107382550335,2
0,Didier Drogba,0.7477477477477478,3
0,Gordon Strachan,0.75,4
0,Wayne Rooney,0.7523510971786834,5


### other examples

In [67]:
swift= people[people['name']=='Taylor Swift']
swift

URI,name,text,word_count
<http://dbpedia.org/resou rce/Taylor_Swift> ...,Taylor Swift,taylor alison swift born december 13 1989 is an ...,"{'antidiscrimination': 1.0, 'literacy': 1.0, ..."

tfidf
"{'antidiscrimination': 8.042056410058754, ..."


In [96]:
# Who is closest to Taylor Swift
knn_model.query(swift)

query_label,reference_label,distance,rank
0,Taylor Swift,0.0,1
0,Carrie Underwood,0.7623188405797101,2
0,Alicia Keys,0.7647058823529411,3
0,Jordin Sparks,0.7696335078534031,4
0,Leona Lewis,0.7761194029850746,5


In [97]:
# who is closet to Angelina Jolie
jolie = people[people['name']=="Angelina Jolie"]

In [72]:
knn_model.query(jolie)

query_label,reference_label,distance,rank
0,Angelina Jolie,0.0,1
0,Brad Pitt,0.7840236686390533,2
0,Julianne Moore,0.7958579881656804,3
0,Billy Bob Thornton,0.80306905370844,4
0,George Clooney,0.8046875,5


In [73]:
#Who is closest to Arnold Schwarzenegger

arnold=people[people['name']=='Arnold Schwarzenegger']

In [74]:
knn_model.query(arnold)

query_label,reference_label,distance,rank
0,Arnold Schwarzenegger,0.0,1
0,Jesse Ventura,0.8189189189189189,2
0,John Kitzhaber,0.8246153846153846,3
0,Lincoln Chafee,0.8338762214983714,4
0,Anthony Foxx,0.8339100346020761,5


### Compare top words according to word counts to TF-IDF for 'Elton John'¶


In [99]:
# top five word according to word counts for 'Elton John'
EltonJohn_word_count_table = elton[['word_count']].stack('word_count', new_column_name= ['word', 'count'])
EltonJohn_word_count_table.sort('count', ascending= False).head(5)

word,count
the,27.0
in,18.0
and,15.0
of,13.0
a,10.0


In [100]:
# top five word according to tfidf for 'Elton John'
EltonJohn_tfidf_table = elton[['tfidf']].stack('tfidf', new_column_name= ['word', 'tfidf'])
EltonJohn_tfidf_table.sort('tfidf', ascending= False).head(5)

word,tfidf
furnish,18.38947183999428
elton,17.482320270031995
billboard,17.30368095754203
john,13.93931279239831
songwriters,11.25040644703154


# Manually evaluate the distances and Comparing for similarties

In [101]:
victoria = people[people['name'] == 'Victoria Beckham']
paul = people[people['name'] == 'Paul McCartney']

In [103]:
# Cosine distance between 'Elton John' and 'Victoria Beckham'
turicreate.distances.cosine(elton['tfidf'][0], victoria['tfidf'][0])

0.9567006376655429

'Elton John' is much closer to 'Paul McCartney' than he is to 'Victoria Beckham'.
It makes sence, because 'Elton John' and 'Paul McCartney' both have similarities like they bothare singer, songwriter and composer where as 'Elton John' and 'Victoria Beckham' both have just one similarity i.e. they both are singers.