Wikipedia Recommendation system

-----

Authors:
- Martyna Stasiak id.156071

The aim of this project is to generate the recommendations on wikipedia articles basing on the ones that user have liked. <br>
To do that we have used 10 000 initial articles that were obtained by web crawling, starting from the https://en.wikipedia.org/wiki/Machine_learning article; later they were saved in the csv file, so if there is a need the file working as our database might be changed.

------

Libraries that we have used and are necessary for this project:

In [32]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

import nltk
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, wordpunct_tokenize

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

import re
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cosine

from IPython.core.display import display, HTML

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mmart\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mmart\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mmart\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  from IPython.core.display import display, HTML


### Crawling and saving our articles

In this part we create the file that will work as or database containing all possible wikipedia articles. <br>
We perform the crawling by ...... <explain precisely> <br>


In [33]:
def crawlArticles(start_url, max_articles):
    visited = set()
    to_visit = [start_url]
    articles = []
    
    while to_visit and len(articles) < max_articles:
        page = to_visit.pop(0)
        if page in visited:
            continue
        visited.add(page)
        
        try:
            response = requests.get(page)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            
            title = soup.find('h1').text # article's title
            paragraphs = soup.find_all('p') # article's paragraphs
            content = ' '.join([p.text for p in paragraphs]) # article's content that is inside paragraphs
            articles.append({"title": title, "link": page, "content": content})
            
            # extracting and filtering new links
            for link in soup.find_all('a', href=True): # we look for all links in the page
                href = link['href']
                if href.startswith('/wiki/') and ':' not in href and '#' not in href and 'Main_Page' not in href:
                    full_url = "https://en.wikipedia.org" + href
                    if full_url not in visited:
                        to_visit.append(full_url)
            time.sleep(0.5) # be polite to Wikipedia
        except:
            pass
        
    return articles

In [34]:
articles = crawlArticles("https://en.wikipedia.org/wiki/Machine_learning", 10)
df = pd.DataFrame(articles)

In [35]:
def saveDatabase(df, fileName):
    df.to_csv(fileName, index=False)
    return None

In [36]:
saveDatabase(df, 'articles.csv')

In [37]:
df.head()

Unnamed: 0,title,link,content
0,Machine learning,https://en.wikipedia.org/wiki/Machine_learning,Machine learning (ML) is a field of study in a...
1,Machine Learning (journal),https://en.wikipedia.org/wiki/Machine_Learning...,Machine Learning is a peer-reviewed scientifi...
2,Statistical learning in language acquisition,https://en.wikipedia.org/wiki/Statistical_lear...,Statistical learning is the ability for humans...
3,Data mining,https://en.wikipedia.org/wiki/Data_mining,Data mining is the process of extracting and d...
4,Supervised learning,https://en.wikipedia.org/wiki/Supervised_learning,"In machine learning, supervised learning (SL) ..."


### Text preprocessing

Now when we have the database containing the articles we have to do the preprocessing; <br>
we have done: 
- lemmatization
- deleting the stopwords
- 

In [38]:
stopWords = set(stopwords.words('english'))
porter = PorterStemmer()
lancaster = LancasterStemmer()
lemmatizer = WordNetLemmatizer()

In [39]:
def preprocessArticles(df, tokenizer=word_tokenize, stemmer=None, lemmatizer=None, useLemmatizer=False):
    tokens = tokenizer(df['content'].lower())
    terms = [word for word in tokens if word.isalpha() and word not in stopWords] # remove stopwords and non-alphabetic words
    if stemmer:
        processed = [stemmer.stem(word) for word in terms]
    elif useLemmatizer and lemmatizer:
        processed = [lemmatizer.lemmatize(word) for word in terms]
    else:
        processed = terms
    return ' '.join(processed)
    

In [40]:
# Define preprocessing variations
variations_wordtokenizer = {
    "porter_stemmer": lambda row: preprocessArticles(row, tokenizer=word_tokenize, stemmer=porter),
    "lancaster_stemmer": lambda row: preprocessArticles(row, tokenizer=word_tokenize, stemmer=lancaster),
    "lemmatization": lambda row: preprocessArticles(row, tokenizer=word_tokenize, lemmatizer=lemmatizer, useLemmatizer=True)
}

# Apply variations without modifying the original DataFrame
results_wordtokenizer = pd.DataFrame({
    "title": df["title"],
    "original_content": df["content"]
})

for name, preprocess_function in variations_wordtokenizer.items():
    # Apply each variation to the content column using the original function
    results_wordtokenizer[name] = df.apply(preprocess_function, axis=1)


variations_wordpunct = {
    "porter_stemmer": lambda row: preprocessArticles(row, tokenizer=word_tokenize, stemmer=porter),
    "lancaster_stemmer": lambda row: preprocessArticles(row, tokenizer=word_tokenize, stemmer=lancaster),
    "lemmatization": lambda row: preprocessArticles(row, tokenizer=word_tokenize, lemmatizer=lemmatizer, useLemmatizer=True)
}

# Apply variations without modifying the original DataFrame
results_wordpunct = pd.DataFrame({
    "title": df["title"],
    "original_content": df["content"]
})

for name, preprocess_function in variations_wordpunct.items():
    # Apply each variation to the content column using the original function
    results_wordpunct[name] = df.apply(preprocess_function, axis=1)


In [41]:
print("Word Tokenizer")
columns_to_display = ["title", "original_content"] + list(variations_wordtokenizer.keys())
results_wordtokenizer[columns_to_display].head()

Word Tokenizer


Unnamed: 0,title,original_content,porter_stemmer,lancaster_stemmer,lemmatization
0,Machine learning,Machine learning (ML) is a field of study in a...,machin learn ml field studi artifici intellig ...,machin learn ml field study art intellig conce...,machine learning ml field study artificial int...
1,Machine Learning (journal),Machine Learning is a peer-reviewed scientifi...,machin learn scientif journal publish sinc for...,machin learn sci journ publ sint forty edit me...,machine learning scientific journal published ...
2,Statistical learning in language acquisition,Statistical learning is the ability for humans...,statist learn abil human anim extract statist ...,stat learn abl hum anim extract stat regul wor...,statistical learning ability human animal extr...
3,Data mining,Data mining is the process of extracting and d...,data mine process extract discov pattern larg ...,dat min process extract discov pattern larg da...,data mining process extracting discovering pat...
4,Supervised learning,"In machine learning, supervised learning (SL) ...",machin learn supervis learn sl paradigm model ...,machin learn superv learn sl paradigm model tr...,machine learning supervised learning sl paradi...


In [42]:
print("Word Punct Tokenizer")
columns_to_display = ["title", "original_content"] + list(variations_wordpunct.keys())
results_wordpunct[columns_to_display].head()

Word Punct Tokenizer


Unnamed: 0,title,original_content,porter_stemmer,lancaster_stemmer,lemmatization
0,Machine learning,Machine learning (ML) is a field of study in a...,machin learn ml field studi artifici intellig ...,machin learn ml field study art intellig conce...,machine learning ml field study artificial int...
1,Machine Learning (journal),Machine Learning is a peer-reviewed scientifi...,machin learn scientif journal publish sinc for...,machin learn sci journ publ sint forty edit me...,machine learning scientific journal published ...
2,Statistical learning in language acquisition,Statistical learning is the ability for humans...,statist learn abil human anim extract statist ...,stat learn abl hum anim extract stat regul wor...,statistical learning ability human animal extr...
3,Data mining,Data mining is the process of extracting and d...,data mine process extract discov pattern larg ...,dat min process extract discov pattern larg da...,data mining process extracting discovering pat...
4,Supervised learning,"In machine learning, supervised learning (SL) ...",machin learn supervis learn sl paradigm model ...,machin learn superv learn sl paradigm model tr...,machine learning supervised learning sl paradi...


In [43]:
df['processedContent'] = df.apply(lambda row: preprocessArticles(row, tokenizer=word_tokenize, stemmer=porter, 
                                                          lemmatizer=None, useLemmatizer=False), axis=1)
saveDatabase(df, 'processed_articles.csv')

columnstoUse = ['title', 'content','processedContent']
df[columnstoUse].head()

Unnamed: 0,title,content,processedContent
0,Machine learning,Machine learning (ML) is a field of study in a...,machin learn ml field studi artifici intellig ...
1,Machine Learning (journal),Machine Learning is a peer-reviewed scientifi...,machin learn scientif journal publish sinc for...
2,Statistical learning in language acquisition,Statistical learning is the ability for humans...,statist learn abil human anim extract statist ...
3,Data mining,Data mining is the process of extracting and d...,data mine process extract discov pattern larg ...
4,Supervised learning,"In machine learning, supervised learning (SL) ...",machin learn supervis learn sl paradigm model ...


### TF-IDF and Cosine Similarity

In [44]:
def tf_idf(df):
    tfidf = TfidfVectorizer(use_idf=True, smooth_idf=False)
    tfidf_matrix = tfidf.fit_transform(df['processedContent'])

    dfTFIDF = pd.DataFrame(tfidf_matrix.toarray(), index=df['title'], columns=tfidf.get_feature_names_out())
    return tfidf, dfTFIDF


In [45]:
tfidf, dfTFIDF = tf_idf(df)

dfTFIDF.head()

Unnamed: 0_level_0,aahc,abandon,abil,abl,abnorm,absenc,abstract,abus,academ,acceler,...,year,yet,yield,yim,yoshua,young,youtub,yu,zero,zip
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Machine learning,0.0,0.005643,0.007322,0.007322,0.007141,0.004766,0.004766,0.007141,0.004766,0.007141,...,0.004766,0.014297,0.009532,0.0,0.0,0.0,0.0,0.0,0.004766,0.014283
Machine Learning (journal),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.080419,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Statistical learning in language acquisition,0.0,0.0,0.049931,0.116505,0.0,0.0,0.016249,0.0,0.0,0.0,...,0.0,0.005416,0.0,0.008116,0.0,0.048697,0.0,0.024348,0.0,0.0
Data mining,0.013043,0.0,0.006687,0.013374,0.0,0.0,0.0,0.0,0.008704,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Supervised learning,0.0,0.0,0.0,0.051182,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
def recommendArticles(history, df, top_n=5):
    tfidf, dfTFIDF = tf_idf(df)
    df = df.set_index('title')
    
    historyContent = ' '.join(df.loc[history,'processedContent'])
    historyVector = tfidf.transform([historyContent]).toarray()[0]
    
    cosineDistance = dfTFIDF.apply(lambda row: cosine(row, historyVector), axis=1)
    similarityScores = 1 - cosineDistance
    
    recommendations = pd.DataFrame({
        'title': dfTFIDF.index, 
        'link': df.loc[dfTFIDF.index, 'link'],
        'similarity': similarityScores
    })
    
    # we exclude the articles that user have already seen from the recommendations
    recommendations = recommendations[~recommendations['title'].isin(history)]
    
    recommendations = recommendations.sort_values(by='similarity', ascending=False).reset_index(drop=True)
    
    recommendations = recommendations.head(top_n)

    
    recommendations = HTML(recommendations.to_html(render_links=True, escape=False))
    
    return recommendations

In [47]:
history = ['Machine learning', 'Supervised learning']
recommendations = recommendArticles(history, df, top_n=5)
recommendations

Unnamed: 0,title,link,similarity
0,Meta-learning (computer science),https://en.wikipedia.org/wiki/Meta-learning_(computer_science),0.574335
1,Weak supervision,https://en.wikipedia.org/wiki/Semi-supervised_learning,0.535394
2,Self-supervised learning,https://en.wikipedia.org/wiki/Self-supervised_learning,0.533141
3,Unsupervised learning,https://en.wikipedia.org/wiki/Unsupervised_learning,0.500827
4,Reinforcement learning,https://en.wikipedia.org/wiki/Reinforcement_learning,0.417073


----------------

### Adding articles in user history that are not in the database
Since the Wikipedia has more than 6 million articles in english we may encounter the situation that the user has seen the article that is not in our rather small database; <br>
Because of that this part is added to deal with such problems by adding to the database the title, link and content of unknown for us article.

In [None]:
# history = ['Machine learning', 'Aliens']
# recommendations = recommendArticles(history, df, top_n=5)
# recommendations

KeyError: "['Aliens'] not in index"

In [49]:
def fetchUnknownArticle(unknowTitle):
    base_url = "https://en.wikipedia.org/wiki/"
    url = base_url + unknowTitle.replace(' ', '_')
    
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        
        paragraphs = soup.find_all('p')
        content = ' '.join([p.text for p in paragraphs if p.text])
        
        #to preprocess the content we simulate dataframe row
        row = pd.Series({"content": content})
        
        processedArticle = preprocessArticles(row, tokenizer=word_tokenize, stemmer=porter, 
                                                          lemmatizer=None, useLemmatizer=False)
        
        
        newArticle = {"title": unknowTitle, "link": url, "content": content, 
                      "processedContent": processedArticle}
        return newArticle
    except:
        print(f"Failed to fetch article {unknowTitle}. Please check the title and try again.")

In [50]:
def additionalArticles(newArticle, df, maxExpansion=5):
    mainArticle = fetchUnknownArticle(newArticle)
    if mainArticle is None:
        return df
    
    newCrawledArticles = crawlArticles(mainArticle['link'], max_articles=maxExpansion)
    
    processedNewArticles=[]
    for article in newCrawledArticles:
        if article['title'] not in df['title'].values:
            row = pd.Series({"content": article['content']})
            article['processedContent'] = preprocessArticles(row, tokenizer=word_tokenize, stemmer=porter, 
                                                          lemmatizer=None, useLemmatizer=False)
            processedNewArticles.append(article)
            
    return processedNewArticles

In [51]:
def expandDatabase(history, df):
    unknownTitles = [title for title in history if title not in df['title'].values]
    allNewArticles = []
    
    for title in unknownTitles:
        articleData = fetchUnknownArticle(title)
        if articleData:
            allNewArticles.append(articleData)
            crawledArticles = additionalArticles(title, df)
            allNewArticles.extend(crawledArticles)
            
    if allNewArticles:
        additionaldf = pd.DataFrame(allNewArticles)
        df = pd.concat([df, additionaldf], ignore_index=True)
        df = df.drop_duplicates(subset='title')
        saveDatabase(df, 'articles.csv')
    return df
        

In [52]:
def recommendArticles(history, df, top_n=5):
    
    df = expandDatabase(history, df)
    
    tfidf, dfTFIDF = tf_idf(df)
    df = df.set_index('title')
    
    historyContent = ' '.join(df.loc[history,'processedContent'])
    historyVector = tfidf.transform([historyContent]).toarray()[0]
    
    cosineDistance = dfTFIDF.apply(lambda row: cosine(row, historyVector), axis=1)
    similarityScores = 1 - cosineDistance
    
    recommendations = pd.DataFrame({
        'title': dfTFIDF.index, 
        'link': df.loc[dfTFIDF.index, 'link'],
        'similarity': similarityScores
    })
    
    # we exclude the articles that user have already seen from the recommendations
    recommendations = recommendations[~recommendations['title'].isin(history)]
    
    recommendations = recommendations.sort_values(by='similarity', ascending=False).reset_index(drop=True)
    
    recommendations = recommendations.head(top_n)

    
    recommendations = HTML(recommendations.to_html(render_links=True, escape=False))
    
    return recommendations

In [56]:
history = ['Machine learning', 'Extraterrestrial life']
recommendations = recommendArticles(history, df, top_n=5)
recommendations

Unnamed: 0,title,link,similarity
0,Life,https://en.wikipedia.org/wiki/Life,0.455563
1,Meta-learning (computer science),https://en.wikipedia.org/wiki/Meta-learning_(computer_science),0.446897
2,Self-supervised learning,https://en.wikipedia.org/wiki/Self-supervised_learning,0.419621
3,Weak supervision,https://en.wikipedia.org/wiki/Semi-supervised_learning,0.411906
4,Abiogenesis,https://en.wikipedia.org/wiki/Abiogenesis,0.400445


(10, 4)