In [86]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
import re
import math
from collections import Counter
import nltk


In [87]:
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [88]:
!pip install faker



In [89]:

# Preprocessing function to clean and normalize text
def preprocess_and_normalize(text):
    # Remove HTML tags and CSS styles
    soup = BeautifulSoup(text, 'html.parser')
    clean_content = soup.get_text()
    clean_content = re.sub(r'<style.*?</style>', '', clean_content, flags=re.DOTALL)
    clean_content = re.sub(r'<link.*?>', '', clean_content, flags=re.DOTALL)

    # Convert text to lowercase
    text = clean_content.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenize the text into words
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    # Lemmatize words
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    # Return preprocessed text as a single string
    return ' '.join(lemmatized_tokens)


In [102]:
from faker import Faker

fake = Faker()

def generate_player_document(player_name, custom_word_list):
    document = f"Player Profile: {player_name}\n\n"
    document += f"Name: {player_name}\n"
    document += f"Nationality: {'Brazilian' if player_name == 'Neymar' else 'Argentinian' if player_name == 'Messi' else 'Portuguese'}\n"
    document += f"Date of Birth: {fake.date_of_birth(minimum_age=18, maximum_age=40).strftime('%Y-%m-%d')}\n"
    document += f"Place of Birth: {'Mogi das Cruzes, São Paulo, Brazil' if player_name == 'Neymar' else 'Rosario, Santa Fe, Argentina' if player_name == 'Messi' else 'Funchal, Madeira, Portugal'}\n"
    document += f"Height: {fake.random_int(min=170, max=180)} cm\n"
    document += f"Weight: {fake.random_int(min=60, max=75)} kg\n"
    document += f"Current Club: {'Paris Saint-Germain (PSG)' if player_name == 'Neymar' or player_name == 'Messi' else 'Manchester United'}\n"
    document += f"Position: Forward\n"
    document += f"Preferred Foot: {'Left' if fake.boolean() else 'Right'}\n"
    document += f"Market Value: {'€128 million' if player_name == 'Neymar' else '€100 million' if player_name == 'Messi' else '€110 million'}\n"
    document += "Biography:\n"
    document += fake.paragraph(nb_sentences=60, variable_nb_sentences=True, ext_word_list=custom_word_list)
    return document


In [103]:
players = ["Neymar", "Messi", "Ronaldo"]
documents = []
#Custom word lists for each player
neymar_word_list = ["Neymar", "Brazil","sport", "football", "soccer", "PSG", "Barcelona", "goal","player","hellal"]
messi_word_list = ["Messi", "Argentina", "football", "soccer", "Barcelona","sport", "goal","good","best","hero"]
ronaldo_word_list = ["Ronaldo", "Portugal", "football", "soccer", "Manchester United","sport", "Real Madrid", "goal"]

# Generate documents for players
for player in players:
    if player=='Neymar':
      document = generate_player_document(player,neymar_word_list)
      documents.append(document)
    elif player=='Messi':
      document = generate_player_document(player,messi_word_list)
      documents.append(document)
    else :
      document = generate_player_document(player,ronaldo_word_list)
      documents.append(document)


In [104]:
# Print all documents
for i, document in enumerate(documents, 1):
    print(f"Document {i}:\n{document}\n{'='*50}\n")

Document 1:
Player Profile: Neymar

Name: Neymar
Nationality: Brazilian
Date of Birth: 1993-07-01
Place of Birth: Mogi das Cruzes, São Paulo, Brazil
Height: 174 cm
Weight: 64 kg
Current Club: Paris Saint-Germain (PSG)
Position: Forward
Preferred Foot: Right
Market Value: €128 million
Biography:
Neymar hellal player Neymar Neymar. Brazil goal player Neymar. Neymar goal player PSG. Hellal football PSG soccer Brazil goal player. Sport football Brazil. Soccer Neymar Barcelona player sport. Soccer Neymar soccer player player. Barcelona sport goal PSG. Neymar Brazil goal football goal. Player football player Neymar football soccer. Sport PSG Brazil Brazil player goal goal. Barcelona Barcelona soccer Neymar soccer sport PSG. Brazil sport Barcelona Barcelona football PSG player. Goal football Brazil goal. Hellal Brazil soccer goal football Neymar. Sport PSG Neymar PSG sport soccer. Barcelona hellal PSG hellal Neymar sport. Hellal goal soccer Brazil goal PSG. Player player Brazil player. Goal P

In [93]:
# Preprocess the documents
preprocessed_documents = [preprocess_and_normalize(doc) for doc in documents]


In [94]:
# Extract unique words from all documents
unique_words = set()
for doc in preprocessed_documents:
    unique_words.update(doc.split())

# Print unique words extracted from the documents
print("Unique Words:")
print(unique_words)
print('Number Of Unique Words : ', len(unique_words))

Unique Words:
{'madrid', 'madeira', 'portuguese', 'portugal', 'paulo', 'barcelona', 'current', 'manchester', 'place', 'left', '172', 'brazil', 'million', 'argentinian', 'paris', 'ronaldo', 'argentina', 'goal', 'brazilian', 'cruzes', 'fe', 'preferred', '19870614', 'mogi', 'rosario', 'da', '75', 'club', 'hellal', 'good', 'profile', 'cm', '19830517', 'position', 'hero', '19990711', 'psg', '175', 'value', 'name', 'birth', 'right', 'weight', 'market', 'biography', 'são', 'best', 'united', 'nationality', 'height', '128', 'sport', 'messi', 'neymar', 'santa', 'forward', 'football', '100', 'real', 'player', 'kg', 'funchal', '110', 'saintgermain', 'soccer', 'date', 'foot'}
Number Of Unique Words :  67


In [95]:
# Calculate TF-IDF using built-in functions (TfidfVectorizer from scikit-learn)
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the preprocessed documents to obtain the TF-IDF matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_documents)

# Get the feature names (unique words) from the TfidfVectorizer
feature_names = tfidf_vectorizer.get_feature_names_out()

# Print the shape of the feature names array and the TF-IDF matrix
print('Shape of Feature Names:', feature_names.shape)
print('Shape of TF_IDF Matrix:', tfidf_matrix.shape)

# Print TF-IDF using built-in functions
print("\nTF-IDF using built-in functions:")
print("Feature Names:", feature_names)
print("TF-IDF Matrix:\n", tfidf_matrix.toarray())


Shape of Feature Names: (67,)
Shape of TF_IDF Matrix: (3, 67)

TF-IDF using built-in functions:
Feature Names: ['100' '110' '128' '172' '175' '19830517' '19870614' '19990711' '75'
 'argentina' 'argentinian' 'barcelona' 'best' 'biography' 'birth' 'brazil'
 'brazilian' 'club' 'cm' 'cruzes' 'current' 'da' 'date' 'fe' 'foot'
 'football' 'forward' 'funchal' 'goal' 'good' 'height' 'hellal' 'hero'
 'kg' 'left' 'madeira' 'madrid' 'manchester' 'market' 'messi' 'million'
 'mogi' 'name' 'nationality' 'neymar' 'paris' 'paulo' 'place' 'player'
 'portugal' 'portuguese' 'position' 'preferred' 'profile' 'psg' 'real'
 'right' 'ronaldo' 'rosario' 'saintgermain' 'santa' 'soccer' 'sport' 'são'
 'united' 'value' 'weight']
TF-IDF Matrix:
 [[0.         0.         0.02043954 0.         0.0155448  0.
  0.02043954 0.         0.01207192 0.         0.         0.20208237
  0.         0.01207192 0.02414384 0.42923036 0.02043954 0.01207192
  0.01207192 0.02043954 0.01207192 0.02043954 0.01207192 0.
  0.01207192 0.39

In [96]:
# Create a set to store each document's TF-IDF vectors
document_tfidf_sets = []

# Iterate over each document and its corresponding TF-IDF vector
print("\nTF-IDF using built-in functions:")
for i, doc in enumerate(preprocessed_documents, start=1):
    # Print the TF-IDF Vector for the current document
    print(f"\nTF-IDF Vector for Document {i}:")

    # Create a set to store the TF-IDF vectors for the current document
    tfidf_set = set()

    # Iterate over each word and its corresponding TF-IDF value
    for word_index, word in enumerate(feature_names):
        # Access the TF-IDF value from the TF-IDF matrix
        tfidf_value = tfidf_matrix[i-1, word_index]

        # Check if the TF-IDF value is non-zero
        if tfidf_value > 0:
            # Add the word and its TF-IDF value to the set
            tfidf_set.add((word, tfidf_value))

    # Print the TF-IDF vector for the current document
    print(tfidf_set)

    # Add the TF-IDF set for the current document to the list of document sets
    document_tfidf_sets.append(tfidf_set)




TF-IDF using built-in functions:

TF-IDF Vector for Document 1:
{('hellal', 0.38835128062147467), ('weight', 0.012071922228628204), ('foot', 0.012071922228628204), ('right', 0.015544797833836685), ('value', 0.012071922228628204), ('height', 0.012071922228628204), ('biography', 0.012071922228628204), ('neymar', 0.3474721984507932), ('soccer', 0.2776542112584487), ('current', 0.012071922228628204), ('market', 0.012071922228628204), ('são', 0.020439541085340775), ('nationality', 0.012071922228628204), ('football', 0.3983734335447307), ('brazilian', 0.020439541085340775), ('paulo', 0.020439541085340775), ('profile', 0.012071922228628204), ('128', 0.020439541085340775), ('19870614', 0.020439541085340775), ('brazil', 0.42923036279215626), ('psg', 0.3419855523444071), ('million', 0.012071922228628204), ('barcelona', 0.2020823718398769), ('date', 0.012071922228628204), ('place', 0.012071922228628204), ('75', 0.012071922228628204), ('175', 0.015544797833836685), ('mogi', 0.020439541085340775),

In [97]:
# Function to calculate term frequency (TF) for a term in a document
def term_frequency(term, document):
    # Count the occurrences of the term in the document
    word_counts = Counter(document.split())
    # Calculate the term frequency (TF) as the ratio of term occurrences to total words in the document
    tf = word_counts[term] / sum(word_counts.values())
    return tf


In [98]:
# Function to calculate inverse document frequency (IDF) for a term across all documents
def inverse_document_frequency(term, documents):
    # Count the number of documents that contain the term
    num_documents_with_term = sum(1 for document in documents if term in document)
    # Calculate the IDF using the logarithm of the ratio of total documents to the number of documents containing the term
    return math.log((1 + len(documents)) / (1 + num_documents_with_term)) + 1


In [99]:
# Function to calculate TF-IDF for a term in a document
def tfidf(term, document, documents):
    # Calculate TF using the term_frequency function
    tf = term_frequency(term, document)
    # Calculate IDF using the inverse_document_frequency function
    idf = inverse_document_frequency(term, documents)
    # Calculate TF-IDF score
    tfidf_score = tf * idf
    return tfidf_score

In [100]:
# Calculate TF-IDF vectors for each document from scratch
import math
# Initialize an empty list to store TF-IDF vectors for each document
tfidf_vectors = []

# Iterate through each preprocessed document
for doc in preprocessed_documents:
    # Initialize an empty dictionary to store TF-IDF scores for terms in the document
    document_vector = {}
    # Split the document into terms and iterate through each term
    for term in doc.split():
        # Calculate the TF-IDF score for the term in the current document
        score = tfidf(term, doc, preprocessed_documents)
        # Add the term and its TF-IDF score to the document vector dictionary
        document_vector[term] = score
    # Append the document vector dictionary to the list of TF-IDF vectors
    tfidf_vectors.append(document_vector)

# Print TF-IDF vectors calculated from scratch
print("\nTF-IDF from scratch:")
for i, vector in enumerate(tfidf_vectors, start=1):
    print(f"TF-IDF Vector for Document {i}: {vector}")



TF-IDF from scratch:
TF-IDF Vector for Document 1: {'player': 0.0847457627118644, 'profile': 0.00423728813559322, 'neymar': 0.12196399181999606, 'name': 0.00423728813559322, 'nationality': 0.00423728813559322, 'brazilian': 0.0071743524599997685, 'date': 0.00423728813559322, 'birth': 0.00847457627118644, '19870614': 0.0071743524599997685, 'place': 0.00423728813559322, 'mogi': 0.0071743524599997685, 'da': 0.00423728813559322, 'cruzes': 0.0071743524599997685, 'são': 0.0071743524599997685, 'paulo': 0.0071743524599997685, 'brazil': 0.15066140165999514, 'height': 0.00423728813559322, '175': 0.00545627996801602, 'cm': 0.00423728813559322, 'weight': 0.00423728813559322, '75': 0.00423728813559322, 'kg': 0.00423728813559322, 'current': 0.00423728813559322, 'club': 0.00423728813559322, 'paris': 0.00545627996801602, 'saintgermain': 0.00545627996801602, 'psg': 0.12003815929635245, 'position': 0.00423728813559322, 'forward': 0.00423728813559322, 'preferred': 0.00423728813559322, 'foot': 0.004237288

In [101]:
# Calculate TF-IDF scores for all terms in all documents
tfidf_scores = {}
for term in set(term for document in preprocessed_documents for term in document.split()):
    tfidf_scores[term] = [tfidf(term, doc, preprocessed_documents) for doc in preprocessed_documents]

# Calculate the sum of squares of all TF-IDF scores
sum_of_squares = sum(tfidf_score ** 2 for doc_scores in tfidf_scores.values() for tfidf_score in doc_scores)

# Normalize TF-IDF scores
normalized_tfidf_scores = {}
for term, doc_scores in tfidf_scores.items():
    normalized_tfidf_scores[term] = [tfidf_score / math.sqrt(sum_of_squares) for tfidf_score in doc_scores]
print(normalized_tfidf_scores)

{'madrid': [0.0, 0.0, 0.20831595526523713], 'madeira': [0.0, 0.0, 0.008012152125586041], 'portuguese': [0.0, 0.0, 0.008012152125586041], 'portugal': [0.0, 0.0, 0.19229165101406503], 'paulo': [0.010524437114117258, 0.0, 0.0], 'barcelona': [0.10405337406649441, 0.17589589362595195, 0.0], 'current': [0.006215902099330014, 0.005939080548347706, 0.004732106114328656], 'manchester': [0.0, 0.0, 0.2804253243955115], 'place': [0.006215902099330014, 0.005939080548347706, 0.004732106114328656], 'left': [0.0, 0.0, 0.008012152125586041], '172': [0.0, 0.0, 0.008012152125586041], 'brazil': [0.22101317939646245, 0.0, 0.0], 'million': [0.006215902099330014, 0.005939080548347706, 0.004732106114328656], 'argentinian': [0.0, 0.010055737485553332, 0.0], 'paris': [0.008004105697422648, 0.007647647548954433, 0.0], 'ronaldo': [0.0, 0.0, 0.26440102014433936], 'argentina': [0.0, 0.27150491210994, 0.0], 'goal': [0.11188623778794027, 0.15441609425704034, 0.14196318342985967], 'brazilian': [0.010524437114117258, 0