In [None]:
import pandas as pd
import re
import math
from collections import Counter

def preprocess_text(text):
    """Return ``text`` as a string stripped of punctuation and special characters.

    The value is first coerced with ``str()``, so non-string inputs are
    converted to their string form rather than raising. Only the characters
    a-z, A-Z, 0-9 and whitespace survive; everything else (accented letters
    included) is deleted outright, not replaced by a space.
    """
    # Anything that is not an ASCII letter, a digit or whitespace gets dropped.
    unwanted = re.compile(r'[^a-zA-Z0-9\s]')
    return unwanted.sub('', str(text))

def compute_tf(word_freq, document):
    """Compute the term frequency (TF) of every word in a single document.

    Args:
        word_freq: Mapping of word -> raw occurrence count in the document.
        document: The document's tokens; only its length is used, as the
            denominator that normalises the counts.

    Returns:
        dict mapping each word of ``word_freq`` to ``count / len(document)``.

    Raises:
        ZeroDivisionError: if ``document`` is empty while ``word_freq`` is not.
    """
    n_tokens = len(document)
    return {term: count / n_tokens for term, count in word_freq.items()}

def compute_idf(documents, word):
    """Compute the inverse document frequency (IDF) of ``word``.

    IDF is ``log(N / df)``, where ``N`` is the number of documents and ``df``
    is the number of documents that contain ``word`` as a whole token.

    Args:
        documents: Sized iterable of documents. Each document may be a raw
            string (it is split on whitespace into tokens here) or an
            already-tokenised collection such as a list, set or Counter.
        word: The token whose IDF is wanted.

    Returns:
        float: The natural-log IDF, or 0.0 when no document contains ``word``.
    """
    def contains_token(doc):
        # On a plain string, ``word in doc`` is a *substring* test, so 'go'
        # would count as present in a document that only contains '64go'.
        # Split strings into tokens so that only whole-word matches count.
        if isinstance(doc, str):
            return word in doc.split()
        return word in doc

    num_documents_with_word = sum(1 for doc in documents if contains_token(doc))
    if num_documents_with_word > 0:
        return math.log(len(documents) / num_documents_with_word)
    # Word absent from the whole corpus: avoid dividing by zero.
    return 0.0

def compute_tfidf(tf, idf):
    """Combine term frequencies and inverse document frequencies into TF-IDF.

    Args:
        tf: Mapping of word -> term frequency for one document.
        idf: Mapping of word -> inverse document frequency; it must have an
            entry for every word in ``tf`` (extra entries are ignored).

    Returns:
        dict mapping each word of ``tf`` to ``tf[word] * idf[word]``.

    Raises:
        KeyError: if a word in ``tf`` has no entry in ``idf``.
    """
    return {term: weight * idf[term] for term, weight in tf.items()}

# Read data from Excel file
excel_file_path = '/content/Copy of TF IDF ALBERT SCHOOL.xlsx'
df = pd.read_excel(excel_file_path)

# Preprocess text in the 'product_name' column (strip punctuation and special
# characters) and store the cleaned text alongside the original.
df['processed_text'] = df['product_name'].apply(preprocess_text)

# Tokenize on whitespace and count how often each word occurs per document
word_freqs = [Counter(doc.split()) for doc in df['processed_text']]

# TF-IDF analysis
# A word's IDF depends only on the corpus, not on the document being scored,
# so it is computed once per distinct word and cached. Calling compute_idf for
# every (document, word) pair would rescan the whole corpus each time.
idf_cache = {}
tfidf_values = []
for i, doc in enumerate(df['processed_text']):
    tf_values = compute_tf(word_freqs[i], doc.split())
    for word in tf_values:
        if word not in idf_cache:
            idf_cache[word] = compute_idf(df['processed_text'], word)
    # compute_tfidf only reads the cache entries for the words in tf_values.
    tfidf_values.append(compute_tfidf(tf_values, idf_cache))

# Print TF-IDF values
for i, tfidf_doc in enumerate(tfidf_values):
    print(f"TF-IDF for Document {i + 1}: {tfidf_doc}")

# Word counts of the product names with punctuation and special characters
# deleted. 'processed_text' was produced by the same preprocess_text call, so
# this yields the same counts as word_freqs; it is kept as a separately named
# list in case later cells refer to it.
processed_word_freqs = [Counter(preprocess_text(doc).split()) for doc in df['product_name']]



TF-IDF for Document 1: {'xiaomi': 0.1308602089131961, 'mi': 0.08315843148870618, '9t': 0.21240693603789554, 'pro': 0.09794163315228889, 'smartphone': 0.09008684535084639, '6': 0.03058090936975933, '128': 0.09794163315228889, 'go': 0.010407688715239897, 'redmi': 0.1308602089131961, 'k20': 0.21240693603789554, 'chargeur': 0.21240693603789554, 'rapide': 0.21240693603789554, '4000mah': 0.21240693603789554, '27w': 0.21240693603789554, '639': 0.21240693603789554, 'flamme': 0.21240693603789554, 'rouge': 0.1716335724755458}
TF-IDF for Document 2: {'huawei': 0.10111925234201519, 'honor': 0.13262594236746722, 'v10': 0.16413263239291928, 'view': 0.10111925234201519, '10': 0.05946967362046268, 'smartphone': 0.06961256231656313, '4g': 0.06961256231656313, '6go': 0.11419571018073249, '64go': 0.11419571018073249, '599android': 0.16413263239291928, '80': 0.11419571018073249, 'kirin': 0.16413263239291928, '970': 0.16413263239291928, '236': 0.16413263239291928, 'ghz': 0.13262594236746722, 'octa': 0.1641

In [None]:

# tfidf_values is expected to be a list holding one {word: tfidf} dictionary per document,
# e.g. [{'word1': tfidf1, 'word2': tfidf2, ...}, {'word1': tfidf1, 'word2': tfidf2, ...}, ...]

# Destination workbook for the TF-IDF table
excel_file_path_tfidf = '/content/Verif.xlsx'

# Build the table: one row per dictionary in tfidf_values, one column per word
df_tfidf = pd.DataFrame(tfidf_values)

# Save it without the row index, then report where the file was written
df_tfidf.to_excel(excel_file_path_tfidf, index=False)
print(f"TF-IDF values written to Excel file: {excel_file_path_tfidf}")


TF-IDF values written to Excel file: /content/Verif.xlsx
