In [1]:
import os
from difflib import SequenceMatcher

import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Score how well each row's 'combination' (search-term combination) matches its
# 'Combined_text_preprocessed' text, for every preprocessed CSV in data_dir.
# For each input file a "<name>_scores_<date>.csv" copy with the score columns
# is written to output_dir, and the 10 best rows of every file are gathered
# into 'final_association_rated.csv'.

# Define the directory paths
data_dir = r'D:\Research\Python\Data\WIP\Google search data Preprocessed'
output_dir = r'D:\Research\Python\Data\WIP\Final Association Rated'

# Make sure the output directory exists before anything is written to it
os.makedirs(output_dir, exist_ok=True)

# List all files in the directory
file_list = [file for file in os.listdir(data_dir) if file.endswith('_preprocessed.csv')]

# Define weights for each metric (sum of weights should be 1).
# They do not depend on the file, so they are set once outside the loop.
weight_cosine = 0.4
weight_sequence = 0.3
weight_levenshtein = 0.3

# Initialize an empty list to store the top rows
top_rows = []

# Process each file
for file in file_list:
    # This file matches the suffix filter but is excluded from scoring
    if file == 'Search_Terms_Hashtags_preprocessed.csv':
        continue

    # Extract date from the file name (assumed to be the third '_'-separated
    # field). Parsing it makes a malformed file name fail loudly here instead
    # of producing a badly named output file; the parsed value is not used
    # further.
    date_str = file.split('_')[2]
    date = pd.to_datetime(date_str, format='%m-%d-%Y')

    # Load the dataset and drop rows where either text column is missing
    df = pd.read_csv(os.path.join(data_dir, file)).dropna(
        subset=['combination', 'Combined_text_preprocessed']
    )
    if df.empty:
        # TfidfVectorizer cannot be fitted on zero documents
        print(f"Skipping {file}: no rows with both text columns present.")
        continue

    # Plain Python strings, in row order, shared by all three metrics
    combinations = df['combination'].astype(str).tolist()
    combined_texts = df['Combined_text_preprocessed'].astype(str).tolist()

    # Cosine similarity between each row's combination and that same row's
    # text. The vocabulary is fitted on the combinations only, so the text is
    # projected onto the search-term vocabulary. TfidfVectorizer L2-normalises
    # every row by default, so the row-wise dot product *is* the cosine
    # similarity (all-zero rows yield 0). This avoids building the full N x N
    # pairwise matrix, whose row means compared a combination with every
    # row's text rather than its own.
    vectorizer = TfidfVectorizer()
    tfidf_matrix_combined = vectorizer.fit_transform(combinations)
    tfidf_matrix_text = vectorizer.transform(combined_texts)
    cosine_similarity_scores = np.asarray(
        tfidf_matrix_combined.multiply(tfidf_matrix_text).sum(axis=1)
    ).ravel()

    # SequenceMatcher similarity for each row, in [0, 1]
    sequence_similarity_scores = [
        SequenceMatcher(None, combination, combined_text).ratio()
        for combination, combined_text in zip(combinations, combined_texts)
    ]

    # fuzz.ratio is a 0-100 *similarity* percentage, not an edit distance.
    # Convert it to a normalised distance in [0, 1] (0 = identical) so the
    # 'Levenshtein_Distance' column really is a distance and does not depend
    # on the string lengths.
    levenshtein_distances = [
        1 - fuzz.ratio(combination, combined_text) / 100
        for combination, combined_text in zip(combinations, combined_texts)
    ]

    # Weighted overall score; the distance is inverted back into a similarity
    # so that a higher value is better for all three components.
    overall_scores = (
        weight_cosine * cosine_similarity_scores
        + weight_sequence * np.asarray(sequence_similarity_scores)
        + weight_levenshtein * (1 - np.asarray(levenshtein_distances))
    )

    # Add the scores to the DataFrame (arrays/lists are assigned by position,
    # so the gaps left in the index by dropna do not matter)
    df['Cosine_Similarity'] = cosine_similarity_scores
    df['Sequence_Similarity'] = sequence_similarity_scores
    df['Levenshtein_Distance'] = levenshtein_distances
    df['Overall_Score'] = overall_scores

    # Select the top rows by Overall_Score and add to the list
    top_rows.append(df.nlargest(10, 'Overall_Score'))

    # Save the modified dataframe with a new name
    new_filename = file.replace('_preprocessed.csv', f'_scores_{date_str}.csv')
    df.to_csv(os.path.join(output_dir, new_filename), index=False)

if top_rows:
    # Concatenate the top rows into a single dataframe and save it
    final_df = pd.concat(top_rows, ignore_index=True)
    final_df.to_csv(os.path.join(output_dir, 'final_association_rated.csv'), index=False)
else:
    # pd.concat raises on an empty list; keep final_df defined and say why
    # no combined file was written.
    final_df = pd.DataFrame()
    print("No files were scored; 'final_association_rated.csv' was not written.")

print("Process completed.")




Process completed.
