In [2]:
import os
import pandas as pd
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from difflib import SequenceMatcher
from fuzzywuzzy import fuzz

# Score every preprocessed Google-search file against the search terms that
# belong to the same spike date, write one "_scores_" CSV per file, and collect
# the ten best-scoring rows of each file into a single summary CSV.

# Define the directory paths
data_dir = r'D:\Research\Python\Data\WIP\Google search data Preprocessed'
output_dir = r'D:\Research\Python\Data\WIP\Final Association Rated'

# Make sure the output directory exists before any CSV is written into it
os.makedirs(output_dir, exist_ok=True)

# Load keywords_df
keywords_df = pd.read_csv(os.path.join(data_dir, 'Search_Terms_Hashtags.csv'))

# Convert "Spike Date" column to datetime format
keywords_df['Spike Date'] = pd.to_datetime(keywords_df['Spike Date'], format='%m/%d/%Y')

# List all files in the directory
file_list = [file for file in os.listdir(data_dir) if file.endswith('_preprocessed.csv')]

# Initialize an empty list to store the top rows
top_rows = []

# Process each file
for file in file_list:
    # The keywords table itself is not a search-result file
    if file == 'Search_Terms_Hashtags_preprocessed.csv':
        continue

    # Extract date from the file name (third '_'-separated token, MM-DD-YYYY).
    # A file that does not follow this naming is skipped instead of aborting
    # the whole batch.
    try:
        date_str = file.split('_')[2]
        date = pd.to_datetime(date_str, format='%m-%d-%Y')
    except (IndexError, ValueError):
        print(f"Skipping {file}: could not parse a date from the file name")
        continue

    # Load the dataset and drop null rows from 'Combined_text_preprocessed'
    df = pd.read_csv(os.path.join(data_dir, file)).dropna(subset=['Combined_text_preprocessed'])
    if df.empty:
        # Nothing to score; cosine_similarity would reject a zero-row matrix
        print(f"Skipping {file}: no rows with 'Combined_text_preprocessed'")
        continue

    # Find matching row in keywords_df based on Spike Date
    matching_row = keywords_df[keywords_df['Spike Date'] == date]
    if matching_row.empty:
        print(f"No matching row found for {file}")
        continue

    # Get the Terms column (first match only)
    terms = matching_row['Terms'].iloc[0]
    if not isinstance(terms, str) or not terms.strip():
        # A missing/blank Terms cell cannot be vectorized or compared
        print(f"Skipping {file}: 'Terms' is empty for {date_str}")
        continue

    texts = df['Combined_text_preprocessed']

    # Cosine similarity: the vectorizer is fitted on the terms only, so each
    # text is projected onto the vocabulary of the search terms.
    vectorizer = TfidfVectorizer()
    tfidf_matrix_terms = vectorizer.fit_transform([terms])
    cosine_similarity_scores = cosine_similarity(tfidf_matrix_terms, vectorizer.transform(texts))

    # SequenceMatcher similarity for each row (already in [0, 1])
    sequence_similarity_scores = [
        SequenceMatcher(None, terms, text).ratio() for text in texts
    ]

    # fuzz.ratio is a Levenshtein-based *similarity* on a 0-100 scale.
    # Dividing by 100 maps it to [0, 1]; the distance is its complement.
    levenshtein_similarities = [fuzz.ratio(terms, text) / 100 for text in texts]
    levenshtein_distances = [1 - similarity for similarity in levenshtein_similarities]

    # Add the scores to the dataframe
    df['Cosine_Similarity'] = cosine_similarity_scores[0]
    df['Sequence_Similarity'] = sequence_similarity_scores
    df['Levenshtein_Distance'] = levenshtein_distances

    # Overall score: sum of three similarities, each in [0, 1]. The distance
    # column is converted back to a similarity so that a closer match raises
    # the score instead of lowering it.
    df['Overall_Score'] = (
        df['Cosine_Similarity']
        + df['Sequence_Similarity']
        + (1 - df['Levenshtein_Distance'])
    )

    # Select the top ten rows by Overall_Score and add to the list
    top_rows.append(df.nlargest(10, 'Overall_Score'))

    # Save the modified dataframe with a new name
    new_filename = file.replace('_preprocessed.csv', f'_scores_{date_str}.csv')
    df.to_csv(os.path.join(output_dir, new_filename), index=False)

if top_rows:
    # Concatenate the top rows into a single dataframe
    final_df = pd.concat(top_rows, ignore_index=True)

    # Save the concatenated dataframe
    final_df.to_csv(os.path.join(output_dir, 'final_association_rated.csv'), index=False)
else:
    # pd.concat raises ValueError on an empty list, so report instead
    final_df = pd.DataFrame()
    print("No files were scored; 'final_association_rated.csv' was not written.")

print("Process completed.")


Process completed.
