In [1]:
from datetime import date, timedelta
import pandas as pd
import numpy as np

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

ModuleNotFoundError: No module named 'seaborn'

In [3]:
class SubstitutionMatrix:
    """
    Square table of substitution scores between events.

    The table has ``num_events + 2`` rows and columns: one per event plus the
    two extra labels "#" (first) and "*" (last) that `set_index` places around
    the caller's event labels.  The distance functions in this notebook look
    up gap costs against "#"; the role of "*" is not visible in this class.

    Attributes:
        num_events: side length of the matrix (number of events + 2).
        matrix:     numpy array holding the scores.
        indexs:     row/column labels; only set after `set_index`.
        df_matrix:  labelled pandas DataFrame; only set after `to_df` or
                    `read_matrix_from_df`.
    """
    def __init__(self, num_events):
        """
        Create a zero-filled matrix for `num_events` events.

        Raises:
            ValueError: if `num_events` is not a positive integer.
        """
        # Validate first: doing arithmetic on a bad value would raise a
        # TypeError before the intended ValueError could be reported.
        if not isinstance(num_events, int) or num_events <= 0:
            raise ValueError("Number of events must be a positive integer.")
        self.num_events = num_events + 2
        self.matrix = np.full((self.num_events, self.num_events), 0)

    def set_index(self, indexs):
        """Store the labels as "#", then the given event labels, then "*"."""
        self.indexs = ["#"] + list(indexs) + ["*"]

    def read_matrix_from_file(self, filename):
        """
        Replace the matrix with the one stored in a whitespace-separated text
        file (one row per line, blank lines ignored).

        A missing file is reported with a printed message and leaves the
        current matrix untouched.

        Raises:
            ValueError: if a token is not a number or the rows do not form a
                non-empty square matrix.
        """
        try:
            with open(filename, 'r') as file:
                rows = [list(map(float, line.split())) for line in file if line.strip()]
        except FileNotFoundError:
            print(f"Error: File '{filename}' not found.")
            return
        # numpy arrays have no append(); build the whole array in one go.
        loaded = np.array(rows, dtype=float)
        if loaded.ndim != 2 or loaded.shape[0] != loaded.shape[1]:
            raise ValueError(f"File '{filename}' does not contain a square matrix.")
        self.matrix = loaded
        # Keep the size used by iniate_scores in step with the loaded data.
        self.num_events = loaded.shape[0]

    def iniate_scores(self, match_score, unmatch_score):
        """
        Add `match_score` to every diagonal cell and `unmatch_score` to every
        off-diagonal cell of the current matrix.
        """
        size = self.num_events
        diagonal_part = np.diag(np.full(size, match_score - unmatch_score))
        uniform_part = np.full((size, size), unmatch_score)
        # Out-of-place addition lets an integer matrix be promoted to float;
        # an in-place += rejects fractional scores on an int array.
        self.matrix = self.matrix + diagonal_part + uniform_part

    def set_score(self, event1, event2, score):
        """Set the score for (event1 -> event2) in `df_matrix`; requires `to_df` first."""
        self.df_matrix.loc[event1, event2] = score

    def set_score_2ways(self, event1, event2, score):
        """Set the same score for both (event1, event2) and (event2, event1) in `df_matrix`."""
        self.df_matrix.loc[event1, event2] = score
        self.df_matrix.loc[event2, event1] = score

    def read_matrix_from_df(self, df):
        """Use an existing DataFrame as `df_matrix` (stored by reference, not copied)."""
        self.df_matrix = df

    def to_df(self):
        """Build `df_matrix` from `matrix`, labelled with `indexs` on both axes."""
        self.df_matrix = pd.DataFrame(data = self.matrix, index = self.indexs, columns = self.indexs)

    def save_matrix_txt(self, path_out):
        """Write the raw numpy matrix to `path_out` with one decimal place per value."""
        np.savetxt(path_out, self.matrix, fmt = '%.1f')

    def save_df_matrix_txt(self, path_out):
        """Write `df_matrix`, including its labels, to `path_out` as tab-separated text."""
        self.df_matrix.to_csv(path_out, sep='\t')

    def __str__(self):
        """
        Returns a string representation of the substitution matrix: the
        labelled table when it exists, otherwise the raw numpy matrix.
        """
        # __str__ must return a str; returning the DataFrame itself raises TypeError.
        labelled = getattr(self, "df_matrix", None)
        if labelled is None:
            return str(self.matrix)
        return str(labelled)

def Initate_Submatrix(seq1, seq2):
    """
    Build a SubstitutionMatrix covering every distinct symbol of two sequences.

    The symbols of `seq1 + seq2` are de-duplicated and sorted, used as the
    matrix labels, and scored 0 for a match and 1 for a mismatch (plain
    Levenshtein costs).  The labelled DataFrame is built before returning.

    Returns:
        The populated SubstitutionMatrix instance.
    """
    alphabet = sorted(set(seq1 + seq2))
    sub_matrix = SubstitutionMatrix(len(alphabet))
    sub_matrix.set_index(alphabet)
    # iniate_scores(match, mismatch): (0, 1) is the Levenshtein setting.
    sub_matrix.iniate_scores(0, 1)
    sub_matrix.to_df()
    return sub_matrix

def levenshtein_distance(seq1, seq2, df_sub_matrix):
    """
    Build the weighted edit-distance (Levenshtein) table for two sequences.

    Both sequences are prefixed with "#", so index ``i`` of the prefixed
    sequence is its i-th symbol and row/column 0 of the table stands for the
    empty prefix.  Costs are read from `df_sub_matrix` (rows are seq1
    symbols, columns are seq2 symbols):

      * skipping seq1 symbol ``a``     -> ``df_sub_matrix.loc[a, "#"]``
      * skipping seq2 symbol ``b``     -> ``df_sub_matrix.loc["#", b]``
      * replacing ``a`` by ``b``       -> ``df_sub_matrix.loc[a, b]``
      * identical symbols              -> 0 (the diagonal is not consulted)

    With a 0/1 matrix this is the classic Levenshtein distance.

    Args:
        seq1: The first sequence (string, or iterable of one-character strings).
        seq2: The second sequence (same form as seq1).
        df_sub_matrix: pandas DataFrame of costs labelled with "#" and every
            symbol that occurs in the sequences.
    Returns:
        The full DP table as a numpy array of shape
        ``(len(seq1) + 1, len(seq2) + 1)``.  The distance itself is the
        bottom-right cell, which is also printed.
    """
    gap = "#"
    seq1 = gap + ''.join(seq1)
    seq2 = gap + ''.join(seq2)
    m = len(seq1)
    n = len(seq2)
    # Ref: https://medium.com/@ethannam/understanding-the-levenshtein-distance-equation-for-beginners-c4285a5604f0

    # Keep an integer table for integer costs, but switch to float when the
    # cost matrix is fractional so that costs are not truncated on assignment.
    cost_dtype = df_sub_matrix.to_numpy().dtype
    dp = np.zeros((m, n), dtype=int if np.issubdtype(cost_dtype, np.integer) else float)

    # First column / row: cumulative cost of skipping every symbol so far.
    for i in range(1, m):
        dp[i][0] = dp[i-1][0] + df_sub_matrix.loc[seq1[i], gap]
    for j in range(1, n):
        dp[0][j] = dp[0][j-1] + df_sub_matrix.loc[gap, seq2[j]]

    ## Fill the DP table
    for i in range(1, m):
        for j in range(1, n):
            # The gap cost belongs to the symbol consumed by this step,
            # i.e. seq1[i] / seq2[j], not the one before it.
            skip_seq1_cost = dp[i-1][j] + df_sub_matrix.loc[seq1[i], gap]
            skip_seq2_cost = dp[i][j-1] + df_sub_matrix.loc[gap, seq2[j]]
            cost = (0 if seq1[i] == seq2[j] else df_sub_matrix.loc[seq1[i], seq2[j]])
            substitution_cost = dp[i-1][j-1] + cost
            dp[i][j] = min(skip_seq1_cost, skip_seq2_cost, substitution_cost)
    print ("Dis: ", dp[m - 1][n - 1])
    return dp

def Levenshtein_Distance_with_Transposition_Date(seq1, seq2, dates1, dates2, df_sub_matrix, max_transposition_date):
    """
    Weighted Levenshtein distance with a date-window relaxation of the
    diagonal (substitution) move.

    Both sequences are prefixed with "#", so index ``i`` of a prefixed
    sequence is its i-th event and the date lists are padded with a
    placeholder at index 0 to stay aligned.  Each cell takes the cheapest of:

      * skipping seq1[i]:  ``dp[i-1][j] + df_sub_matrix.loc[seq1[i], "#"]``
      * skipping seq2[j]:  ``dp[i][j-1] + df_sub_matrix.loc["#", seq2[j]]``
      * substituting:      ``dp[i-1][j-1] + cost(seq1[i], seq2[j])``
      * date-window move:  ``dp[i-1][j-1] + cost(seq1[k], seq2[j])`` for any
        seq1 event ``k`` dated fewer than `max_transposition_date` days from
        ``dates2[j]``, or ``dp[i-1][j-1] + cost(seq1[i], seq2[k])`` for any
        seq2 event ``k`` dated fewer than that many days from ``dates1[i]``.

    ``cost(a, b)`` is 0 for identical symbols, otherwise
    ``df_sub_matrix.loc[a, b]``.

    Args:
        seq1: The first sequence (string or iterable of one-character strings).
        seq2: The second sequence (same form as seq1).
        dates1: One date per event of seq1; subtracting two entries must give
            an object with a ``.days`` attribute (e.g. ``datetime.date``).
        dates2: One date per event of seq2, same requirements as dates1.
        df_sub_matrix: Pandas DataFrame containing substitution costs,
            labelled with "#" and every symbol in the sequences.
        max_transposition_date: Width of the date window in days (strict
            ``<`` comparison, so 0 disables the date-window move).
    Returns:
        ``(distance, dp)``: the bottom-right cell of the DP table and the
        table itself (numpy array of shape ``(len(seq1)+1, len(seq2)+1)``).
    """
    gap = "#"
    seq1 = gap + ''.join(seq1)
    seq2 = gap + ''.join(seq2)
    # Pad with a placeholder so dates line up with the "#"-prefixed sequences.
    # A plain list also works for empty inputs, where np.insert(..., "None")
    # fails because it tries to cast the string to the array's dtype.
    dates1 = [None] + list(dates1)
    dates2 = [None] + list(dates2)
    m = len(seq1)
    n = len(seq2)

    def pair_cost(a, b):
        # Identical symbols are free; otherwise read the substitution cost.
        return 0 if a == b else df_sub_matrix.loc[a, b]

    def in_window(day_a, day_b):
        return abs((day_a - day_b).days) < max_transposition_date

    # The date-window candidates depend only on the column (first list) or
    # only on the row (second list), so find the cheapest one once instead of
    # rescanning both sequences inside every cell.  None = nothing in window.
    best_for_col = [None] * n
    for j in range(1, n):
        nearby = [pair_cost(seq1[k], seq2[j]) for k in range(1, m) if in_window(dates1[k], dates2[j])]
        if nearby:
            best_for_col[j] = min(nearby)
    best_for_row = [None] * m
    for i in range(1, m):
        nearby = [pair_cost(seq1[i], seq2[k]) for k in range(1, n) if in_window(dates1[i], dates2[k])]
        if nearby:
            best_for_row[i] = min(nearby)

    # Integer table for integer costs, float otherwise, so fractional costs
    # are not truncated when stored.
    cost_dtype = df_sub_matrix.to_numpy().dtype
    dp = np.zeros((m, n), dtype=int if np.issubdtype(cost_dtype, np.integer) else float)

    # First column / row: cumulative cost of skipping every event so far.
    for i in range(1, m):
        dp[i][0] = dp[i-1][0] + df_sub_matrix.loc[seq1[i], gap]
    for j in range(1, n):
        dp[0][j] = dp[0][j-1] + df_sub_matrix.loc[gap, seq2[j]]

    # Fill the DP table
    for i in range(1, m):
        for j in range(1, n):
            diagonal = dp[i-1][j-1]
            # Gap costs are charged for the event consumed by the step
            # (seq1[i] / seq2[j]), not the one before it.
            best = min(
                dp[i-1][j] + df_sub_matrix.loc[seq1[i], gap],
                dp[i][j-1] + df_sub_matrix.loc[gap, seq2[j]],
                diagonal + pair_cost(seq1[i], seq2[j]),
            )
            for window_cost in (best_for_col[j], best_for_row[i]):
                if window_cost is not None:
                    best = min(best, diagonal + window_cost)
            dp[i][j] = best

    return dp[m - 1][n - 1], dp

def Normalize_Levenshtein_Distance_Score(seq1, seq2, dates1, dates2, df_sub_matrix, max_transposition_date):
    """
    Turn the date-aware Levenshtein distance of two sequences into a
    similarity score.

    The distance from `Levenshtein_Distance_with_Transposition_Date` is
    rescaled between two reference values taken from `df_sub_matrix`:

      * ``distance_min``: the smaller of the two sequences' summed
        self-match costs ``df_sub_matrix.loc[c, c]``;
      * ``distance_max``: the larger of the two sequences' summed gap costs
        ``df_sub_matrix.loc[c, "#"]``.

    The result is ``1 - (distance - distance_min) / (distance_max - distance_min)``.
    It is intended to lie in 0~1, but no clamping is applied.  When both
    reference values coincide the score is 0.  The raw distance is printed.

    Args:
        seq1, seq2: Sequences of symbols present in `df_sub_matrix`.
        dates1, dates2: One date per symbol of the matching sequence.
        df_sub_matrix: Pandas DataFrame of substitution costs, including "#".
        max_transposition_date: Date window (days) passed to the distance function.
    Returns:
        The similarity score.
    """
    distance, _ = Levenshtein_Distance_with_Transposition_Date(seq1, seq2, dates1, dates2, df_sub_matrix, max_transposition_date)
    print ("Leven_Distance", distance)

    def summed(sequence, column_for):
        # Sum df_sub_matrix.loc[symbol, column_for(symbol)] over the sequence.
        return sum(df_sub_matrix.loc[symbol, column_for(symbol)] for symbol in sequence)

    # Cost of every symbol matched with itself.
    distance_min = min(summed(seq1, lambda s: s), summed(seq2, lambda s: s))
    # Cost of every symbol paired with the gap symbol.
    distance_max = max(summed(seq1, lambda s: "#"), summed(seq2, lambda s: "#"))

    # Check the degenerate range before dividing; a guard placed after the
    # division only runs once the division by zero has already happened.
    if distance_max == distance_min:
        return 0
    normalized_score = (distance - distance_min) / (distance_max - distance_min)
    return 1 - normalized_score

In [None]:
class Person:
    """A named person together with the stories attached to them."""

    def __init__(self, name):
        # Every instance starts with its own empty story list.
        self.name, self.storys = name, []

    def add_story(self, story):
        """Add one story to the end of this person's story list."""
        self.storys += [story]

    def introduce(self):
        """Print the person's name followed by their list of stories."""
        message = f"I'm {self.name}, {self.storys}"
        print(message)

# Demo: build one Person, attach a single story, then print the introduction.
person1 = Person("Alice")
person1.add_story("I love to travel and explore new cultures.")
person1.introduce()