In [None]:
# Install required libraries
!pip install pandas numpy scikit-learn hmmlearn opencv-python matplotlib openpyxl

# Required libraries for handling data and modeling
import numpy as np
import pandas as pd
from hmmlearn import hmm
from sklearn.preprocessing import StandardScaler
from google.colab import drive


# Mount Google Drive
drive.mount('/content/drive')

# After uploading, specify the file names for easy reference
feature_file = '/content/drive/MyDrive/Colab Notebooks/SFR_DCT_LINE 2.xlsx'  # DCT features file (uploaded manually)
label_file = '/content/drive/MyDrive/Colab Notebooks/line_gt_1_1.xlsx'        # Label file (uploaded manually)


# Step 2: Load the feature and label data
# Load the DCT feature data and label data from provided files
features = pd.read_excel(feature_file)
labels = pd.read_excel(label_file)

# Display column names for both datasets
print("Features file columns:", features.columns)
print("Labels file columns:", labels.columns)


# Step 3: Merge features and labels on 'ImageName' for consistent mapping
# This ensures we have corresponding labels for each set of features
merged_data = pd.merge(features, labels, on='image_name')
merged_data.fillna(0, inplace=True)  # Fill NaN values with zeros

# Step 4: Preprocess the data for HMM training
# Extract feature matrix (excluding 'ImageName' and 'gt' columns)
X = merged_data.drop(columns=['image_name', 'gt']).values

# Scale features to mean 0 and variance 1 (to avoid numerical issues in HMM)
scaler = StandardScaler()
# Step 1: Drop non-numeric columns like 'color' if they exist in the features
X = merged_data.select_dtypes(include=[np.number]).values  # Only numeric data

# Step 2: Scale features to mean 0 and variance 1 (to avoid numerical issues in HMM)
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Map unique characters in ground truth ('gt') to integer values for training
unique_chars = sorted(set(''.join(merged_data['gt'].values)))
char_to_int = {char: idx for idx, char in enumerate(unique_chars)}
int_to_char = {idx: char for char, idx in char_to_int.items()}

# Convert 'gt' labels into sequences of integers for training
y_sequences = [[char_to_int[char] for char in label] for label in merged_data['gt'].values]

# Pad sequences to ensure consistent sequence lengths (fill with -1 for padding)
max_sequence_length = max(len(seq) for seq in y_sequences)
y_padded = np.array([seq + [-1] * (max_sequence_length - len(seq)) for seq in y_sequences])

# Step 5: Split the data into training and testing sets
train_size = int(0.8 * len(X))
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y_padded[:train_size], y_padded[train_size:]

# Step 6: Initialize and Train the HMM model using Baum-Welch algorithm
# Here, we define the HMM model with Gaussian emissions
n_states = 6  # Number of hidden states (adjust based on complexity)
hmm_model = hmm.GaussianHMM(n_components=n_states, covariance_type="spherical", n_iter=2000, init_params='')

# Set start and transition probabilities uniformly
hmm_model.startprob_ = np.full(n_states, 1.0 / n_states)
hmm_model.transmat_ = np.full((n_states, n_states), 1.0 / n_states)

# Baum-Welch algorithm for training the HMM
# This will optimize the HMM's parameters (transition and emission probabilities) based on training data
try:
    hmm_model.fit(X_train)
    print("Model successfully trained with Baum-Welch algorithm.")
except Exception as e:
    print(f"Error during training: {e}")

# -------------------------------------------------------
# TEST FEATURE PREDICTION PART (using DCT features)
# -------------------------------------------------------

# Function to predict character sequences using Viterbi algorithm
# The Viterbi algorithm is used here for decoding, providing the most likely sequence of hidden states
def predict_from_features(test_features):
    if test_features.shape[0] != X.shape[1]:  # Ensure correct number of features
        raise ValueError(f"Expected {X.shape[1]} features, but got {test_features.shape[0]}")

    # Reshape test features to match HMM input requirements
    test_features = test_features.reshape(1, -1)

    # Viterbi algorithm for decoding the most likely hidden state sequence
    # This outputs the sequence of states most likely responsible for the observed sequence
    try:
        predicted_states = hmm_model.predict(test_features)
        predicted_sequence = ''.join(int_to_char[state] for state in predicted_states if state != -1)
        print("Predicted sequence for the test features:", predicted_sequence)
    except ValueError as e:
        print(f"Prediction error: {e}")


    print("Predicted states:", predicted_states)

    predicted_sequence = ''.join(int_to_char[state] for state in predicted_states if state in int_to_char)

# Example: Use random test features (ensure it has the same length as X's feature dimension)
# Replace `test_features` with actual data for real predictions
test_features = np.random.rand(X.shape[1])  # Random example for testing purposes

# Run prediction function to obtain character sequence from test features
predict_from_features(test_features)


print("State to character mapping:", int_to_char)



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Features file columns: Index(['Unnamed: 0',            0,            1,            2,            3,
                  4,            5,            6,            7,            8,
       ...
                192,          193,          194,          195,          196,
                197,          198,          199,        '0.1', 'image_name'],
      dtype='object', length=203)
Labels file columns: Index(['gt', 'image_name'], dtype='object')
Model successfully trained with Baum-Welch algorithm.
Predicted sequence for the test features: :
Predicted states: [1]
State to character mapping: {0: ' ', 1: ':', 2: '\xa0', 3: 'ം', 4: 'ഃ', 5: 'അ', 6: 'ആ', 7: 'ഇ', 8: 'ഉ', 9: 'എ', 10: 'ഏ', 11: 'ഒ', 12: 'ഓ', 13: 'ക', 14: 'ഖ', 15: 'ഗ', 16: 'ഘ', 17: 'ങ', 18: 'ച', 19: 'ഛ', 20: 'ജ', 21: 'ഝ', 22: 'ഞ', 23: 'ട', 24: 'ഠ', 25: 'ഡ', 26: 'ഢ', 27: 'ണ', 28: 'ത', 29: 'ഥ', 30: 'ദ', 31: 'ധ',

In [None]:
# Install required libraries
!pip install pandas numpy scikit-learn hmmlearn opencv-python matplotlib openpyxl

# Required libraries for handling data and modeling
import numpy as np
import pandas as pd
from hmmlearn import hmm
from sklearn.preprocessing import StandardScaler
from google.colab import drive


# Mount Google Drive
drive.mount('/content/drive')

# After uploading, specify the file names for easy reference
feature_file = '/content/drive/MyDrive/Colab Notebooks/SFR_DCT_LINE 2.xlsx'  # DCT features file (uploaded manually)
label_file = '/content/drive/MyDrive/Colab Notebooks/line_gt_1_1.xlsx'        # Label file (uploaded manually)


# Step 2: Load the feature and label data
# Load the DCT feature data and label data from provided files
features = pd.read_excel(feature_file)
labels = pd.read_excel(label_file)

# Display column names for both datasets
print("Features file columns:", features.columns)
print("Labels file columns:", labels.columns)


# Step 3: Merge features and labels on 'ImageName' for consistent mapping
# This ensures we have corresponding labels for each set of features
merged_data = pd.merge(features, labels, on='image_name')
merged_data.fillna(0, inplace=True)  # Fill NaN values with zeros

# Step 4: Preprocess the data for HMM training
# Extract feature matrix (excluding 'ImageName' and 'gt' columns)
X = merged_data.drop(columns=['image_name', 'gt']).values

# Scale features to mean 0 and variance 1 (to avoid numerical issues in HMM)
scaler = StandardScaler()
# Step 1: Drop non-numeric columns like 'color' if they exist in the features
X = merged_data.select_dtypes(include=[np.number]).values  # Only numeric data

# Step 2: Scale features to mean 0 and variance 1 (to avoid numerical issues in HMM)
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Map unique characters in ground truth ('gt') to integer values for training
unique_chars = sorted(set(''.join(merged_data['gt'].values)))
char_to_int = {char: idx for idx, char in enumerate(unique_chars)}
int_to_char = {idx: char for char, idx in char_to_int.items()}

# Convert 'gt' labels into sequences of integers for training
y_sequences = [[char_to_int[char] for char in label] for label in merged_data['gt'].values]

# Pad sequences to ensure consistent sequence lengths (fill with -1 for padding)
max_sequence_length = max(len(seq) for seq in y_sequences)
y_padded = np.array([seq + [-1] * (max_sequence_length - len(seq)) for seq in y_sequences])

# Step 5: Split the data into training and testing sets
train_size = int(0.8 * len(X))
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y_padded[:train_size], y_padded[train_size:]

# Step 6: Initialize and Train the HMM model using Baum-Welch algorithm
# Here, we define the HMM model with Gaussian emissions
n_states = 6  # Number of hidden states (adjust based on complexity)
hmm_model = hmm.GaussianHMM(n_components=n_states, covariance_type="spherical", n_iter=2000, init_params='')

# Set start and transition probabilities uniformly
hmm_model.startprob_ = np.full(n_states, 1.0 / n_states)
hmm_model.transmat_ = np.full((n_states, n_states), 1.0 / n_states)

# Baum-Welch algorithm for training the HMM
# This will optimize the HMM's parameters (transition and emission probabilities) based on training data
try:
    hmm_model.fit(X_train)
    print("Model successfully trained with Baum-Welch algorithm.")
except Exception as e:
    print(f"Error during training: {e}")

# -------------------------------------------------------
# TEST FEATURE PREDICTION PART (using DCT features)
# -------------------------------------------------------

# Function to predict character sequences using Viterbi algorithm
# The Viterbi algorithm is used here for decoding, providing the most likely sequence of hidden states
def predict_from_features(test_features):
    if test_features.shape[0] != X.shape[1]:  # Ensure correct number of features
        raise ValueError(f"Expected {X.shape[1]} features, but got {test_features.shape[0]}")

    # Reshape test features to match HMM input requirements
    test_features = test_features.reshape(1, -1)

    # Viterbi algorithm for decoding the most likely hidden state sequence
    # This outputs the sequence of states most likely responsible for the observed sequence
    try:
        predicted_states = hmm_model.predict(test_features)
        predicted_sequence = ''.join(int_to_char[state] for state in predicted_states if state != -1)
        print("Predicted sequence for the test features:", predicted_sequence)
    except ValueError as e:
        print(f"Prediction error: {e}")


    print("Predicted states:", predicted_states)

    predicted_sequence = ''.join(int_to_char[state] for state in predicted_states if state in int_to_char)

# Example: Use random test features (ensure it has the same length as X's feature dimension)
# Replace `test_features` with actual data for real predictions
test_features = np.random.rand(X.shape[1])  # Random example for testing purposes

# Run prediction function to obtain character sequence from test features
predict_from_features(test_features)


print("State to character mapping:", int_to_char)



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).




Features file columns: Index(['Unnamed: 0',            0,            1,            2,            3,
                  4,            5,            6,            7,            8,
       ...
                192,          193,          194,          195,          196,
                197,          198,          199,        '0.1', 'image_name'],
      dtype='object', length=203)
Labels file columns: Index(['gt', 'image_name'], dtype='object')
Model successfully trained with Baum-Welch algorithm.
Predicted sequence for the test features: ം
Predicted states: [3]
State to character mapping: {0: ' ', 1: ':', 2: '\xa0', 3: 'ം', 4: 'ഃ', 5: 'അ', 6: 'ആ', 7: 'ഇ', 8: 'ഉ', 9: 'എ', 10: 'ഏ', 11: 'ഒ', 12: 'ഓ', 13: 'ക', 14: 'ഖ', 15: 'ഗ', 16: 'ഘ', 17: 'ങ', 18: 'ച', 19: 'ഛ', 20: 'ജ', 21: 'ഝ', 22: 'ഞ', 23: 'ട', 24: 'ഠ', 25: 'ഡ', 26: 'ഢ', 27: 'ണ', 28: 'ത', 29: 'ഥ', 30: 'ദ', 31: 'ധ', 32: 'ന', 33: 'പ', 34: 'ഫ', 35: 'ബ', 36: 'ഭ', 37: 'മ', 38: 'യ', 39: 'ര', 40: 'റ', 41: 'ല', 42: 'ള', 43: 'ഴ', 44: 'വ', 45: 'ശ', 46

In [None]:
# Install required libraries
!pip install pandas numpy scikit-learn hmmlearn opencv-python matplotlib openpyxl

# Required libraries for handling data and modeling
import numpy as np
import pandas as pd
from hmmlearn import hmm
from sklearn.preprocessing import StandardScaler
from google.colab import drive


# Mount Google Drive
drive.mount('/content/drive')

# Specify file names for easy reference
feature_file = '/content/drive/MyDrive/Colab Notebooks/SFR_DCT_LINE 2.xlsx'  # DCT features file (uploaded manually)
label_file = '/content/drive/MyDrive/Colab Notebooks/line_gt_1_1.xlsx'        # Label file (uploaded manually)


# Step 1: Load the feature and label data
features = pd.read_excel(feature_file)
labels = pd.read_excel(label_file)

# Display column names for both datasets
print("Features file columns:", features.columns)
print("Labels file columns:", labels.columns)


# Step 2: Merge features and labels on 'image_name' for consistent mapping
merged_data = pd.merge(features, labels, on='image_name')
merged_data.fillna(0, inplace=True)  # Fill NaN values with zeros

# Step 3: Preprocess the data for HMM training
# Drop 'image_name' and 'gt' columns to get only feature values
X = merged_data.drop(columns=['image_name', 'gt']).select_dtypes(include=[np.number]).values

# Scale features to mean 0 and variance 1 (to avoid numerical issues in HMM)
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Step 4: Map unique characters in 'gt' to integers for training
unique_chars = sorted(set(''.join(merged_data['gt'].values)))
char_to_int = {char: idx for idx, char in enumerate(unique_chars)}
int_to_char = {idx: char for char, idx in char_to_int.items()}

# Convert 'gt' labels into sequences of integers for training
y_sequences = [[char_to_int[char] for char in label] for label in merged_data['gt'].values]

# Pad sequences to ensure consistent sequence lengths (fill with -1 for padding)
max_sequence_length = max(len(seq) for seq in y_sequences)
y_padded = np.array([seq + [-1] * (max_sequence_length - len(seq)) for seq in y_sequences])

# Step 5: Split the data into training and testing sets
train_size = int(0.8 * len(X))
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y_padded[:train_size], y_padded[train_size:]

# Step 6: Initialize and Train the HMM model using Baum-Welch algorithm
n_states = 20  # Number of hidden states
hmm_model = hmm.GaussianHMM(n_components=n_states, covariance_type="spherical", n_iter=2000, init_params='')

# Set start and transition probabilities uniformly
hmm_model.startprob_ = np.full(n_states, 1.0 / n_states)
hmm_model.transmat_ = np.full((n_states, n_states), 1.0 / n_states)

# Train the HMM model
try:
    hmm_model.fit(X_train)
    print("Model successfully trained with Baum-Welch algorithm.")
except Exception as e:
    print(f"Error during training: {e}")

# -------------------------------------------------------
# Prediction Function for Character Sequences
# -------------------------------------------------------
def predict_from_sequence(test_sequence):
    if test_sequence.shape[1] != X.shape[1]:  # Ensure the feature dimension matches
        raise ValueError(f"Expected {X.shape[1]} features, but got {test_sequence.shape[1]}")

    # Predict the sequence of states for multiple samples
    try:
        predicted_states = hmm_model.predict(test_sequence)
        predicted_sequence = ''.join(int_to_char[state] for state in predicted_states if state in int_to_char)

        # Display results
        print("Predicted sequence for the test sequence:", predicted_sequence)
        print("Predicted states:", predicted_states)
    except ValueError as e:
        print(f"Prediction error: {e}")

# Run the prediction with a test sequence (e.g., first 10 samples of X_test)
test_sequence = X_test[:10]  # Using multiple rows to form a sequence

# Predict and display results
predict_from_sequence(test_sequence)

# Display state-to-character mapping for reference
print("State to character mapping:", int_to_char)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).




Features file columns: Index(['Unnamed: 0',            0,            1,            2,            3,
                  4,            5,            6,            7,            8,
       ...
                192,          193,          194,          195,          196,
                197,          198,          199,        '0.1', 'image_name'],
      dtype='object', length=203)
Labels file columns: Index(['gt', 'image_name'], dtype='object')
Model successfully trained with Baum-Welch algorithm.
Predicted sequence for the test sequence: അഏഎ ം:ം::ം
Predicted states: [ 5 10  9  2  3  1  3  1  1  3]
State to character mapping: {0: ' ', 1: ':', 2: '\xa0', 3: 'ം', 4: 'ഃ', 5: 'അ', 6: 'ആ', 7: 'ഇ', 8: 'ഉ', 9: 'എ', 10: 'ഏ', 11: 'ഒ', 12: 'ഓ', 13: 'ക', 14: 'ഖ', 15: 'ഗ', 16: 'ഘ', 17: 'ങ', 18: 'ച', 19: 'ഛ', 20: 'ജ', 21: 'ഝ', 22: 'ഞ', 23: 'ട', 24: 'ഠ', 25: 'ഡ', 26: 'ഢ', 27: 'ണ', 28: 'ത', 29: 'ഥ', 30: 'ദ', 31: 'ധ', 32: 'ന', 33: 'പ', 34: 'ഫ', 35: 'ബ', 36: 'ഭ', 37: 'മ', 38: 'യ', 39: 'ര', 40: 'റ', 41: 'ല', 4

In [None]:
# Install required libraries
!pip install pandas numpy scikit-learn hmmlearn opencv-python matplotlib openpyxl

# Required libraries for handling data and modeling
import numpy as np
import pandas as pd
from hmmlearn import hmm
from sklearn.preprocessing import StandardScaler
from google.colab import drive


# Mount Google Drive
drive.mount('/content/drive')

# Specify file names for easy reference
feature_file = '/content/drive/MyDrive/Colab Notebooks/SFR_DCT_LINE 2.xlsx'  # DCT features file (uploaded manually)
label_file = '/content/drive/MyDrive/Colab Notebooks/line_gt_1_1.xlsx'        # Label file (uploaded manually)


# Step 1: Load the feature and label data
features = pd.read_excel(feature_file)
labels = pd.read_excel(label_file)

# Display column names for both datasets
print("Features file columns:", features.columns)
print("Labels file columns:", labels.columns)


# Step 2: Merge features and labels on 'image_name' for consistent mapping
merged_data = pd.merge(features, labels, on='image_name')
merged_data.fillna(0, inplace=True)  # Fill NaN values with zeros

# Step 3: Preprocess the data for HMM training
# Drop 'image_name' and 'gt' columns to get only feature values
X = merged_data.drop(columns=['image_name', 'gt']).select_dtypes(include=[np.number]).values

# Scale features to mean 0 and variance 1 (to avoid numerical issues in HMM)
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Step 4: Map unique characters in 'gt' to integers for training
unique_chars = sorted(set(''.join(merged_data['gt'].values)))
char_to_int = {char: idx for idx, char in enumerate(unique_chars)}
int_to_char = {idx: char for char, idx in char_to_int.items()}

# Convert 'gt' labels into sequences of integers for training
y_sequences = [[char_to_int[char] for char in label] for label in merged_data['gt'].values]

# Pad sequences to ensure consistent sequence lengths (fill with -1 for padding)
max_sequence_length = max(len(seq) for seq in y_sequences)
y_padded = np.array([seq + [-1] * (max_sequence_length - len(seq)) for seq in y_sequences])

# Step 5: Split the data into training and testing sets
train_size = int(0.8 * len(X))
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y_padded[:train_size], y_padded[train_size:]

# Step 6: Initialize and Train the HMM model using Baum-Welch algorithm
n_states = 20  # Number of hidden states
hmm_model = hmm.GaussianHMM(n_components=n_states, covariance_type="spherical", n_iter=2000, init_params='')

# Set start and transition probabilities uniformly
hmm_model.startprob_ = np.full(n_states, 1.0 / n_states)
hmm_model.transmat_ = np.full((n_states, n_states), 1.0 / n_states)

# Train the HMM model
try:
    hmm_model.fit(X_train)
    print("Model successfully trained with Baum-Welch algorithm.")
except Exception as e:
    print(f"Error during training: {e}")

# -------------------------------------------------------
# Prediction Function for Character Sequences
# -------------------------------------------------------
def predict_from_sequence(test_sequence):
    if test_sequence.shape[1] != X.shape[1]:  # Ensure the feature dimension matches
        raise ValueError(f"Expected {X.shape[1]} features, but got {test_sequence.shape[1]}")

    # Predict the sequence of states for multiple samples
    try:
        predicted_states = hmm_model.predict(test_sequence)
        predicted_sequence = ''.join(int_to_char[state] for state in predicted_states if state in int_to_char)

        # Display results
        print("Predicted sequence for the test sequence:", predicted_sequence)
        print("Predicted states:", predicted_states)
    except ValueError as e:
        print(f"Prediction error: {e}")

# Run the prediction with a test sequence (e.g., first 10 samples of X_test)
test_sequence = X_test[:20]  # Using multiple rows to form a sequence

# Predict and display results
predict_from_sequence(test_sequence)

# Display state-to-character mapping for reference
print("State to character mapping:", int_to_char)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Features file columns: Index(['Unnamed: 0',            0,            1,            2,            3,
                  4,            5,            6,            7,            8,
       ...
                192,          193,          194,          195,          196,
                197,          198,          199,        '0.1', 'image_name'],
      dtype='object', length=203)
Labels file columns: Index(['gt', 'image_name'], dtype='object')
Model successfully trained with Baum-Welch algorithm.
Predicted sequence for the test sequence: ഒംഘക ഛഎഅഖഖഖഖഖഖ:ഃ
Predicted states: [11  3 16 13  2 19  9  5 14 14 14 14 14 14  1  4]
State to character mapping: {0: ' ', 1: ':', 2: '\xa0', 3: 'ം', 4: 'ഃ', 5: 'അ', 6: 'ആ', 7: 'ഇ', 8: 'ഉ', 9: 'എ', 10: 'ഏ', 11: 'ഒ', 12: 'ഓ', 13: 'ക', 14: 'ഖ', 15: 'ഗ', 16: 'ഘ', 17: 'ങ', 18: 'ച', 19: 'ഛ', 20: 'ജ', 21: 'ഝ', 22: 'ഞ', 23: 'ട', 24: 'ഠ', 2

In [None]:
print(test_sequence)

NameError: name 'test_sequence' is not defined

In [1]:
# Install required libraries
!pip install pandas numpy scikit-learn hmmlearn opencv-python matplotlib openpyxl

# Required libraries for handling data and modeling
import numpy as np
import pandas as pd
from hmmlearn import hmm
from sklearn.preprocessing import StandardScaler
from google.colab import drive


# Mount Google Drive
drive.mount('/content/drive')

# Specify file names for easy reference
feature_file = '/content/drive/MyDrive/Colab Notebooks/SFR_DCT_LINE 2.xlsx'  # DCT features file (uploaded manually)
label_file = '/content/drive/MyDrive/Colab Notebooks/line_gt_1_1.xlsx'        # Label file (uploaded manually)


# Step 1: Load the feature and label data
features = pd.read_excel(feature_file)
labels = pd.read_excel(label_file)

# Display column names for both datasets
print("Features file columns:", features.columns)
print("Labels file columns:", labels.columns)


# Step 2: Merge features and labels on 'image_name' for consistent mapping
merged_data = pd.merge(features, labels, on='image_name')
merged_data.fillna(0, inplace=True)  # Fill NaN values with zeros

# Step 3: Preprocess the data for HMM training
# Drop 'image_name' and 'gt' columns to get only feature values
X = merged_data.drop(columns=['image_name', 'gt']).select_dtypes(include=[np.number]).values

# Scale features to mean 0 and variance 1 (to avoid numerical issues in HMM)
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Step 4: Map unique characters in 'gt' to integers for training
unique_chars = sorted(set(''.join(merged_data['gt'].values)))
char_to_int = {char: idx for idx, char in enumerate(unique_chars)}
int_to_char = {idx: char for char, idx in char_to_int.items()}

# Convert 'gt' labels into sequences of integers for training
y_sequences = [[char_to_int[char] for char in label] for label in merged_data['gt'].values]

# Pad sequences to ensure consistent sequence lengths (fill with -1 for padding)
max_sequence_length = max(len(seq) for seq in y_sequences)
y_padded = np.array([seq + [-1] * (max_sequence_length - len(seq)) for seq in y_sequences])

# Step 5: Split the data into training and testing sets
train_size = int(0.8 * len(X))
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y_padded[:train_size], y_padded[train_size:]

# Step 6: Initialize and Train the HMM model using Baum-Welch algorithm
n_states = 20  # Number of hidden states
hmm_model = hmm.GaussianHMM(n_components=n_states, covariance_type="spherical", n_iter=2000, init_params='')

# Set start and transition probabilities uniformly
hmm_model.startprob_ = np.full(n_states, 1.0 / n_states)
hmm_model.transmat_ = np.full((n_states, n_states), 1.0 / n_states)

# Train the HMM model
try:
    hmm_model.fit(X_train)
    print("Model successfully trained with Baum-Welch algorithm.")
except Exception as e:
    print(f"Error during training: {e}")

# -------------------------------------------------------
# Prediction Function for Character Sequences
# -------------------------------------------------------
def predict_from_sequence(test_sequence):
    if test_sequence.shape[1] != X.shape[1]:  # Ensure the feature dimension matches
        raise ValueError(f"Expected {X.shape[1]} features, but got {test_sequence.shape[1]}")

    # Predict the sequence of states for multiple samples
    try:
        predicted_states = hmm_model.predict(test_sequence)
        predicted_sequence = ''.join(int_to_char[state] for state in predicted_states if state in int_to_char)

        # Display results
        print("Predicted sequence for the test sequence:", predicted_sequence)
        print("Predicted states:", predicted_states)
    except ValueError as e:
        print(f"Prediction error: {e}")

# Run the prediction with a test sequence (e.g., first 10 samples of X_test)
test_sequence = X_test[:20]  # Using multiple rows to form a sequence

# Predict and display results
predict_from_sequence(test_sequence)

# Display state-to-character mapping for reference
print("State to character mapping:", int_to_char)


Collecting hmmlearn
  Downloading hmmlearn-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.9 kB)
Downloading hmmlearn-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (161 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m161.1/161.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: hmmlearn
Successfully installed hmmlearn-0.3.2
Mounted at /content/drive
Features file columns: Index(['Unnamed: 0',            0,            1,            2,            3,
                  4,            5,            6,            7,            8,
       ...
                192,          193,          194,          195,          196,
                197,          198,          199,        '0.1', 'image_name'],
      dtype='object', length=203)
Labels file columns: Index(['gt', 'image_name'], dtype='object')
Model successfully trained with Baum-Welch algorithm.
Predicted sequence for the test sequence: ഘംക:എഅഅഅഅഅഅ

In [None]:
# Install required libraries
!pip install pandas numpy scikit-learn hmmlearn opencv-python matplotlib openpyxl

# Required libraries for handling data and modeling
import numpy as np
import pandas as pd
from hmmlearn import hmm
from sklearn.preprocessing import StandardScaler
from google.colab import drive


# Mount Google Drive
drive.mount('/content/drive')

# Specify file names for easy reference
feature_file = '/content/drive/MyDrive/Colab Notebooks/SFR_DCT_LINE 2.xlsx'  # DCT features file (uploaded manually)
label_file = '/content/drive/MyDrive/Colab Notebooks/line_gt_1_1.xlsx'        # Label file (uploaded manually)


# Step 1: Load the feature and label data
features = pd.read_excel(feature_file)
labels = pd.read_excel(label_file)

# Display column names for both datasets
print("Features file columns:", features.columns)
print("Labels file columns:", labels.columns)


# Step 2: Merge features and labels on 'image_name' for consistent mapping
merged_data = pd.merge(features, labels, on='image_name')
merged_data.fillna(0, inplace=True)  # Fill NaN values with zeros

# Step 3: Preprocess the data for HMM training
# Drop 'image_name' and 'gt' columns to get only feature values
X = merged_data.drop(columns=['image_name', 'gt']).select_dtypes(include=[np.number]).values

# Scale features to mean 0 and variance 1 (to avoid numerical issues in HMM)
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Step 4: Map unique characters in 'gt' to integers for training
unique_chars = sorted(set(''.join(merged_data['gt'].values)))
char_to_int = {char: idx for idx, char in enumerate(unique_chars)}
int_to_char = {idx: char for char, idx in char_to_int.items()}

# Convert 'gt' labels into sequences of integers for training
y_sequences = [[char_to_int[char] for char in label] for label in merged_data['gt'].values]

# Pad sequences to ensure consistent sequence lengths (fill with -1 for padding)
max_sequence_length = max(len(seq) for seq in y_sequences)
y_padded = np.array([seq + [-1] * (max_sequence_length - len(seq)) for seq in y_sequences])

# Step 5: Split the data into training and testing sets
train_size = int(0.8 * len(X))
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y_padded[:train_size], y_padded[train_size:]

# Step 6: Initialize and Train the HMM model using Baum-Welch algorithm
n_states = 20  # Number of hidden states
hmm_model = hmm.GaussianHMM(n_components=n_states, covariance_type="spherical", n_iter=2000, init_params='')

# Set start and transition probabilities uniformly
hmm_model.startprob_ = np.full(n_states, 1.0 / n_states)
hmm_model.transmat_ = np.full((n_states, n_states), 1.0 / n_states)

# Train the HMM model
try:
    hmm_model.fit(X_train)
    print("Model successfully trained with Baum-Welch algorithm.")
except Exception as e:
    print(f"Error during training: {e}")


# -------------------------------------------------------
# Adjusted Prediction Function for Full Sequence Row with 201 Timesteps
# -------------------------------------------------------
def predict_full_sequence(test_row):
    # Reshape each row into 201 timesteps with 1 feature each
    n_timesteps = 201  # Matches the number of features in each row
    test_sequence = test_row.reshape(n_timesteps, 1)

    # Predict the sequence of states for each timestep in the test sequence
    try:
        predicted_states = hmm_model.predict(test_sequence)

        # Map each predicted state to its character to form a full sequence
        predicted_sequence = ''.join(int_to_char[state] for state in predicted_states if state in int_to_char)

        # Display results
        print("Predicted sequence for the test row:", predicted_sequence)
        print("Predicted states:", predicted_states)
    except ValueError as e:
        print(f"Prediction error: {e}")

# Run the prediction with a single row (e.g., the first sample of X_test)
test_row = X_test[2]  # Using one row as the test feature

# Predict and display results
predict_full_sequence(test_row)

# Display state-to-character mapping for reference
print("State to character mapping:", int_to_char)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).




Features file columns: Index(['Unnamed: 0',            0,            1,            2,            3,
                  4,            5,            6,            7,            8,
       ...
                192,          193,          194,          195,          196,
                197,          198,          199,        '0.1', 'image_name'],
      dtype='object', length=203)
Labels file columns: Index(['gt', 'image_name'], dtype='object')
Model successfully trained with Baum-Welch algorithm.
Predicted sequence for the test row: ഘഇഉആഃഛഎഎഎഎഎഎഎഎംംംഎഎംഎഎഎഎഎംഎഎംംഎഎഎഎഎംഎഎഎഎഎഎഎഎഎംംഎംഎഎംംഎഎഎഎഎംംഎഎഎഎഎഎംംഎഎഎഎഎംഎഎംഎഎഎഎഎഎഎഎഎഎഎഎഎഎംംഎഎഎഎഎഎഎഎഎഎഎംഎഎഎഎഎഎഎഎഎംഎഎഎഎഎംംഎഎഎഎഎഎഎഎഎഎഎഎഎഎഎഎഎഎഎംഎഎഎഎംഎഎഎഎഎഎംംഎഎഎഎഎഎഎഎഎഎഎഎഎഎഎഎഎഎഎഎഎഎഎഎഎഎഎഎഎഎംഎഎഎഎംംഎഎഎഎഎഎഎഎം
Predicted states: [16  7  8  6  4 19  9  9  9  9  9  9  9  9  3  3  3  9  9  3  9  9  9  9
  9  3  9  9  3  3  9  9  9  9  9  3  9  9  9  9  9  9  9  9  9  3  3  9
  3  9  9  3  3  9  9  9  9  9  3  3  9  9  9  9  9  9  3  3  9  9  9  9
  9  3  9  9  3  9  9  9  9 