In [2]:
import pandas as pd
import re
from sklearn.preprocessing import LabelEncoder

# Read the source workbook into a DataFrame (pandas reads the first sheet by default)
input_path = "C:/Users/ganes/OneDrive/Desktop/AI/Drug/Sequence/Sequence.xlsx"
df = pd.read_excel(input_path)

# Function to clean Chromosome column
def clean_chromosome(chrom):
    """Normalize a raw chromosome label to one of the labels this notebook encodes.

    The value is converted to ``str`` and must *start* with ``chr`` followed by
    a chromosome number or ``X``/``Y``. Anything after that prefix is ignored,
    so a label such as ``"chr1_KI270706v1_random"`` is reduced to ``"chr1"``.

    Parameters
    ----------
    chrom : Any
        Raw cell value from the ``Chromosome`` column (non-strings are
        stringified first, so ``None``/``NaN``/numbers are handled).

    Returns
    -------
    str
        ``"chr1"`` .. ``"chr22"``, ``"chrX"`` or ``"chrY"`` when the value
        begins with one of those labels; ``"Other"`` for everything else.
    """
    match = re.match(r"chr([0-9]+|X|Y)", str(chrom))
    if match is None:
        return "Other"

    token = match.group(1)
    if token in ("X", "Y"):
        return match.group(0)

    # Only chr1..chr22 have a code in `chromosome_order`. Labels such as
    # chr0, chr23, chr100 or zero-padded chr01 previously passed through
    # unchanged and then became NaN when mapped, so route them to "Other".
    if len(token) <= 2 and token[0] != "0" and 1 <= int(token) <= 22:
        return match.group(0)
    return "Other"

# Collapse every raw chromosome label via clean_chromosome
df["Chromosome"] = df["Chromosome"].apply(clean_chromosome)

# Integer code for each label: chr1..chr22 keep their own number,
# followed by chrX, chrY and the catch-all "Other"
chromosome_order = {
    **{f"chr{number}": number for number in range(1, 23)},
    "chrX": 23,
    "chrY": 24,
    "Other": 25,
}

# Look up the code for each row; labels missing from the table become NaN
df["Chromosome_Encoded"] = df["Chromosome"].map(chromosome_order)

# Keep only the columns whose name does not contain "Unnamed"
df = df.loc[:, ~df.columns.str.contains('Unnamed')]

# Replace each categorical column with integer codes; the fitted encoder for
# every column is kept in `label_encoders` so codes can be inverted later.
# Values are stringified first, so missing values are encoded as their
# string form rather than raising.
label_encoders = {}
for col in ['Feature', 'Strand', 'Gene_ID', 'Transcript_ID']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

# Span of each row: 'End' minus 'Start'
# NOTE(review): if Start/End are 1-based inclusive coordinates, the true
# length would be End - Start + 1 — confirm against the data source.
df['Length'] = df['End'] - df['Start']

# Write the processed table to a new workbook, omitting the DataFrame index
output_path = "C:/Users/ganes/OneDrive/Desktop/AI/Drug/Sequence/Sequence_Filtered_Cleaned.xlsx"
df.to_excel(excel_writer=output_path, index=False)

print(f"File saved successfully at: {output_path}")


File saved successfully at: C:/Users/ganes/OneDrive/Desktop/AI/Drug/Sequence/Sequence_Filtered_Cleaned.xlsx
