<a href="https://colab.research.google.com/github/maryamelnahas/CSEN711-MS1-Weir-Algorithm/blob/main/BINF711_MS1_Weir_Algorithm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import re
from collections import Counter
import io

In [4]:
from google.colab import files

# Open the Colab upload widget; the result maps each saved filename to its raw bytes.
uploaded = files.upload()

# Report the name and size of every file received.
for fn, payload in uploaded.items():
  print(f'User uploaded file "{fn}" with length {len(payload)} bytes')

Saving production.tsv to production (1).tsv
User uploaded file "production (1).tsv" with length 11818456 bytes


In [6]:


def load_and_clean_data(file_name: str) -> list:
    """Load a tab-separated password file and return its cleaned password column.

    The first column of the file is treated as the password column. Every
    field is read as text, so all-digit passwords keep their leading zeros and
    literal passwords such as "null" or "NA" are not mistaken for missing
    values. Rows whose password field is empty are dropped, and leading and
    trailing whitespace is stripped from the remaining entries. Duplicates are
    kept.

    If the first parse fails with a tokenizing or decoding error, the file is
    re-read as UTF-8 (undecodable bytes ignored), the marker '###COMMA###' and
    all quote characters are removed, and the cleaned text is parsed again.

    Args:
        file_name: Path to the TSV file; its first row is the header.

    Returns:
        The cleaned passwords as a list of strings, in file order.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        Other pandas parsing errors (such as an empty file) propagate unchanged.
    """
    print(f"\n--- Starting Data Processing for '{file_name}' ---")

    # --- 1. Data Loading ---
    # Read everything as text and treat only truly empty fields as missing.
    # pandas' defaults would turn passwords like "null"/"NA"/"nan" into NaN and
    # parse all-digit passwords as integers (dropping leading zeros).
    read_options = {
        "sep": "\t",
        "dtype": str,
        "keep_default_na": False,
        "na_values": [""],
    }

    try:
        df = pd.read_csv(file_name, **read_options)
    except (pd.errors.ParserError, UnicodeDecodeError) as e:
        # Only parse/decoding failures can be helped by the cleanup below;
        # anything else (missing file, empty file, ...) is left to propagate.
        print(f"Initial pandas read failed: {e}")
        # Robust handling for files with internal encoding/quoting issues
        print("Attempting robust load by cleaning potential embedded characters...")

        # Read the file content manually, skipping bytes that are not valid UTF-8.
        with open(file_name, 'r', encoding='utf-8', errors='ignore') as f:
            content = f.read()

        # Remove the '###COMMA###' marker and any quote characters that might
        # interfere with TSV parsing.
        # NOTE(review): '###COMMA###' looks like a placeholder for a literal
        # comma; it is deleted here rather than restored -- confirm intent.
        content_cleaned = content.replace('###COMMA###', '').replace('"', '').replace("'", "")

        # Use io.StringIO to read the cleaned string data into pandas
        df = pd.read_csv(io.StringIO(content_cleaned), **read_options)

    print(f"Initial rows loaded: {len(df)}")

    # --- 2. Data Cleaning ---

    # Assuming the password column is the first one based on the TSV header:
    password_col_name = df.columns[0]
    print(f"Identified password column: '{password_col_name}'")

    # 2a. Handle Missing Values (without mutating the loaded frame in place)
    passwords = df[password_col_name].dropna()
    dropped_rows = len(df) - len(passwords)

    if dropped_rows:
        print(f"Dropped {dropped_rows} rows with missing passwords.")

    # 2b. Normalise to Strings and Trim Whitespace
    # No deduplication is performed: PCFG training counts pattern frequencies,
    # so repeated passwords are kept. Only surrounding whitespace is removed.
    password_list = passwords.astype(str).str.strip().tolist()

    print("--- Data Loading and Cleaning Complete ---")
    print(f"Total cleaned passwords for training: {len(password_list)}")
    print(f"First 5 sample passwords: {password_list[:5]}")

    return password_list


# Colab's upload widget renames a file when one with the same name already
# exists (e.g. "production.tsv" is saved as "production (1).tsv"), so a
# hard-coded name can silently load a stale copy. Prefer the file uploaded in
# this session; fall back to the default name when nothing was uploaded.
DEFAULT_DATASET = "production.tsv"
Dataset = next(iter(globals().get("uploaded", {})), DEFAULT_DATASET)

training_passwords = load_and_clean_data(Dataset)


--- Starting Data Processing for 'production.tsv' ---
Initial rows loaded: 300000
Identified password column: 'password'
--- Data Loading and Cleaning Complete ---
Total cleaned passwords for training: 300000
First 5 sample passwords: ['5Ka392f0c29', '97DMs07gBQ3qP9Hwf3YxJxY3yFvoh1W9b6HrmSNfOB9rjUIZJJX7g0mrv0Rlv51r184odr317jq0df3MBIJ4B07kMK7JCx4QG7', 'qD2bN8VmEB', 'rJu42Kx', 'YloGVMJoL']
