In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import re

# Load and label your data files
# NOTE: You must have uploaded these files to Colab first
# Every file is read with explicit `names=`, i.e. the first line is treated as data.
# NOTE(review): if any of these files starts with a header line, that line ends up
# as a labelled training row — confirm against the actual files.

# Only `value` is used downstream, so rows are dropped on that column alone;
# a bare dropna() would also discard phones whose `format_type` is missing.
df_phone = pd.read_csv('phone.csv', names=['value', 'format_type']).dropna(subset=['value'])
df_phone['label'] = 'Phone Number'

df_company = pd.read_csv('company.csv', names=['value']).dropna()
df_company['label'] = 'Company Name'

df_country = pd.read_csv('countries.txt', names=['value']).dropna()
df_country['label'] = 'Country'

df_dates = pd.read_csv('dates.csv', names=['value']).dropna()
df_dates['label'] = 'Date'

# Combine all data into one DataFrame (phone keeps only the two shared columns)
training_data = pd.concat([df_phone[['value', 'label']], df_company, df_country, df_dates], ignore_index=True)
# Shuffle deterministically so the later train/validation split is reproducible
training_data = training_data.sample(frac=1, random_state=42).reset_index(drop=True)

print("Combined Training Data:")
print(training_data.head())

# Feature Engineering
# Create a robust set of features from the text data
def create_features(df):
    """Add character-level feature columns computed from ``df['value']``.

    Each value is converted with ``str`` before it is inspected. The new
    columns are written onto ``df`` itself and the same frame is returned.
    """
    # Column name -> function of the stringified value, in output column order
    extractors = {
        'text_length': len,
        'num_digits': lambda text: sum(ch.isdigit() for ch in text),
        'num_letters': lambda text: sum(ch.isalpha() for ch in text),
        'num_spaces': lambda text: sum(ch.isspace() for ch in text),
        'has_plus': lambda text: '+' in text,
        'has_paren': lambda text: '(' in text or ')' in text,
        'has_hyphen': lambda text: '-' in text,
        'has_slash': lambda text: '/' in text,
        'has_dot': lambda text: '.' in text,
    }
    for column_name, extract in extractors.items():
        # Bind `extract` as a default so each lambda keeps its own function
        df[column_name] = df['value'].apply(lambda raw, fn=extract: fn(str(raw)))
    return df

# Compute features on a copy, so `training_data` itself keeps only its raw columns
training_data_features = create_features(training_data.copy())

# Separate features (X) and labels (y)
feature_cols = [
    'text_length', 'num_digits', 'num_letters', 'num_spaces',
    'has_plus', 'has_paren', 'has_hyphen', 'has_slash', 'has_dot',
]
X, y = training_data_features[feature_cols], training_data_features['label']

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("\nFeatures Created:")
print(X_train.head())

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import joblib

# Fit the classifier on the training split.
# max_iter is raised so the solver has room to converge on a large dataset;
# `fit` returns the estimator itself, so the result can be bound directly.
model = LogisticRegression(max_iter=5000).fit(X_train, y_train)

# Score the held-out validation rows
y_pred = model.predict(X_val)
print("Classification Report:")
print(classification_report(y_val, y_pred))

# Persist the estimator and the ordered feature names for later use in predict.py
joblib.dump(value=model, filename='semantic_model.pkl')
joblib.dump(value=feature_cols, filename='feature_cols.pkl')

print("\nModel and feature list saved as 'semantic_model.pkl' and 'feature_cols.pkl'")

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import numpy as np

# Load the list of countries used by the `is_in_country_list` feature.
# Missing entries are dropped and every name goes through str() before lower-casing,
# so a NaN or numeric cell can no longer raise AttributeError on `.lower()`.
# Names are deliberately NOT stripped: the feature compares `str(x).lower()` as-is.
df_country_list = pd.read_csv('countries.txt', names=['country_name'])
COUNTRY_LIST = {str(name).lower() for name in df_country_list['country_name'].dropna()}

# Load and label your training data
# NOTE: You must have uploaded these files to Colab first
# NOTE(review): `names=` means the first line of each file is treated as data —
# confirm that none of the files carries a header row.

# Only `value` is used downstream, so rows are dropped on that column alone;
# a bare dropna() would also discard phones whose `format_type` is missing.
df_phone = pd.read_csv('phone.csv', names=['value', 'format_type']).dropna(subset=['value'])
df_phone['label'] = 'Phone Number'

df_company = pd.read_csv('company.csv', names=['value']).dropna()
df_company['label'] = 'Company Name'

df_country = pd.read_csv('countries.txt', names=['value']).dropna()
df_country['label'] = 'Country'

df_dates = pd.read_csv('dates.csv', names=['value']).dropna()
df_dates['label'] = 'Date'

# Combine all data into one DataFrame (phone keeps only the two shared columns)
training_data = pd.concat([df_phone[['value', 'label']], df_company, df_country, df_dates], ignore_index=True)
# Shuffle deterministically so the later train/validation split is reproducible
training_data = training_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Feature Engineering
def create_features(df):
    """Append character statistics and a country-lookup flag to ``df``.

    Every feature is derived from ``str(value)`` for each entry of
    ``df['value']``. Columns are added to ``df`` in place; ``df`` is returned.
    """
    def count_where(predicate):
        # Per-value counter of the characters for which `predicate` is true
        return lambda raw: sum(1 for ch in str(raw) if predicate(ch))

    def contains_any(*symbols):
        # Per-value test: does the text contain at least one of `symbols`?
        return lambda raw: any(symbol in str(raw) for symbol in symbols)

    values = df['value']
    df['text_length'] = values.apply(lambda raw: len(str(raw)))
    df['num_digits'] = values.apply(count_where(str.isdigit))
    df['num_letters'] = values.apply(count_where(str.isalpha))
    df['num_spaces'] = values.apply(count_where(str.isspace))
    df['has_plus'] = values.apply(contains_any('+'))
    df['has_paren'] = values.apply(contains_any('(', ')'))
    df['has_hyphen'] = values.apply(contains_any('-'))
    df['has_slash'] = values.apply(contains_any('/'))
    df['has_dot'] = values.apply(contains_any('.'))
    # Exact, case-insensitive membership in the module-level COUNTRY_LIST set
    df['is_in_country_list'] = values.apply(lambda raw: str(raw).lower() in COUNTRY_LIST)
    return df

# Compute features on a copy, so `training_data` itself keeps only its raw columns
training_data_features = create_features(training_data.copy())

# Separate features (X) and labels (y)
feature_cols = [
    'text_length', 'num_digits', 'num_letters', 'num_spaces',
    'has_plus', 'has_paren', 'has_hyphen', 'has_slash', 'has_dot',
    'is_in_country_list',
]
X, y = training_data_features[feature_cols], training_data_features['label']

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("\nFeatures Created (with new 'is_in_country_list' feature):")
print(X_train.head())

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import joblib

# Fit the classifier on the current training split
# (`fit` returns the estimator itself, so the result can be bound directly)
model = LogisticRegression(max_iter=5000).fit(X_train, y_train)

# Score the held-out validation rows
y_pred = model.predict(X_val)
print("Classification Report:")
print(classification_report(y_val, y_pred))

# Overwrite the saved estimator and ordered feature names
joblib.dump(value=model, filename='semantic_model.pkl')
joblib.dump(value=feature_cols, filename='feature_cols.pkl')

print("\nUpdated model and feature list saved.")

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import joblib

# Re-fit the classifier on whatever X_train / y_train the kernel currently holds.
# NOTE(review): this cell performs the same steps as the training cell directly
# before it — confirm whether both are needed.
model = LogisticRegression(max_iter=5000)
model = model.fit(X_train, y_train)

# Score the held-out validation rows
y_pred = model.predict(X_val)
print("Classification Report:")
print(classification_report(y_val, y_pred))

# Overwrite the saved estimator and ordered feature names
joblib.dump(value=model, filename='semantic_model.pkl')
joblib.dump(value=feature_cols, filename='feature_cols.pkl')

print("\nUpdated model and feature list saved.")

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import joblib
import re
import numpy as np

# ===============================
# 1. Load Data and Create Features
# ===============================

# Your complete country list (as provided)
COUNTRIES = {
    "afghanistan", "albania", "algeria", "andorra", "angola", "antigua and barbuda",
    "argentina", "armenia", "aruba", "australia", "austria", "azerbaijan",
    "bahamas", "bahrain", "bangladesh", "barbados", "belarus", "belgium",
    "belize", "benin", "bhutan", "bolivia", "bosnia and herzegovina", "botswana",
    "brazil", "brunei", "bulgaria", "burkina faso", "burma", "burundi",
    "cambodia", "cameroon", "canada", "cape verde", "central african republic",
    "chad", "chile", "china", "colombia", "comoros", "costa rica", "cote d'ivoire",
    "croatia", "cuba", "curacao", "cyprus", "czech republic",
    "democratic republic of the congo", "denmark", "djibouti", "dominica",
    "dominican republic", "east timor", "ecuador", "egypt", "el salvador",
    "equatorial guinea", "eritrea", "estonia", "ethiopia", "fiji", "finland",
    "france", "gabon", "gambia", "georgia", "germany", "ghana", "greece",
    "grenada", "guatemala", "guinea", "guinea bissau", "guyana", "haiti",
    "holy see", "honduras", "hong kong", "hungary", "iceland", "india",
    "indonesia", "iran", "iraq", "ireland", "israel", "italy", "jamaica",
    "japan", "jordan", "kazakhstan", "kenya", "kiribati", "kosovo", "kuwait",
    "kyrgyzstan", "laos", "latvia", "lebanon", "lesotho", "liberia", "libya",
    "liechtenstein", "lithuania", "luxembourg", "macau", "macedonia",
    "madagascar", "malawi", "malaysia", "maldives", "mali", "malta",
    "marshall islands", "mauritania", "mauritius", "mexico", "micronesia",
    "moldova", "monaco", "mongolia", "montenegro", "morocco", "mozambique",
    "namibia", "nauru", "nepal", "netherlands", "netherlands antilles",
    "new zealand", "nicaragua", "niger", "nigeria", "north korea", "norway",
    "oman", "pakistan", "palau", "palestinian territories", "panama",
    "papua new guinea", "paraguay", "peru", "philippines", "poland", "portugal",
    "qatar", "republic of the congo", "romania", "russia", "rwanda",
    "saint kitts and nevis", "saint lucia", "saint vincent and the grenadines",
    "samoa", "san marino", "sao tome and principe", "saudi arabia", "senegal",
    "serbia", "seychelles", "sierra leone", "singapore", "sint maarten",
    "slovakia", "slovenia", "solomon islands", "somalia", "south africa",
    "south korea", "south sudan", "spain", "sri lanka", "sudan", "suriname",
    "swaziland", "sweden", "switzerland", "syria", "taiwan", "tajikistan",
    "tanzania", "thailand", "timor leste", "togo", "tonga", "trinidad and tobago",
    "tunisia", "turkey", "turkmenistan", "tuvalu", "uganda", "ukraine",
    "united arab emirates", "united kingdom", "united states", "uruguay",
    "uzbekistan", "vanuatu", "venezuela", "vietnam", "yemen", "zambia", "zimbabwe"
}

# Load and label your training data
# NOTE(review): `names=` means the first line of each file is treated as data —
# confirm that none of the files carries a header row.

# Only `value` is used downstream, so rows are dropped on that column alone;
# a bare dropna() would also discard phones whose `format_type` is missing.
df_phone = pd.read_csv('phone.csv', names=['value', 'format_type']).dropna(subset=['value'])
df_phone['label'] = 'Phone Number'
df_company = pd.read_csv('company.csv', names=['value']).dropna()
df_company['label'] = 'Company Name'
df_country = pd.read_csv('countries.txt', names=['value']).dropna()
df_country['label'] = 'Country'
df_dates = pd.read_csv('dates.csv', names=['value']).dropna()
df_dates['label'] = 'Date'

# Combine all data into one DataFrame (phone keeps only the two shared columns)
training_data = pd.concat([df_phone[['value', 'label']], df_company, df_country, df_dates], ignore_index=True)
# Shuffle deterministically so the later train/validation split is reproducible
training_data = training_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Feature Engineering
def create_features(df):
    """Add character-level features and a country flag computed from ``df['value']``.

    Each value is converted with ``str`` before it is inspected. The new
    columns are written onto ``df`` itself and the same frame is returned.
    """
    # Column name -> function of the stringified value, in output column order
    extractors = {
        'text_length': len,
        'num_digits': lambda text: sum(ch.isdigit() for ch in text),
        'num_letters': lambda text: sum(ch.isalpha() for ch in text),
        'num_spaces': lambda text: sum(ch.isspace() for ch in text),
        'has_plus': lambda text: '+' in text,
        'has_paren': lambda text: '(' in text or ')' in text,
        'has_hyphen': lambda text: '-' in text,
        'has_slash': lambda text: '/' in text,
        'has_dot': lambda text: '.' in text,
        # Exact, case-insensitive membership in the module-level COUNTRIES set
        'is_in_country_list': lambda text: text.lower() in COUNTRIES,
    }
    for column_name, extract in extractors.items():
        # Bind `extract` as a default so each lambda keeps its own function
        df[column_name] = df['value'].apply(lambda raw, fn=extract: fn(str(raw)))
    return df

# Compute features on a copy, so `training_data` itself keeps only its raw columns
training_data_features = create_features(training_data.copy())

# Separate features (X) and labels (y)
feature_cols = [
    'text_length', 'num_digits', 'num_letters', 'num_spaces',
    'has_plus', 'has_paren', 'has_hyphen', 'has_slash', 'has_dot',
    'is_in_country_list',
]
X, y = training_data_features[feature_cols], training_data_features['label']

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# ===============================
# 2. Train and Evaluate the Model
# ===============================

# Fit the classifier (`fit` returns the estimator itself)
model = LogisticRegression(max_iter=5000).fit(X_train, y_train)

# Score the held-out validation rows
y_pred = model.predict(X_val)
print("Final Classification Report:")
print(classification_report(y_val, y_pred))

# Overwrite the saved estimator and ordered feature names
joblib.dump(value=model, filename='semantic_model.pkl')
joblib.dump(value=feature_cols, filename='feature_cols.pkl')

print("\nUpdated model and feature list saved.")

In [None]:
%%writefile predict.py

import sys
import os
import argparse
import pandas as pd
import joblib
import re

# ===============================
# Country list (directly encoded)
# ===============================
COUNTRIES = {
    "afghanistan", "albania", "algeria", "andorra", "angola", "antigua and barbuda",
    "argentina", "armenia", "aruba", "australia", "austria", "azerbaijan",
    "bahamas", "bahrain", "bangladesh", "barbados", "belarus", "belgium",
    "belize", "benin", "bhutan", "bolivia", "bosnia and herzegovina", "botswana",
    "brazil", "brunei", "bulgaria", "burkina faso", "burma", "burundi",
    "cambodia", "cameroon", "canada", "cape verde", "central african republic",
    "chad", "chile", "china", "colombia", "comoros", "costa rica", "cote d'ivoire",
    "croatia", "cuba", "curacao", "cyprus", "czech republic",
    "democratic republic of the congo", "denmark", "djibouti", "dominica",
    "dominican republic", "east timor", "ecuador", "egypt", "el salvador",
    "equatorial guinea", "eritrea", "estonia", "ethiopia", "fiji", "finland",
    "france", "gabon", "gambia", "georgia", "germany", "ghana", "greece",
    "grenada", "guatemala", "guinea", "guinea bissau", "guyana", "haiti",
    "holy see", "honduras", "hong kong", "hungary", "iceland", "india",
    "indonesia", "iran", "iraq", "ireland", "israel", "italy", "jamaica",
    "japan", "jordan", "kazakhstan", "kenya", "kiribati", "kosovo", "kuwait",
    "kyrgyzstan", "laos", "latvia", "lebanon", "lesotho", "liberia", "libya",
    "liechtenstein", "lithuania", "luxembourg", "macau", "macedonia",
    "madagascar", "malawi", "malaysia", "maldives", "mali", "malta",
    "marshall islands", "mauritania", "mauritius", "mexico", "micronesia",
    "moldova", "monaco", "mongolia", "montenegro", "morocco", "mozambique",
    "namibia", "nauru", "nepal", "netherlands", "netherlands antilles",
    "new zealand", "nicaragua", "niger", "nigeria", "north korea", "norway",
    "oman", "pakistan", "palau", "palestinian territories", "panama",
    "papua new guinea", "paraguay", "peru", "philippines", "poland", "portugal",
    "qatar", "republic of the congo", "romania", "russia", "rwanda",
    "saint kitts and nevis", "saint lucia", "saint vincent and the grenadines",
    "samoa", "san marino", "sao tome and principe", "saudi arabia", "senegal",
    "serbia", "seychelles", "sierra leone", "singapore", "sint maarten",
    "slovakia", "slovenia", "solomon islands", "somalia", "south africa",
    "south korea", "south sudan", "spain", "sri lanka", "sudan", "suriname",
    "swaziland", "sweden", "switzerland", "syria", "taiwan", "tajikistan",
    "tanzania", "thailand", "timor leste", "togo", "tonga", "trinidad and tobago",
    "tunisia", "turkey", "turkmenistan", "tuvalu", "uganda", "ukraine",
    "united arab emirates", "united kingdom", "united states", "uruguay",
    "uzbekistan", "vanuatu", "venezuela", "vietnam", "yemen", "zambia", "zimbabwe"
}


# ===============================
# Feature creation (updated)
# ===============================
def create_features(df):
    """Attach the model's input features to ``df``, derived from ``df['value']``.

    The frame is modified in place (one new column per feature) and returned.
    Every value is inspected through its ``str`` representation.
    """
    def char_count(raw, predicate):
        # Number of characters in str(raw) for which `predicate` holds
        return sum(1 for ch in str(raw) if predicate(ch))

    source = df['value']

    # Size / composition of the text
    df['text_length'] = source.apply(lambda raw: len(str(raw)))
    df['num_digits'] = source.apply(lambda raw: char_count(raw, str.isdigit))
    df['num_letters'] = source.apply(lambda raw: char_count(raw, str.isalpha))
    df['num_spaces'] = source.apply(lambda raw: char_count(raw, str.isspace))

    # Presence of individual punctuation marks
    for column_name, symbols in (
        ('has_plus', '+'),
        ('has_paren', '()'),
        ('has_hyphen', '-'),
        ('has_slash', '/'),
        ('has_dot', '.'),
    ):
        df[column_name] = source.apply(
            lambda raw, marks=symbols: any(mark in str(raw) for mark in marks)
        )

    # Exact, case-insensitive membership in the module-level COUNTRIES set
    df['is_in_country_list'] = source.apply(lambda raw: str(raw).lower() in COUNTRIES)
    return df

# ===============================
# New classification function
# ===============================
def _get_model():
    """Return the trained classifier, loading 'semantic_model.pkl' on first use.

    A module-level ``model`` (e.g. the one the ``__main__`` block assigns) is
    reused when present; otherwise the file is loaded and stored under that
    name so later calls do not hit the disk again.
    """
    loaded = globals().get('model')
    if loaded is None:
        # NOTE: joblib.load unpickles the file — only load model files you trust.
        loaded = joblib.load('semantic_model.pkl')
        globals()['model'] = loaded
    return loaded


def classify_column_ml(column: pd.Series, model=None):
    """Classify a whole column by majority vote over per-value predictions.

    Args:
        column: The values to classify; null entries are ignored.
        model: Optional fitted estimator exposing ``predict``. When omitted,
            the module-level model is used and loaded from disk if necessary,
            so the function also works when this module is imported rather
            than run as a script.

    Returns:
        A dict with ``"prediction"`` (the most frequent predicted label) and
        ``"scores"`` (``{label: share of values predicted as that label}``).
        A column without any non-null value yields
        ``{"prediction": "Other", "scores": {}}``.
    """
    df_to_predict = pd.DataFrame({'value': column.dropna()})
    if df_to_predict.empty:
        # Always include "scores" so callers can index it unconditionally
        return {"prediction": "Other", "scores": {}}

    if model is None:
        model = _get_model()

    df_features = create_features(df_to_predict)

    # Correct feature columns must match what the model was trained on
    feature_cols = ['text_length', 'num_digits', 'num_letters', 'num_spaces',
                    'has_plus', 'has_paren', 'has_hyphen', 'has_slash', 'has_dot', 'is_in_country_list']
    X_predict = df_features[feature_cols]

    # Get predictions
    predictions = model.predict(X_predict)

    # Determine the most common prediction and its relative frequency
    prediction_counts = pd.Series(predictions).value_counts(normalize=True)
    most_common_pred = prediction_counts.idxmax()
    # Plain float keeps the printed/serialized score free of NumPy wrappers
    confidence = float(prediction_counts.max())

    return {"prediction": most_common_pred, "scores": {most_common_pred: confidence}}

# ===============================
# Main Execution Logic (remains the same)
# ===============================
if __name__ == "__main__":
    # Command-line entry point: classify one named column of a CSV file
    # and print the majority prediction with its score.
    parser = argparse.ArgumentParser(
        description="Performs semantic classification using a trained ML model."
    )
    parser.add_argument(
        "--input",
        type=str,
        required=True,
        help="The path to the input CSV file."
    )
    parser.add_argument(
        "--column",
        type=str,
        required=True,
        help="The name of the column to classify."
    )

    args = parser.parse_args()

    if not os.path.exists(args.input):
        print(f"Error: The file '{args.input}' does not exist.")
        sys.exit(1)

    try:
        df = pd.read_csv(args.input)
    except Exception as e:
        print(f"Error: Failed to read the file. Details: {e}")
        sys.exit(1)

    # The column is looked up in the header row that read_csv parsed
    if args.column not in df.columns:
        print(f"Error: The column '{args.column}' was not found in the file.")
        sys.exit(1)

    # Load the trained model from the previous steps; it is bound at module
    # level because classify_column_ml reads it from there.
    try:
        model = joblib.load('semantic_model.pkl')
    except FileNotFoundError:
        print("Error: Trained model 'semantic_model.pkl' not found. Please run the training steps first.")
        sys.exit(1)

    result = classify_column_ml(df[args.column])

    print(f"Input File: {args.input}")
    print(f"Column Name: {args.column}")
    print("--- Classification Result (ML) ---")
    print(f"Prediction: {result['prediction']}")
    # An all-null column yields a result without "scores"; indexing it directly
    # would raise KeyError, so fall back to an empty mapping.
    print(f"Scores: {result.get('scores', {})}")

In [None]:
# Example command
!python predict.py --input phone.csv --column number

In [None]:
!python predict.py --input company.csv --column "company"

In [None]:
!python predict.py --input phone.csv --column "Phone Number"

In [None]:
!python predict.py --input phone.csv --column "number"

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import joblib
import re
import numpy as np

# ===============================================
# 1. Load Data and Create Features (Country removed)
# ===============================================

# Your complete country list (used for feature engineering)
COUNTRIES = {
    "afghanistan", "albania", "algeria", "andorra", "angola", "antigua and barbuda",
    "argentina", "armenia", "aruba", "australia", "austria", "azerbaijan",
    "bahamas", "bahrain", "bangladesh", "barbados", "belarus", "belgium",
    "belize", "benin", "bhutan", "bolivia", "bosnia and herzegovina", "botswana",
    "brazil", "brunei", "bulgaria", "burkina faso", "burma", "burundi",
    "cambodia", "cameroon", "canada", "cape verde", "central african republic",
    "chad", "chile", "china", "colombia", "comoros", "costa rica", "cote d'ivoire",
    "croatia", "cuba", "curacao", "cyprus", "czech republic",
    "democratic republic of the congo", "denmark", "djibouti", "dominica",
    "dominican republic", "east timor", "ecuador", "egypt", "el salvador",
    "equatorial guinea", "eritrea", "estonia", "ethiopia", "fiji", "finland",
    "france", "gabon", "gambia", "georgia", "germany", "ghana", "greece",
    "grenada", "guatemala", "guinea", "guinea bissau", "guyana", "haiti",
    "holy see", "honduras", "hong kong", "hungary", "iceland", "india",
    "indonesia", "iran", "iraq", "ireland", "israel", "italy", "jamaica",
    "japan", "jordan", "kazakhstan", "kenya", "kiribati", "kosovo", "kuwait",
    "kyrgyzstan", "laos", "latvia", "lebanon", "lesotho", "liberia", "libya",
    "liechtenstein", "lithuania", "luxembourg", "macau", "macedonia",
    "madagascar", "malawi", "malaysia", "maldives", "mali", "malta",
    "marshall islands", "mauritania", "mauritius", "mexico", "micronesia",
    "moldova", "monaco", "mongolia", "montenegro", "morocco", "mozambique",
    "namibia", "nauru", "nepal", "netherlands", "netherlands antilles",
    "new zealand", "nicaragua", "niger", "nigeria", "north korea", "norway",
    "oman", "pakistan", "palau", "palestinian territories", "panama",
    "papua new guinea", "paraguay", "peru", "philippines", "poland", "portugal",
    "qatar", "republic of the congo", "romania", "russia", "rwanda",
    "saint kitts and nevis", "saint lucia", "saint vincent and the grenadines",
    "samoa", "san marino", "sao tome and principe", "saudi arabia", "senegal",
    "serbia", "seychelles", "sierra leone", "singapore", "sint maarten",
    "slovakia", "slovenia", "solomon islands", "somalia", "south africa",
    "south korea", "south sudan", "spain", "sri lanka", "sudan", "suriname",
    "swaziland", "sweden", "switzerland", "syria", "taiwan", "tajikistan",
    "tanzania", "thailand", "timor leste", "togo", "tonga", "trinidad and tobago",
    "tunisia", "turkey", "turkmenistan", "tuvalu", "uganda", "ukraine",
    "united arab emirates", "united kingdom", "united states", "uruguay",
    "uzbekistan", "vanuatu", "venezuela", "vietnam", "yemen", "zambia", "zimbabwe"
}


# Load and label your training data
# NOTE(review): `names=` means the first line of each file is treated as data —
# confirm that none of the files carries a header row.

# Only `value` is used downstream, so rows are dropped on that column alone;
# a bare dropna() would also discard phones whose `format_type` is missing.
df_phone = pd.read_csv('phone.csv', names=['value', 'format_type']).dropna(subset=['value'])
df_phone['label'] = 'Phone Number'
df_company = pd.read_csv('company.csv', names=['value']).dropna()
df_company['label'] = 'Company Name'
df_dates = pd.read_csv('dates.csv', names=['value']).dropna()
df_dates['label'] = 'Date'

# Combine the data, excluding the problematic 'Country' data
training_data = pd.concat([df_phone[['value', 'label']], df_company, df_dates], ignore_index=True)
# Shuffle deterministically so the later train/validation split is reproducible
training_data = training_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Feature Engineering
def create_features(df):
    """Write the ten model features onto ``df`` (derived from ``df['value']``).

    Values are stringified with ``str`` first. ``df`` is mutated and returned.
    """
    # (column name, function of the raw value) pairs, in output column order
    feature_specs = [
        ('text_length', lambda raw: len(str(raw))),
        ('num_digits', lambda raw: sum(ch.isdigit() for ch in str(raw))),
        ('num_letters', lambda raw: sum(ch.isalpha() for ch in str(raw))),
        ('num_spaces', lambda raw: sum(ch.isspace() for ch in str(raw))),
        ('has_plus', lambda raw: '+' in str(raw)),
        ('has_paren', lambda raw: '(' in str(raw) or ')' in str(raw)),
        ('has_hyphen', lambda raw: '-' in str(raw)),
        ('has_slash', lambda raw: '/' in str(raw)),
        ('has_dot', lambda raw: '.' in str(raw)),
        # Exact, case-insensitive membership in the module-level COUNTRIES set
        ('is_in_country_list', lambda raw: str(raw).lower() in COUNTRIES),
    ]
    for column_name, compute in feature_specs:
        df[column_name] = df['value'].apply(compute)
    return df

# Compute features on a copy, so `training_data` itself keeps only its raw columns
training_data_features = create_features(training_data.copy())

# Separate features (X) and labels (y)
feature_cols = [
    'text_length', 'num_digits', 'num_letters', 'num_spaces',
    'has_plus', 'has_paren', 'has_hyphen', 'has_slash', 'has_dot',
    'is_in_country_list',
]
X, y = training_data_features[feature_cols], training_data_features['label']

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# ===============================
# 2. Train and Evaluate the Model
# ===============================

# Fit the classifier (`fit` returns the estimator itself)
model = LogisticRegression(max_iter=5000).fit(X_train, y_train)

# Score the held-out validation rows
y_pred = model.predict(X_val)
print("Final Classification Report (without 'Country' as a class):")
print(classification_report(y_val, y_pred))

# Save under separate file names so the earlier model files are left in place
joblib.dump(value=model, filename='semantic_model_no_country.pkl')
joblib.dump(value=feature_cols, filename='feature_cols_no_country.pkl')

print("\nUpdated model and feature list saved.")

In [None]:
%%writefile parser.py

import sys
import os
import argparse
import pandas as pd
import joblib
import re

# ==================================
# Import the classification pipeline
# Make sure predict.py is in the same directory
# ==================================
try:
    from predict import classify_column_ml, create_features, COUNTRIES
except ImportError as exc:
    # ImportError is raised not only when predict.py is absent, but also when
    # one of the requested names is missing from it or when an import inside
    # predict.py fails — so the underlying reason is reported as well.
    print("Error: 'predict.py' not found. Please ensure it is in the same directory and contains classify_column_ml, create_features, and COUNTRIES.")
    print(f"Details: {exc}")
    sys.exit(1)

# ==================================
# Country Code Mapping (extendable)
# ==================================
COUNTRY_CODE_MAP = {
    "91": "India",
    "1": "United States",
    "44": "United Kingdom",
    "49": "Germany",
    "33": "France",
    "39": "Italy",
    "81": "Japan",
    "86": "China",
    "7": "Russia",
    "61": "Australia",
    "55": "Brazil",
    "27": "South Africa",
    "34": "Spain",
    "82": "South Korea",
    # Add more as needed
}

# ==================================
# Legal Suffixes from legal.txt
# ==================================
LEGAL_TERMS_CONTENT = """
a spol
aat
aansprakelijkheid
ab
actien gesellschaft
actiengesellschaft
actions
ad
ae
ag
agreement
aj
akc spol
akciova
aktiebolag
aktien
andelsselskab
allmennaksjeselskap
anonimi
aksjeselskap
aktiengesellschaft
aktzii
amba
anonim ortakligi
anonim sirketi
anpartsselskab
ans
ansvar
ansvarlig
ao
aps
as
asa
auf
av
avoin
ay
begraenset
beperkta
berhad
beschrankter
besloten
beteti
bhd
bt
bv
bvba
cb
cic
cl
co
commandite
commanditaire
community
commv
compagnie
company
cooperatief ua
cooperatieve
cooperative
corp
corporation
cpt
cuideachta
cv
da
dat
dd
de
dinteret
dionicko
dno
doo
dooel
drustvo
druzhestvo
economique
ee
en
de
por
einkahlutafelag
ehf
entreprise
esv
etaireia
eurl
fdn
felelossegu
foundation
fp
free
fz
fzco
fze
general partnership
gesellschaft
gie
gmbh
gmbh co kg
gp
groupement
gte
haftung
haftungsbeschrankt
handelsbolag
hb
helseforetak
hf
inc
incorporated
interest
ipjsc
is
javno
jtd
jawna
kb
kd
kda
kft
kg
komanditsabiedriba
komandytne
kommanditgesellschaft
kgaa
kht
kommandiittiyhtiö
kommandit
kozhasznu
kozos
kommanditbolag
korlatolt
ks
kt
kv
ky
lc
lda
limitada
limitata
limited
limitee
limited liability
llc
lllp
llp
lp
lt
ltd
ltda
ltee
lv
mb
mbh
mchj
med
met
mit
nl
nuf
nv
nyrt
oaj
oao
od
odgovornoscu
odgovornost
odpowiedzialnoscia
oe
offene
og
one person
ood
ooo
opc
ogranicena
ogranicenom
ograniczona
osakeyhtio
oy
oyj
pa
partnerska
pc
per
phoibli
pjs
plc
pllc
plt
pp
ppa
private
privee
professional
proprietary
ps
pte
pto
pty
pty ltd
public
public joint stock company
pvt
qk
responsabilite
responsabilita
rt
s de rl
s en c
sa
sae
sal
saoc
saog
sapa
sar
sarl
sas
sasu
sca
scpa
scra
scs
sd
sdn
sdn bhd
se
secs
selskap
sendirin
ses
sf
sgp
sha
sia
sicar
sicav
simplifiee
ska
sl
slp
slne
smba
smcprivate
smcpvt
smpc
snc
soccol
sociedad anonima operadora
societa per azioni
societe anonyme
sp z oo
sp zoo
spzoo
spa
spj
spk
spol s ro
spolecnost
spolka zoo
spp
sprl
srl
sro
ss
stjornarvold
stg
tapui
tarsasag
teo
theoranta
tov
tovarystvo
trgovacko
obch spol
uab
ug
ultd
unipersonnelle
unlimited
unltd
vallalat
vat
vennootschap
verwaltungsgesellschaft
vof
vos
vzw
xk
yhtio
yoaj
zat
zone
zrt
kscp
ab publ
ab public
qpsc
company qpsc
sjsc
co sjsc
pjsc
pcl
public company limited
saa
sai
tas
corporation sjsc
bsc
abp
publikt aktiebolag
saic
sa esp
a s
co kscp
company kscp
saf
real estate investment trust
reit
saa
pjsc
kk
kk
kabushiki kaisha
sociedad anonima bursatil de capital variable
sab de cv
sab
joint stock company
jsc
company jsc
saai
sociedad anonima agricola industrial
sacifia
anonima comercial industrial financiera inmobiliaria y agropecuaria
saci
sakp
as
tao
bancorp
bancorporation
fc spa
football club spa
shpk
shk
shoqeri
pergjegjesi
kufizuar
aksionere
komandite
kolektive
dege
zyre
perfaqesimit
responsabilidad
comandita simple
acciones
colectiva
capital industria
estado
garantia
reciproca
simplificada
unipersonal
soc col
scei
sgr
sau
ilp
akcionarsko
neogranicenom
solidarnom
komanditno
samostalni
preduzetnik
sociedade
simples
coletivo
cooperativa
publica
privada
publico
eirl
empresa
individual
sc
"""
LEGAL_TERMS = {t.strip().lower() for t in LEGAL_TERMS_CONTENT.strip().split("\n")}

# ==================================
# Helper: Phone Number Parsing
# ==================================
def parse_phone_number(phone, code_map=None):
    """Split a phone number into ``(country, national_number)``.

    Args:
        phone: The raw phone value; it is stringified and stripped.
        code_map: Optional mapping of dialing code (digits only, no ``+``) to
            country name. Defaults to the module-level ``COUNTRY_CODE_MAP``.

    Returns:
        ``(country_name, digits_after_the_code)`` when the cleaned number
        starts with ``+`` followed by a known code;
        ``(None, stripped_original_text)`` when no code is recognized;
        ``(None, None)`` for a missing value (``None`` or NaN), which would
        otherwise be reported as the literal text "nan".
    """
    if phone is None or (isinstance(phone, float) and phone != phone):
        return None, None

    phone = str(phone).strip()
    if code_map is None:
        code_map = COUNTRY_CODE_MAP

    # Keep only digits and '+' so formatting characters do not affect matching
    clean_phone = re.sub(r"[^\d+]", "", phone)

    # Check for a country code at the beginning
    if clean_phone.startswith('+'):
        after_plus = clean_phone[1:]
        # Try longer codes first so that a short code can never shadow a longer
        # one that shares its leading digits, whatever the mapping's order is.
        for code in sorted(code_map, key=len, reverse=True):
            if after_plus.startswith(code):
                return code_map[code], after_plus[len(code):]

    return None, phone

# ==================================
# Helper: Company Name Parsing
# ==================================
# Longest-first ordering of LEGAL_TERMS, built once on first use
_SORTED_LEGAL_TERMS = None


def _legal_terms_longest_first(legal_terms=None):
    """Return legal suffixes ordered longest first (default list is cached)."""
    global _SORTED_LEGAL_TERMS
    if legal_terms is not None:
        return sorted((term for term in legal_terms if term), key=len, reverse=True)
    if _SORTED_LEGAL_TERMS is None:
        _SORTED_LEGAL_TERMS = sorted((term for term in LEGAL_TERMS if term), key=len, reverse=True)
    return _SORTED_LEGAL_TERMS


def parse_company_name(company, legal_terms=None):
    """Split a company name into ``(name, legal_suffix)``.

    The suffix is matched case-insensitively at the end of the text and must
    be preceded by a space. Trailing periods/commas are ignored while matching
    (so "Inc." matches "inc") but are kept in the returned suffix; a comma
    separating name and suffix is removed from the name.

    Args:
        company: The raw company value; it is stringified and stripped.
        legal_terms: Optional iterable of lower-case legal suffixes. Defaults
            to the module-level ``LEGAL_TERMS``.

    Returns:
        ``(name, legal)`` when a suffix is found, ``(stripped_text, "")``
        otherwise, and ``("", "")`` for a missing value (``None`` or NaN),
        which would otherwise be reported as the literal text "nan".
    """
    if company is None or (isinstance(company, float) and company != company):
        return "", ""

    company = str(company).strip()
    # Text used for matching only: trailing punctuation is not part of a suffix
    core = company.rstrip(" .,")

    # Longer terms are tried first so "pty ltd" wins over "ltd"
    for suffix in _legal_terms_longest_first(legal_terms):
        tail_len = len(suffix) + 1  # the suffix plus the space in front of it
        # Compare a slice of the original text, so the split index below is
        # always valid for `company` itself.
        if core[-tail_len:].lower() == f" {suffix}":
            idx = len(core) - tail_len
            name = company[:idx].rstrip(" ,")
            legal = company[idx:].strip()
            return name, legal
    return company, ""

# ==================================
# Main Parsing Logic
# ==================================
def process_file(input_file, output_file="output.csv"):
    """Detect phone/company columns in a CSV file and write their parsed parts.

    Every column of ``input_file`` is classified with ``classify_column_ml``.
    Columns predicted as "Phone Number" or "Company Name" with a score above
    0.5 are parsed, and the result is written to ``output_file``.

    Output columns, per detected column and in descending score order:
        * phone:   ``PhoneNumber``, ``Country``, ``Number``
        * company: ``CompanyName``, ``Name``, ``Legal``
    If more than one column of the same kind is detected, the additional ones
    get a numeric suffix (``PhoneNumber_2``, ``Country_2``, ...) instead of
    overwriting or duplicating the first one's columns.

    Args:
        input_file: Path of the CSV file to read.
        output_file: Path of the CSV file to write (default ``"output.csv"``).

    Exits the process with status 1 when the file cannot be read, and with
    status 0 (after a message) when no suitable column is detected.
    """
    try:
        df = pd.read_csv(input_file)
    except Exception as e:
        print(f"Error: Failed to read file. {e}")
        sys.exit(1)

    parsed_cols = []

    # Identify all columns to be parsed
    for col in df.columns:
        result = classify_column_ml(df[col])
        pred = result["prediction"]
        # "scores" may be missing or empty (e.g. all-null column) -> confidence 0
        scores = result.get("scores") or {}
        conf = next(iter(scores.values()), 0)

        # Check if the confidence for a specific prediction is above a threshold
        if pred in ("Phone Number", "Company Name") and conf > 0.5:
            parsed_cols.append((col, pred, conf))

    if not parsed_cols:
        print("No PhoneNumber or CompanyName column detected with sufficient confidence.")
        sys.exit(0)

    # Highest-confidence columns first; they get the un-suffixed output names
    parsed_cols.sort(key=lambda x: x[2], reverse=True)

    print("Detected columns for parsing:")
    for col, pred, conf in parsed_cols:
        print(f"- Column '{col}': {pred} (confidence={conf:.2f})")

    # prediction -> (name for the source column, names of the parsed parts, parser)
    layouts = {
        "Phone Number": ("PhoneNumber", ("Country", "Number"), parse_phone_number),
        "Company Name": ("CompanyName", ("Name", "Legal"), parse_company_name),
    }

    seen_per_kind = {}
    frames = []

    # Process all detected columns, as required by the problem statement
    for col, pred, conf in parsed_cols:
        source_name, part_names, parse = layouts[pred]
        seen_per_kind[pred] = seen_per_kind.get(pred, 0) + 1
        suffix = "" if seen_per_kind[pred] == 1 else f"_{seen_per_kind[pred]}"

        # One DataFrame built from all (part1, part2) tuples at once, instead
        # of constructing a pd.Series for every single row.
        parts = pd.DataFrame(
            [parse(value) for value in df[col]],
            columns=[f"{part}{suffix}" for part in part_names],
            index=df.index,
        )
        parts.insert(0, f"{source_name}{suffix}", df[col])
        frames.append(parts)

    output_df = pd.concat(frames, axis=1)

    output_df.to_csv(output_file, index=False)
    print(f"\nParsing complete. Output written to {output_file}")

# ==================================
# CLI
# ==================================
if __name__ == "__main__":
    # Command-line entry point: validate the --input path, then parse the file
    cli = argparse.ArgumentParser(
        description="Parses PhoneNumber/CompanyName columns into normalized fields."
    )
    cli.add_argument("--input", type=str, required=True, help="Path to input CSV file")

    input_path = cli.parse_args().input

    # Fail early with a readable message instead of a pandas traceback
    if not os.path.exists(input_path):
        print(f"Error: File '{input_path}' does not exist.")
        sys.exit(1)

    process_file(input_path)

In [None]:
!python parser.py --input phone.csv

In [None]:
%%writefile predict.py

import sys
import os
import argparse
import pandas as pd
import joblib
import re

# ===============================
# Country list (directly encoded)
# ===============================
# Lower-case country names used for exact-match membership tests:
# create_features checks `str(x).lower() in COUNTRIES` to build the
# is_in_country_list feature, so entries must stay lower-case.
# NOTE(review): the model is presumably trained with the same list; changing
# entries here would change the feature values seen at prediction time.
COUNTRIES = {
    "afghanistan", "albania", "algeria", "andorra", "angola", "antigua and barbuda",
    "argentina", "armenia", "aruba", "australia", "austria", "azerbaijan",
    "bahamas", "bahrain", "bangladesh", "barbados", "belarus", "belgium",
    "belize", "benin", "bhutan", "bolivia", "bosnia and herzegovina", "botswana",
    "brazil", "brunei", "bulgaria", "burkina faso", "burma", "burundi",
    "cambodia", "cameroon", "canada", "cape verde", "central african republic",
    "chad", "chile", "china", "colombia", "comoros", "costa rica", "cote d'ivoire",
    "croatia", "cuba", "curacao", "cyprus", "czech republic",
    "democratic republic of the congo", "denmark", "djibouti", "dominica",
    "dominican republic", "east timor", "ecuador", "egypt", "el salvador",
    "equatorial guinea", "eritrea", "estonia", "ethiopia", "fiji", "finland",
    "france", "gabon", "gambia", "georgia", "germany", "ghana", "greece",
    "grenada", "guatemala", "guinea", "guinea bissau", "guyana", "haiti",
    "holy see", "honduras", "hong kong", "hungary", "iceland", "india",
    "indonesia", "iran", "iraq", "ireland", "israel", "italy", "jamaica",
    "japan", "jordan", "kazakhstan", "kenya", "kiribati", "kosovo", "kuwait",
    "kyrgyzstan", "laos", "latvia", "lebanon", "lesotho", "liberia", "libya",
    "liechtenstein", "lithuania", "luxembourg", "macau", "macedonia",
    "madagascar", "malawi", "malaysia", "maldives", "mali", "malta",
    "marshall islands", "mauritania", "mauritius", "mexico", "micronesia",
    "moldova", "monaco", "mongolia", "montenegro", "morocco", "mozambique",
    "namibia", "nauru", "nepal", "netherlands", "netherlands antilles",
    "new zealand", "nicaragua", "niger", "nigeria", "north korea", "norway",
    "oman", "pakistan", "palau", "palestinian territories", "panama",
    "papua new guinea", "paraguay", "peru", "philippines", "poland", "portugal",
    "qatar", "republic of the congo", "romania", "russia", "rwanda",
    "saint kitts and nevis", "saint lucia", "saint vincent and the grenadines",
    "samoa", "san marino", "sao tome and principe", "saudi arabia", "senegal",
    "serbia", "seychelles", "sierra leone", "singapore", "sint maarten",
    "slovakia", "slovenia", "solomon islands", "somalia", "south africa",
    "south korea", "south sudan", "spain", "sri lanka", "sudan", "suriname",
    "swaziland", "sweden", "switzerland", "syria", "taiwan", "tajikistan",
    "tanzania", "thailand", "timor leste", "togo", "tonga", "trinidad and tobago",
    "tunisia", "turkey", "turkmenistan", "tuvalu", "uganda", "ukraine",
    "united arab emirates", "united kingdom", "united states", "uruguay",
    "uzbekistan", "vanuatu", "venezuela", "vietnam", "yemen", "zambia", "zimbabwe"
}

# ===============================
# Feature creation (updated)
# ===============================
def create_features(df):
    """Add character-level feature columns derived from ``df['value']``.

    Every value is converted with ``str`` before inspection. The columns are
    written onto ``df`` itself (the caller's frame is modified) and the same
    frame is returned.

    Columns added, in order: text_length, num_digits, num_letters, num_spaces,
    has_plus, has_paren, has_hyphen, has_slash, has_dot, is_in_country_list.
    """
    # Feature name -> function of the value's string form.
    extractors = {
        'text_length': len,
        'num_digits': lambda text: sum(ch.isdigit() for ch in text),
        'num_letters': lambda text: sum(ch.isalpha() for ch in text),
        'num_spaces': lambda text: sum(ch.isspace() for ch in text),
        'has_plus': lambda text: '+' in text,
        'has_paren': lambda text: '(' in text or ')' in text,
        'has_hyphen': lambda text: '-' in text,
        'has_slash': lambda text: '/' in text,
        'has_dot': lambda text: '.' in text,
        # Exact match against the lower-case country set (no trimming).
        'is_in_country_list': lambda text: text.lower() in COUNTRIES,
    }
    for feature_name, extract in extractors.items():
        # Bind the extractor as a default argument so each lambda keeps its own.
        df[feature_name] = df['value'].apply(lambda raw, fn=extract: fn(str(raw)))
    return df

# ===============================
# New classification function
# ===============================
def classify_column_ml(column: pd.Series, model, feature_cols):
    """Classify a whole column by majority vote over per-value predictions.

    Args:
        column: Values of one input column; nulls are dropped before prediction.
        model: Fitted estimator exposing ``predict``.
        feature_cols: Names of the feature columns (as added by
            ``create_features``) that are passed to the model, in order.

    Returns:
        dict with "prediction" (the most frequent predicted label) and
        "scores" (that label mapped to the fraction of non-null values that
        received it, as a plain Python float). A column with no non-null
        values yields ``{"prediction": "Other", "scores": {"Other": 1.0}}``.
    """
    df_to_predict = pd.DataFrame({'value': column.dropna()})
    if df_to_predict.empty:
        return {"prediction": "Other", "scores": {"Other": 1.0}}

    df_features = create_features(df_to_predict)
    predictions = model.predict(df_features[feature_cols])

    # Share of rows per predicted label; the winner is the column-level label.
    prediction_counts = pd.Series(predictions).value_counts(normalize=True)
    most_common_pred = prediction_counts.idxmax()
    # value_counts yields a NumPy scalar; convert it so that printing the
    # scores dict shows 0.93 rather than np.float64(0.93) under NumPy >= 2.
    confidence = float(prediction_counts.max())

    return {"prediction": most_common_pred, "scores": {most_common_pred: confidence}}

# ===============================
# Main Execution Logic
# ===============================
if __name__ == "__main__":
    # Command-line entry point: classify one named column of a CSV file.
    parser = argparse.ArgumentParser(
        description="Performs semantic classification using a trained ML model."
    )
    for flag, help_text in (
        ("--input", "The path to the input CSV file."),
        ("--column", "The name of the column to classify."),
    ):
        parser.add_argument(flag, type=str, required=True, help=help_text)
    args = parser.parse_args()

    # Validate the path, then the CSV itself, then the requested column;
    # each failure prints a message and exits with status 1.
    if not os.path.exists(args.input):
        print(f"Error: The file '{args.input}' does not exist.")
        sys.exit(1)

    try:
        df = pd.read_csv(args.input)
    except Exception as read_error:
        print(f"Error: Failed to read the file. Details: {read_error}")
        sys.exit(1)

    if args.column not in df.columns:
        print(f"Error: The column '{args.column}' was not found in the file.")
        sys.exit(1)

    # Load the trained model first, then the feature list, from the working directory.
    try:
        model, feature_cols = [
            joblib.load(artifact) for artifact in ('semantic_model.pkl', 'feature_cols.pkl')
        ]
    except FileNotFoundError:
        print("Error: Trained model 'semantic_model.pkl' or 'feature_cols.pkl' not found. Please run the training steps first.")
        sys.exit(1)

    result = classify_column_ml(df[args.column], model, feature_cols)

    # Emit the five-line report in one write.
    report_lines = [
        f"Input File: {args.input}",
        f"Column Name: {args.column}",
        "--- Classification Result (ML) ---",
        f"Prediction: {result['prediction']}",
        f"Scores: {result['scores']}",
    ]
    print("\n".join(report_lines))

In [None]:
%%writefile parser.py

import sys
import os
import argparse
import pandas as pd
import joblib
import re

# ==================================
# Import the classification pipeline
# ==================================
# Any ImportError is reported here and ends the run with status 1 - that
# covers both a missing predict.py and a predict.py lacking one of the names.
try:
    from predict import classify_column_ml, create_features, COUNTRIES
except ImportError:
    print("Error: 'predict.py' not found. Please ensure it is in the same directory and contains classify_column_ml, create_features, and COUNTRIES.")
    sys.exit(1)

# ==================================
# Load the trained model and feature list
# Done at import time so process_file can use `model` and `feature_cols`
# as module-level names. Both files are looked up in the working directory.
# NOTE(review): joblib.load unpickles its input; only load artifacts you trust.
# ==================================
try:
    model = joblib.load('semantic_model.pkl')
    feature_cols = joblib.load('feature_cols.pkl')
except FileNotFoundError:
    print("Error: Trained model files 'semantic_model.pkl' or 'feature_cols.pkl' not found.")
    print("Please run the training steps first to create the model and feature files.")
    sys.exit(1)


# ==================================
# Country Code Mapping (extendable)
# Keys are dialing-code digits without the leading '+'; parse_phone_number
# matches them as prefixes of the cleaned number.
# NOTE(review): "1" and "7" are shared by several countries (e.g. Canada,
# Kazakhstan); this map attributes each to a single country.
# ==================================
COUNTRY_CODE_MAP = {
    "91": "India", "1": "United States", "44": "United Kingdom", "49": "Germany", "33": "France",
    "39": "Italy", "81": "Japan", "86": "China", "7": "Russia", "61": "Australia",
    "55": "Brazil", "27": "South Africa", "34": "Spain", "82": "South Korea",
}

# ==================================
# Legal Suffixes from legal.txt
# ==================================
# Comma-separated legal-form terms inlined as one string literal. Whitespace
# and the line break around each term are stripped when the set is built.
LEGAL_TERMS_CONTENT = """
a spol, aat, aansprakelijkheid, ab, actien gesellschaft, actiengesellschaft, actions, ad, ae, ag, agreement, aj, akc spol, akciova, aktiebolag, aktien, andelsselskab, allmennaksjeselskap, anonimi, aksjeselskap, aktiengesellschaft, aktzii, amba, anonim ortakligi, anonim sirketi, anpartsselskab, ans, ansvar, ansvarlig, ao, aps, as, asa, auf, av, avoin, ay, begraenset, beperkta, berhad, beschrankter, besloten, beteti, bhd, bt, bv, bvba, cb, cic, cl, co, commandite, commanditaire, community, commv, compagnie, company, cooperatief ua, cooperatieve, cooperative, corp, corporation, cpt, cuideachta, cv, da, dat, dd, de, dinteret, dionicko, dno, doo, dooel, drustvo, druzhestvo, economique, ee, en, de, por, einkahlutafelag, ehf, entreprise, esv, etaireia, eurl, fdn, felelossegu, foundation, fp, free, fz, fzco, fze, general partnership, gesellschaft, gie, gmbh, gmbh co kg, gp, groupement, gte, haftung, haftungsbeschrankt, handelsbolag, hb, helseforetak, hf, inc, incorporated, interest, ipjsc, is, javno, jtd, jawna, kb, kd, kda, kft, kg, komanditsabiedriba, komandytne, kommanditgesellschaft, kgaa, kht, kommandiittiyhtiö, kommandit, kozhasznu, kozos, kommanditbolag, korlatolt, ks, kt, kv, ky, lc, lda, limitada, limitata, limited, limitee, limited liability, llc, lllp, llp, lp, lt, ltd, ltda, ltee, lv, mb, mbh, mchj, med, met, mit, nl, nuf, nv, nyrt, oaj, oao, od, odgovornoscu, odgovornost, odpowiedzialnoscia, oe, offene, og, one person, ood, ooo, opc, ogranicena, ogranicenom, ograniczona, osakeyhtio, oy, oyj, pa, partnerska, pc, per, phoibli, pjs, plc, pllc, plt, pp, ppa, private, privee, professional, proprietary, ps, pte, pto, pty, pty ltd, public, public joint stock company, pvt, qk, responsabilite, responsabilita, rt, s de rl, s en c, sa, sae, sal, saoc, saog, sapa, sar, sarl, sas, sasu, sca, scpa, scra, scs, sd, sdn, sdn bhd, se, secs, selskap, sendirin, ses, sf, sgp, sha, sia, sicar, sicav, simplifiee, ska, sl, slp, slne, smba, smcprivate, smcpvt, smpc, snc, soccol, 
sociedad anonima operadora, societa per azioni, societe anonyme, sp z oo, sp zoo, spzoo, spa, spj, spk, spol s ro, spolecnost, spolka zoo, spp, sprl, srl, sro, ss, stjornarvold, stg, tapui, tarsasag, teo, theoranta, tov, tovarystvo, trgovacko, obch spol, uab, ug, ultd, unipersonnelle, unlimited, unltd, vallalat, vat, vennootschap, verwaltungsgesellschaft, vof, vos, vzw, xk, yhtio, yoaj, zat, zone, zrt, kscp, ab publ, ab public, qpsc, company qpsc, sjsc, co sjsc, pjsc, pcl, public company limited, saa, sai, tas, corporation sjsc, bsc, abp, publikt aktiebolag, saic, sa esp, a s, co kscp, company kscp, saf, real estate investment trust, reit, saa, pjsc, kk, kk, kabushiki kaisha, sociedad anonima bursatil de capital variable, sab de cv, sab, joint stock company, jsc, company jsc, saai, sociedad anonima agricola industrial, sacifia, anonima comercial industrial financiera inmobiliaria y agropecuaria, saci, sakp, as, tao, bancorp, bancorporation, fc spa, football club spa, shpk, shk, shoqeri, pergjegjesi, kufizuar, aksionere, komandite, kolektive, dege, zyre, perfaqesimit, responsabilidad, comandita simple, acciones, colectiva, capital industria, estado, garantia, reciproca, simplificada, unipersonal, soc col, scei, sgr, sau, ilp, akcionarsko, neogranicenom, solidarnom, komanditno, samostalni, preduzetnik, sociedade, simples, coletivo, cooperativa, publica, privada, publico, eirl, empresa, individual, sc
"""
# Set of stripped, lower-cased terms; entries repeated in the literal
# (e.g. "de", "kk", "pjsc") collapse to one element here.
LEGAL_TERMS = {t.strip().lower() for t in LEGAL_TERMS_CONTENT.strip().split(",")}

# ==================================
# Helper: Phone Number Parsing
# ==================================
def parse_phone_number(phone, country_codes=None):
    """Split a phone number into ``(country, number)``.

    Args:
        phone: Raw value; it is converted with ``str`` and stripped.
        country_codes: Optional mapping of dialing-code digits (without '+')
            to country names. Defaults to the module-level COUNTRY_CODE_MAP.

    Returns:
        ``(country_name, number)`` when the cleaned value starts with '+'
        followed by a known code; ``number`` is what follows the code after
        every character other than digits and '+' has been removed.
        Otherwise ``(None, phone)`` with the stripped original text.
    """
    if country_codes is None:
        country_codes = COUNTRY_CODE_MAP

    phone = str(phone).strip()
    clean_phone = re.sub(r"[^\d+]", "", phone)

    if clean_phone.startswith('+'):
        dialed = clean_phone[1:]
        # Try the longest codes first so that a longer code (e.g. "1242")
        # added next to a shorter one ("1") is not shadowed by dict order.
        for code in sorted(country_codes, key=len, reverse=True):
            if dialed.startswith(code):
                return country_codes[code], dialed[len(code):]
    return None, phone

# ==================================
# Helper: Company Name Parsing
# ==================================
def parse_company_name(company, legal_terms=None):
    """Split a company name into ``(name, legal_form)``.

    Args:
        company: Raw value; it is converted with ``str`` and stripped.
        legal_terms: Optional iterable of lower-case legal-form terms.
            Defaults to the module-level LEGAL_TERMS.

    Returns:
        ``(name, legal)`` where ``legal`` is the longest term that ends the
        value (case-insensitively, preceded by a space), in its original
        casing. ``(company, "")`` when no term matches.
    """
    if legal_terms is None:
        legal_terms = LEGAL_TERMS

    company = str(company).strip()
    lower_company = company.lower()

    # One scan for every matching term instead of sorting the whole term
    # list on each call; the longest match wins ("pty ltd" beats "ltd").
    matches = [term for term in legal_terms if lower_company.endswith(f" {term}")]
    if not matches:
        return company, ""

    suffix = max(matches, key=len)
    # Cut relative to the END of the original string: str.lower() can change
    # the length of the text before the suffix (e.g. "İ" lowers to two code
    # points), so an index taken from the lowered string may be misaligned.
    cut = len(company) - len(suffix)
    return company[:cut].strip(), company[cut:].strip()

# ==================================
# Main Parsing Logic
# ==================================
def process_file(input_file):
    """Detect phone-number / company-name columns in a CSV and write output.csv.

    Every column of ``input_file`` is classified with ``classify_column_ml``.
    Columns predicted as "Phone Number" or "Company Name" with confidence
    above 0.5 are parsed, in descending confidence order, and written to
    ``output.csv`` as the raw column (PhoneNumber / CompanyName) followed by
    its parsed fields (Country, Number / Name, Legal).

    When more than one column of the same type is detected, the second and
    later ones get a numeric suffix (``PhoneNumber_2``, ``Country_2``, ...)
    so that no column overwrites another.

    Exits the process with status 1 if the file cannot be read and with
    status 0 if no column qualifies.
    """
    try:
        df = pd.read_csv(input_file)
    except Exception as e:
        print(f"Error: Failed to read file. {e}")
        sys.exit(1)

    parsed_cols = []

    for col in df.columns:
        # Pass the model and features to the classification function
        result = classify_column_ml(df[col], model, feature_cols)
        pred = result["prediction"]
        conf = result["scores"][pred] if "scores" in result and pred in result["scores"] else 0

        if pred in ["Phone Number", "Company Name"] and conf > 0.5:
            parsed_cols.append((col, pred, conf))

    if not parsed_cols:
        print("No PhoneNumber or CompanyName column detected with sufficient confidence.")
        sys.exit(0)

    # Most confident detections come first in the report and in the output.
    parsed_cols.sort(key=lambda x: x[2], reverse=True)

    print("Detected columns for parsing:")
    for col, pred, conf in parsed_cols:
        print(f"- Column '{col}': {pred} (confidence={conf:.2f})")

    # Predicted label -> (raw output column, per-value parser, parsed field names)
    layouts = {
        "Phone Number": ("PhoneNumber", parse_phone_number, ["Country", "Number"]),
        "Company Name": ("CompanyName", parse_company_name, ["Name", "Legal"]),
    }

    pieces = []
    occurrences = {}
    for col, pred, conf in parsed_cols:
        raw_name, parse_value, field_names = layouts[pred]
        occurrences[pred] = occurrences.get(pred, 0) + 1
        tag = "" if occurrences[pred] == 1 else f"_{occurrences[pred]}"

        # Build the parsed fields in one go from the list of 2-tuples instead
        # of creating a pd.Series per row.
        parsed = pd.DataFrame(
            [parse_value(value) for value in df[col]],
            columns=[name + tag for name in field_names],
            index=df.index,
        )
        pieces.append(df[col].rename(raw_name + tag))
        pieces.append(parsed)

    output_df = pd.concat(pieces, axis=1)
    output_df.to_csv("output.csv", index=False)
    print("\nParsing complete. Output written to output.csv")

# ==================================
# CLI
# ==================================
if __name__ == "__main__":
    # Command-line entry point: a single mandatory --input option naming the CSV to parse.
    parser = argparse.ArgumentParser(
        description="Parses PhoneNumber/CompanyName columns into normalized fields."
    )
    parser.add_argument("--input", type=str, required=True, help="Path to input CSV file")
    args = parser.parse_args()

    # Hand the path to process_file only when it exists; otherwise report and exit with status 1.
    input_path = args.input
    if os.path.exists(input_path):
        process_file(input_path)
    else:
        print(f"Error: File '{input_path}' does not exist.")
        sys.exit(1)

In [None]:
!python parser.py --input phone.csv

In [None]:
import sys
import os
import pandas as pd
import joblib
from sklearn.metrics import classification_report

# ==================================
# Import the classification pipeline
# ==================================
# Reuse the feature code written to predict.py so evaluation sees the same
# features as prediction. Any ImportError is reported and raises SystemExit(1).
try:
    from predict import create_features, COUNTRIES
except ImportError:
    print("Error: 'predict.py' not found. Please ensure it is in the same directory.")
    sys.exit(1)

# ==================================
# Load the trained model and feature list
# Both files are looked up in the working directory.
# NOTE(review): joblib.load unpickles its input; only load artifacts you trust.
# ==================================
try:
    model = joblib.load('semantic_model.pkl')
    feature_cols = joblib.load('feature_cols.pkl')
except FileNotFoundError:
    print("Error: Trained model files 'semantic_model.pkl' or 'feature_cols.pkl' not found.")
    print("Please run the training steps first to create the model and feature files.")
    sys.exit(1)

# ==================================
# Helper function to classify a DataFrame
# ==================================
def classify_data(df, model, feature_cols):
    """Return the model's per-row predictions for ``df``.

    Features are computed on a copy, so the caller's frame is left untouched;
    only the columns named in ``feature_cols`` are passed to ``model.predict``.
    """
    working_copy = df.copy()
    feature_matrix = create_features(working_copy)[feature_cols]
    predicted_labels = model.predict(feature_matrix)
    return predicted_labels

# ==================================
# Main Evaluation Logic
# ==================================
if __name__ == "__main__":
    # Each section loads one single-label file, predicts a label per row and
    # prints a classification report against that constant label. A missing
    # file is reported and the remaining sections still run.
    # NOTE(review): these look like the same files the model is trained on at
    # the top of the notebook, so the reports are not held-out scores - confirm.

    # 1. Evaluate on phone.csv
    print("\n" + "="*50)
    print("CLASSIFICATION REPORT FOR PHONE NUMBERS")
    print("="*50)
    try:
        # phone.csv is read with two columns where the training data is
        # loaded (value, format_type). Naming only one column would make
        # pandas use the first column as the index and put the SECOND column
        # into 'value', so both names are given here.
        df_phone = pd.read_csv('phone.csv', names=['value', 'format_type']).dropna()
        df_phone['label'] = 'Phone Number'
        predictions_phone = classify_data(df_phone, model, feature_cols)
        print(classification_report(df_phone['label'], predictions_phone, zero_division=0))
    except FileNotFoundError:
        print("Error: 'phone.csv' not found.")

    # 2. Evaluate on company.csv
    print("\n" + "="*50)
    print("CLASSIFICATION REPORT FOR COMPANY NAMES")
    print("="*50)
    try:
        df_company = pd.read_csv('company.csv', names=['value']).dropna()
        df_company['label'] = 'Company Name'
        predictions_company = classify_data(df_company, model, feature_cols)
        print(classification_report(df_company['label'], predictions_company, zero_division=0))
    except FileNotFoundError:
        print("Error: 'company.csv' not found.")

    # 3. Evaluate on dates.csv
    print("\n" + "="*50)
    print("CLASSIFICATION REPORT FOR DATES")
    print("="*50)
    try:
        df_dates = pd.read_csv('dates.csv', names=['value']).dropna()
        df_dates['label'] = 'Date'
        predictions_dates = classify_data(df_dates, model, feature_cols)
        print(classification_report(df_dates['label'], predictions_dates, zero_division=0))
    except FileNotFoundError:
        print("Error: 'dates.csv' not found.")

    # 4. Evaluate on countries.txt
    print("\n" + "="*50)
    print("CLASSIFICATION REPORT FOR COUNTRIES")
    print("="*50)
    try:
        df_country = pd.read_csv('countries.txt', names=['value']).dropna()
        df_country['label'] = 'Country'
        predictions_country = classify_data(df_country, model, feature_cols)
        print(classification_report(df_country['label'], predictions_country, zero_division=0))
    except FileNotFoundError:
        print("Error: 'countries.txt' not found.")

In [None]:
!python parser.py --input phone.csv

In [None]:
%%writefile parser.py

import sys
import os
import argparse
import pandas as pd
import joblib
import re

# ==================================
# Import the classification pipeline
# ==================================
# Any ImportError is reported here and ends the run with status 1 - that
# covers both a missing predict.py and a predict.py lacking one of the names.
try:
    from predict import classify_column_ml, create_features, COUNTRIES
except ImportError:
    print("Error: 'predict.py' not found. Please ensure it is in the same directory and contains classify_column_ml, create_features, and COUNTRIES.")
    sys.exit(1)

# ==================================
# Load the trained model and feature list
# Both files are looked up in the working directory.
# NOTE(review): joblib.load unpickles its input; only load artifacts you trust.
# ==================================
try:
    model = joblib.load('semantic_model.pkl')
    feature_cols = joblib.load('feature_cols.pkl')
except FileNotFoundError:
    print("Error: Trained model files 'semantic_model.pkl' or 'feature_cols.pkl' not found.")
    print("Please run the training steps first to create the model and feature files.")
    sys.exit(1)

# ==================================
# Country code mapping (hard-coded) and country-name set
# Keys of COUNTRY_CODE_MAP are dialing-code digits without the leading '+';
# parse_phone_number matches them as prefixes of the cleaned number.
# ==================================
COUNTRY_CODE_MAP = {
    "91": "India", "1": "United States", "44": "United Kingdom", "49": "Germany", "33": "France",
    "39": "Italy", "81": "Japan", "86": "China", "7": "Russia", "61": "Australia",
    "55": "Brazil", "27": "South Africa", "34": "Spain", "82": "South Korea",
}
# Extend the COUNTRIES set from predict.py with the non-blank lines of
# countries.txt (stripped, lower-cased); without the file, COUNTRIES is used as is.
# NOTE(review): all_countries is not referenced anywhere else in this script.
try:
    with open('countries.txt', 'r', encoding='utf-8') as f:
        countries_from_file = {line.strip().lower() for line in f if line.strip()}
    # Union keeps both the built-in names and any extra names from the file
    all_countries = COUNTRIES.union(countries_from_file)
except FileNotFoundError:
    all_countries = COUNTRIES

# ==================================
# Legal Suffixes from legal.txt
# ==================================
def load_legal_terms(path):
    """Read legal-form terms from ``path``.

    Terms may be separated by commas and/or newlines, so both a
    one-term-per-line file and a comma-separated list are understood.

    Returns:
        Set of stripped, lower-cased, non-empty terms.

    Raises:
        FileNotFoundError: if ``path`` does not exist.
    """
    with open(path, 'r', encoding='utf-8') as f:
        pieces = re.split(r"[,\n]", f.read())
    return {piece.strip().lower() for piece in pieces if piece.strip()}


LEGAL_TERMS = set()
try:
    LEGAL_TERMS = load_legal_terms('legal.txt')
except FileNotFoundError:
    print("Warning: 'legal.txt' not found. Company Name parsing may be less accurate.")
    # Fallback with some common terms if file is missing
    LEGAL_TERMS = {"ltd", "inc", "corp", "co", "gmbh", "ag", "llc", "plc", "sa"}
# Longest terms first, so multi-word forms are tried before their last word.
sorted_legal_terms = sorted(LEGAL_TERMS, key=len, reverse=True)

# ==================================
# Helper: Phone Number Parsing
# ==================================
def parse_phone_number(phone, country_codes=None):
    """Split a phone number into ``(country, number)``.

    Args:
        phone: Raw value; it is converted with ``str`` and stripped.
        country_codes: Optional mapping of dialing-code digits (without '+')
            to country names. Defaults to the module-level COUNTRY_CODE_MAP.

    Returns:
        ``(country_name, number)`` when the cleaned value starts with '+'
        followed by a known code; ``number`` is what follows the code after
        every character other than digits and '+' has been removed.
        Otherwise ``("", phone)`` with the stripped original text.
    """
    if country_codes is None:
        country_codes = COUNTRY_CODE_MAP

    phone = str(phone).strip()
    clean_phone = re.sub(r"[^\d+]", "", phone)

    # Check for a country code at the beginning
    if clean_phone.startswith('+'):
        dialed = clean_phone[1:]
        # Try the longest codes first so that a longer code (e.g. "1242")
        # added next to a shorter one ("1") is not shadowed by dict order.
        for code in sorted(country_codes, key=len, reverse=True):
            if dialed.startswith(code):
                return country_codes[code], dialed[len(code):]

    return "", phone

# ==================================
# Helper: Company Name Parsing
# ==================================
def parse_company_name(company, legal_terms=None):
    """Split a company name into ``(name, legal_form)``.

    Args:
        company: Raw value; it is converted with ``str`` and stripped.
        legal_terms: Optional iterable of lower-case legal-form terms.
            Defaults to the module-level ``sorted_legal_terms``.

    Returns:
        ``(name, legal)`` where ``legal`` is the longest term that ends the
        value (case-insensitively, preceded by a space), in its original
        casing. ``(company, "")`` when no term matches.
    """
    if legal_terms is None:
        legal_terms = sorted_legal_terms

    company = str(company).strip()
    lower_company = company.lower()

    # The longest matching term wins ("pty ltd" beats "ltd"), whatever the
    # order of the supplied terms.
    matches = [term for term in legal_terms if lower_company.endswith(f" {term}")]
    if not matches:
        return company, ""

    suffix = max(matches, key=len)
    # Cut relative to the END of the original string: str.lower() can change
    # the length of the text before the suffix (e.g. "İ" lowers to two code
    # points), so an index taken from the lowered string may be misaligned.
    cut = len(company) - len(suffix)
    return company[:cut].strip(), company[cut:].strip()

# ==================================
# Main Parsing Logic
# ==================================
def process_file(input_file):
    """Classify every column of a CSV and parse the phone / company ones.

    Each column predicted as 'Phone Number' or 'Company Name' with confidence
    above 0.5 becomes a small table (raw column plus its parsed fields) that
    is printed as markdown. All tables are then joined side by side and
    written to output.csv. Other columns are reported as ignored. Exits with
    status 1 if the file cannot be read.
    """
    try:
        df = pd.read_csv(input_file)
    except Exception as e:
        print(f"Error: Failed to read file. {e}")
        sys.exit(1)

    # Predicted label -> (raw output column, per-value parser, parsed field names)
    handlers = {
        "Phone Number": ("PhoneNumber", parse_phone_number, ["Country", "Number"]),
        "Company Name": ("CompanyName", parse_company_name, ["Name", "Legal"]),
    }
    parsed_data = []

    for col in df.columns:
        # Column-level label and its confidence (0 when no score is reported).
        result = classify_column_ml(df[col], model, feature_cols)
        pred = result["prediction"]
        conf = result["scores"][pred] if "scores" in result and pred in result["scores"] else 0

        # Anything that is not a confident phone/company column is skipped.
        if pred not in handlers or not conf > 0.5:
            print(f"Ignoring column '{col}' with prediction '{pred}'.")
            continue

        raw_name, parse_value, field_names = handlers[pred]
        raw_df = pd.DataFrame(df[col])
        raw_df.columns = [raw_name]
        # One pd.Series per row expands the parser's 2-tuple into two columns.
        parsed = raw_df[raw_name].apply(lambda x: pd.Series(parse_value(x)))
        parsed.columns = field_names
        final_df = pd.concat([raw_df, parsed], axis=1)
        parsed_data.append(final_df)
        print(f"Table for '{col}' ({pred}) generated.")
        print(final_df.to_markdown(index=False))  # Print a markdown table

    if not parsed_data:
        print("\nNo 'Phone Number' or 'Company Name' columns detected with sufficient confidence.")
        return

    # Merge all parsed dataframes into a single output file
    final_output_df = pd.concat(parsed_data, axis=1)
    final_output_df.to_csv("output.csv", index=False)
    print("\nParsing complete. Combined output written to output.csv")

# ==================================
# CLI
# ==================================
if __name__ == "__main__":
    # Command-line entry point: a single mandatory --input option naming the CSV to parse.
    parser = argparse.ArgumentParser(
        description="Parses PhoneNumber/CompanyName columns into normalized fields."
    )
    parser.add_argument("--input", type=str, required=True, help="Path to input CSV file")
    args = parser.parse_args()

    # Hand the path to process_file only when it exists; otherwise report and exit with status 1.
    input_path = args.input
    if os.path.exists(input_path):
        process_file(input_path)
    else:
        print(f"Error: File '{input_path}' does not exist.")
        sys.exit(1)

In [None]:
!python parser.py --input phone.csv

In [None]:
%%writefile parser.py

import sys
import os
import argparse
import pandas as pd
import joblib
import re

# ==================================
# Import the classification pipeline
# ==================================
# Any ImportError is reported here and ends the run with status 1 - that
# covers both a missing predict.py and a predict.py lacking one of the names.
try:
    from predict import classify_column_ml, create_features, COUNTRIES
except ImportError:
    print("Error: 'predict.py' not found. Please ensure it is in the same directory and contains classify_column_ml, create_features, and COUNTRIES.")
    sys.exit(1)

# ==================================
# Load the trained model and feature list
# All three files are looked up in the working directory.
# NOTE(review): joblib.load unpickles its input; only load artifacts you trust.
# ==================================
try:
    model = joblib.load('semantic_model.pkl')
    feature_cols = joblib.load('feature_cols.pkl')
    # The vectorizer is passed to classify_column_ml as a fourth argument below.
    # NOTE(review): that requires a predict.py whose classify_column_ml accepts
    # it; the predict.py written earlier in this notebook takes three arguments.
    text_vectorizer = joblib.load('text_vectorizer.pkl')
except FileNotFoundError:
    print("Error: Trained model files 'semantic_model.pkl', 'feature_cols.pkl' or 'text_vectorizer.pkl' not found.")
    print("Please run the training steps first to create the model and feature files.")
    sys.exit(1)

# ==================================
# Country code mapping (hard-coded) and country-name set
# Keys of COUNTRY_CODE_MAP are dialing-code digits without the leading '+';
# parse_phone_number matches them as prefixes of the cleaned number.
# ==================================
COUNTRY_CODE_MAP = {
    "91": "India", "1": "United States", "44": "United Kingdom", "49": "Germany", "33": "France",
    "39": "Italy", "81": "Japan", "86": "China", "7": "Russia", "61": "Australia",
    "55": "Brazil", "27": "South Africa", "34": "Spain", "82": "South Korea",
}
# Extend the COUNTRIES set from predict.py with the non-blank lines of
# countries.txt (stripped, lower-cased); without the file, COUNTRIES is used as is.
# NOTE(review): all_countries is not referenced anywhere else in this script.
try:
    with open('countries.txt', 'r', encoding='utf-8') as f:
        countries_from_file = {line.strip().lower() for line in f if line.strip()}
    # Union keeps both the built-in names and any extra names from the file
    all_countries = COUNTRIES.union(countries_from_file)
except FileNotFoundError:
    all_countries = COUNTRIES

# ==================================
# Legal Suffixes from legal.txt
# ==================================
def load_legal_terms(path):
    """Read legal-form terms from ``path``.

    Terms may be separated by commas and/or newlines, so both a
    one-term-per-line file and a comma-separated list are understood.

    Returns:
        Set of stripped, lower-cased, non-empty terms.

    Raises:
        FileNotFoundError: if ``path`` does not exist.
    """
    with open(path, 'r', encoding='utf-8') as f:
        pieces = re.split(r"[,\n]", f.read())
    return {piece.strip().lower() for piece in pieces if piece.strip()}


LEGAL_TERMS = set()
try:
    LEGAL_TERMS = load_legal_terms('legal.txt')
except FileNotFoundError:
    print("Warning: 'legal.txt' not found. Company Name parsing may be less accurate.")
    # Fallback with some common terms if file is missing
    LEGAL_TERMS = {"ltd", "inc", "corp", "co", "gmbh", "ag", "llc", "plc", "sa"}
# Longest terms first, so multi-word forms are tried before their last word.
sorted_legal_terms = sorted(LEGAL_TERMS, key=len, reverse=True)

# ==================================
# Helper: Phone Number Parsing
# ==================================
def parse_phone_number(phone, country_codes=None):
    """Split a phone number into ``(country, number)``.

    Args:
        phone: Raw value; it is converted with ``str`` and stripped.
        country_codes: Optional mapping of dialing-code digits (without '+')
            to country names. Defaults to the module-level COUNTRY_CODE_MAP.

    Returns:
        ``(country_name, number)`` when the cleaned value starts with '+'
        followed by a known code; ``number`` is what follows the code after
        every character other than digits and '+' has been removed.
        Otherwise ``("", phone)`` with the stripped original text.
    """
    if country_codes is None:
        country_codes = COUNTRY_CODE_MAP

    phone = str(phone).strip()
    clean_phone = re.sub(r"[^\d+]", "", phone)

    # Check for a country code at the beginning
    if clean_phone.startswith('+'):
        dialed = clean_phone[1:]
        # Try the longest codes first so that a longer code (e.g. "1242")
        # added next to a shorter one ("1") is not shadowed by dict order.
        for code in sorted(country_codes, key=len, reverse=True):
            if dialed.startswith(code):
                return country_codes[code], dialed[len(code):]

    return "", phone

# ==================================
# Helper: Company Name Parsing
# ==================================
def parse_company_name(company, legal_terms=None):
    """Split a company name into ``(name, legal_form)``.

    Args:
        company: Raw value; it is converted with ``str`` and stripped.
        legal_terms: Optional iterable of lower-case legal-form terms.
            Defaults to the module-level ``sorted_legal_terms``.

    Returns:
        ``(name, legal)`` where ``legal`` is the longest term that ends the
        value (case-insensitively, preceded by a space), in its original
        casing. ``(company, "")`` when no term matches.
    """
    if legal_terms is None:
        legal_terms = sorted_legal_terms

    company = str(company).strip()
    lower_company = company.lower()

    # The longest matching term wins ("sdn bhd" beats "bhd"), whatever the
    # order of the supplied terms.
    matches = [term for term in legal_terms if lower_company.endswith(f" {term}")]
    if not matches:
        return company, ""

    suffix = max(matches, key=len)
    # Cut relative to the END of the original string: str.lower() can change
    # the length of the text before the suffix (e.g. "İ" lowers to two code
    # points), so an index taken from the lowered string may be misaligned.
    cut = len(company) - len(suffix)
    return company[:cut].strip(), company[cut:].strip()

# ==================================
# Main Parsing Logic
# ==================================
def _expand_column(values, raw_name, parse_value, field_names):
    """Return a frame with the raw column (renamed) followed by its parsed fields."""
    raw_df = pd.DataFrame(values)
    raw_df.columns = [raw_name]
    # One pd.Series per row expands the parser's 2-tuple into two columns.
    parsed = raw_df[raw_name].apply(lambda x: pd.Series(parse_value(x)))
    parsed.columns = field_names
    return pd.concat([raw_df, parsed], axis=1)


def process_file(input_file):
    """Classify every column of a CSV and parse the phone / company ones.

    Each column predicted as 'Phone Number' or 'Company Name' with confidence
    above 0.5 becomes a small table (raw column plus its parsed fields) that
    is printed as markdown. All tables are then joined side by side and
    written to output.csv. Other columns are reported as ignored. Exits with
    status 1 if the file cannot be read.
    """
    try:
        df = pd.read_csv(input_file)
    except Exception as e:
        print(f"Error: Failed to read file. {e}")
        sys.exit(1)

    parsed_data = []

    for col in df.columns:
        # The classifier result carries the label and a 'confidence' entry;
        # a missing confidence counts as 0.
        result = classify_column_ml(df[col], model, feature_cols, text_vectorizer)
        pred = result["prediction"]
        conf = result.get("confidence", 0)

        if pred == "Phone Number" and conf > 0.5:
            kind = "Phone Number"
            final_df = _expand_column(df[col], 'PhoneNumber', parse_phone_number, ["Country", "Number"])
        elif pred == "Company Name" and conf > 0.5:
            kind = "Company Name"
            final_df = _expand_column(df[col], 'CompanyName', parse_company_name, ["Name", "Legal"])
        else:
            print(f"Ignoring column '{col}' with prediction '{pred}'.")
            continue

        parsed_data.append(final_df)
        print(f"Table for '{col}' ({kind}) generated.")
        print(final_df.to_markdown(index=False))  # Print a markdown table

    if not parsed_data:
        print("\nNo 'Phone Number' or 'Company Name' columns detected with sufficient confidence.")
        return

    # Merge all parsed dataframes into a single output file
    final_output_df = pd.concat(parsed_data, axis=1)
    final_output_df.to_csv("output.csv", index=False)
    print("\nParsing complete. Combined output written to output.csv")

# ==================================
# CLI
# ==================================
if __name__ == "__main__":
    # Command-line entry point: a single mandatory --input option naming the CSV to parse.
    parser = argparse.ArgumentParser(
        description="Parses PhoneNumber/CompanyName columns into normalized fields."
    )
    parser.add_argument("--input", type=str, required=True, help="Path to input CSV file")
    args = parser.parse_args()

    # Hand the path to process_file only when it exists; otherwise report and exit with status 1.
    input_path = args.input
    if os.path.exists(input_path):
        process_file(input_path)
    else:
        print(f"Error: File '{input_path}' does not exist.")
        sys.exit(1)

In [None]:
!python predict.py --input company.csv --"company"

In [None]:
%%writefile parser.py

import sys
import os
import argparse
import pandas as pd
import joblib
import re

# ==================================
# Import the classification pipeline
# ==================================
# Importing predict.py runs that module's top-level code as well, so an
# ImportError raised here is not necessarily a missing file; it can also be a
# missing name or a dependency that predict.py itself fails to import.
try:
    from predict import classify_column_ml, create_features, COUNTRIES
except ImportError as import_error:
    print("Error: 'predict.py' not found. Please ensure it is in the same directory and contains classify_column_ml, create_features, and COUNTRIES.")
    # Surface the underlying reason instead of discarding it.
    print(f"Details: {import_error}")
    sys.exit(1)

# ==================================
# Load the trained model and feature list
# ==================================
# The three artifacts are read in a fixed order and bound to module-level
# names that process_file later passes to classify_column_ml.
# NOTE(review): joblib.load unpickles its input, so these files should only
# ever come from a trusted training run.
try:
    model, feature_cols, text_vectorizer = (
        joblib.load(artifact_name)
        for artifact_name in ('semantic_model.pkl', 'feature_cols.pkl', 'text_vectorizer.pkl')
    )
except FileNotFoundError:
    print("Error: Trained model files 'semantic_model.pkl', 'feature_cols.pkl' or 'text_vectorizer.pkl' not found.")
    print("Please run the training steps first to create the model and feature files.")
    sys.exit(1)

# ==================================
# Country Code Mapping based on `countries.txt`
# ==================================
# Maps a dialing code (digits only, no leading '+') to a country name.
# parse_phone_number prepends '+' to each key and walks the entries in this
# insertion order when looking for a matching prefix.
COUNTRY_CODE_MAP = {
    "91": "India", "1": "United States", "44": "United Kingdom", "49": "Germany", "33": "France",
    "39": "Italy", "81": "Japan", "86": "China", "7": "Russia", "61": "Australia",
    "55": "Brazil", "27": "South Africa", "34": "Spain", "82": "South Korea",
}
# Extend the COUNTRIES set imported from predict.py with the names listed in
# countries.txt (one per line, blank lines skipped, lowercased). This is a
# union, not a replacement; when the file is missing, COUNTRIES is used as-is.
# NOTE(review): all_countries is not referenced in the visible part of this
# script -- confirm whether it is still needed.
try:
    with open('countries.txt', 'r', encoding='utf-8') as f:
        countries_from_file = {line.strip().lower() for line in f if line.strip()}
    all_countries = COUNTRIES.union(countries_from_file)
except FileNotFoundError:
    all_countries = COUNTRIES

# ==================================
# Legal Suffixes from legal.txt
# ==================================
# Suffix vocabulary: one term per line in legal.txt, trimmed and lowercased,
# blank lines ignored. A short built-in list stands in when the file is absent.
LEGAL_TERMS = set()
try:
    with open('legal.txt', 'r', encoding='utf-8') as f:
        LEGAL_TERMS = {term.lower() for term in map(str.strip, f) if term}
except FileNotFoundError:
    print("Warning: 'legal.txt' not found. Company Name parsing may be less accurate.")
    LEGAL_TERMS = {"ltd", "inc", "corp", "co", "gmbh", "ag", "llc", "plc", "sa"}

# Longest terms first, so a longer suffix is tried before any shorter one.
sorted_legal_terms = list(LEGAL_TERMS)
sorted_legal_terms.sort(key=lambda term: -len(term))

# ==================================
# Helper: Phone Number Parsing
# ==================================
def parse_phone_number(phone, code_map=None):
    """Split a phone number into ``(country, number)``.

    A country is only recognised when the value, reduced to digits and '+',
    starts with '+' followed by a known dialing code. In that case the
    returned number is the remaining cleaned characters after the code.
    Otherwise the country is "" and the number is the stripped input,
    returned unchanged.

    Args:
        phone: Value to parse; it is converted with ``str()`` first.
        code_map: Optional mapping of dialing-code digits (no '+') to country
            names. Defaults to the module-level ``COUNTRY_CODE_MAP``.

    Returns:
        Tuple ``(country_name, number)`` of strings.
    """
    if code_map is None:
        code_map = COUNTRY_CODE_MAP

    phone = str(phone).strip()
    # Drop separators (spaces, dashes, parentheses, ...) so they cannot hide
    # the dialing code; only digits and '+' survive.
    clean_phone = re.sub(r"[^\d+]", "", phone)

    if clean_phone.startswith('+'):
        after_plus = clean_phone[1:]
        # Try the longest codes first. Otherwise a short code that happens to
        # be a prefix of a longer one would win purely because of the order
        # in which the mapping was written.
        for code in sorted(code_map, key=len, reverse=True):
            if after_plus.startswith(code):
                return code_map[code], after_plus[len(code):]

    return "", phone

# ==================================
# Helper: Company Name Parsing
# ==================================
def parse_company_name(company, legal_terms=None):
    """Split a company name into ``(name, legal_suffix)``.

    The value must end with a space followed by a known legal term for a
    split to happen. The comparison is case-insensitive and ignores trailing
    periods, so "Google Inc." is split just like "Google Inc". The suffix is
    returned as written in the input (including its period); a comma that
    separated it from the name ("Acme, Inc") is removed from the name.

    Args:
        company: Value to parse; it is converted with ``str()`` first.
        legal_terms: Optional iterable of legal terms. Defaults to the
            module-level ``sorted_legal_terms``.

    Returns:
        Tuple ``(name, legal)``; ``legal`` is "" when no term matches, in
        which case ``name`` is the stripped input.
    """
    if legal_terms is None:
        candidates = sorted_legal_terms
    else:
        # Longest first, so a longer suffix is tried before a shorter one.
        candidates = sorted(legal_terms, key=len, reverse=True)

    company = str(company).strip()
    # Matching is done on the text without trailing periods/spaces.
    core = company.rstrip(". ")

    for term in candidates:
        needle = term.lower().rstrip(". ")
        if not needle:
            continue
        # Slice the original text and lowercase only the slice, so the cut
        # position is always an index into the original string.
        cut = len(core) - len(needle)
        if cut > 0 and core[cut - 1] == " " and core[cut:].lower() == needle:
            return company[:cut].rstrip(" ,"), company[cut:]
    return company, ""

# ==================================
# Main Parsing Logic
# ==================================
def process_file(input_file):
    """Classify every column of a CSV and write the parsed ones to output.csv.

    Each column is passed to ``classify_column_ml``. A column predicted as
    "Phone Number" or "Company Name" with confidence above 0.5 is kept: its
    raw values (renamed to PhoneNumber / CompanyName) are placed next to the
    two fields returned by the matching parser. All kept groups are joined
    side by side and written to ``output.csv`` in the working directory.

    Args:
        input_file: Path of the CSV file to read.

    Side effects:
        Prints a status line, writes ``output.csv`` when at least one column
        qualified, and exits the process with status 1 when the file cannot
        be read.
    """
    try:
        # Read every cell as text. With type inference a column of compact
        # phone numbers is parsed as integers, which drops a leading '+' and
        # leading zeros before the parser ever sees them.
        df = pd.read_csv(input_file, dtype=str)
    except Exception as e:
        print(f"Error: Failed to read file. {e}")
        sys.exit(1)

    # predicted label -> (output name for the raw column, parser, parsed field names)
    handlers = {
        "Phone Number": ("PhoneNumber", parse_phone_number, ["Country", "Number"]),
        "Company Name": ("CompanyName", parse_company_name, ["Name", "Legal"]),
    }

    # NOTE: two detected columns of the same kind produce repeated header
    # names in output.csv, because the output names are fixed per kind.
    parsed_data = []
    for col in df.columns:
        result = classify_column_ml(df[col], model, feature_cols, text_vectorizer)
        handler = handlers.get(result["prediction"])
        if handler is None or not (result["confidence"] > 0.5):
            continue

        raw_name, parse_value, field_names = handler
        blank_row = tuple("" for _ in field_names)
        # Missing cells stay blank instead of being parsed as the text "nan".
        rows = [blank_row if pd.isna(value) else parse_value(value) for value in df[col]]
        # Building the frame from a list also works for a column with no rows.
        parsed = pd.DataFrame(rows, columns=field_names, index=df.index)
        raw = df[[col]].rename(columns={col: raw_name})
        parsed_data.append(pd.concat([raw, parsed], axis=1))

    if parsed_data:
        final_output_df = pd.concat(parsed_data, axis=1)
        final_output_df.to_csv("output.csv", index=False)
        print("Parsing complete. Output written to output.csv")
    else:
        print("No valid columns detected for parsing.")

# ==================================
# CLI
# ==================================
if __name__ == "__main__":
    # Command-line entry point: a single required flag names the CSV to parse.
    cli = argparse.ArgumentParser(description="Parses data columns into normalized fields.")
    cli.add_argument("--input", type=str, required=True, help="Path to input CSV file")
    args = cli.parse_args()

    # Only hand the path to the parser when it exists; otherwise report the
    # problem and leave with a non-zero exit status.
    if os.path.exists(args.input):
        process_file(args.input)
    else:
        print(f"Error: File '{args.input}' does not exist.")
        sys.exit(1)

In [None]:
%%writefile predict.py

import sys
import os
import argparse
import pandas as pd
import joblib
import re

# ===============================
# Country list (directly encoded)
# ===============================
# Country names used for exact-match membership tests. Every entry is
# lowercase because create_features lowercases a value before looking it up;
# multi-word names are stored with single spaces.
COUNTRIES = {
    "afghanistan", "albania", "algeria", "andorra", "angola", "antigua and barbuda",
    "argentina", "armenia", "aruba", "australia", "austria", "azerbaijan",
    "bahamas", "bahrain", "bangladesh", "barbados", "belarus", "belgium",
    "belize", "benin", "bhutan", "bolivia", "bosnia and herzegovina", "botswana",
    "brazil", "brunei", "bulgaria", "burkina faso", "burma", "burundi",
    "cambodia", "cameroon", "canada", "cape verde", "central african republic",
    "chad", "chile", "china", "colombia", "comoros", "costa rica", "cote d'ivoire",
    "croatia", "cuba", "curacao", "cyprus", "czech republic",
    "democratic republic of the congo", "denmark", "djibouti", "dominica",
    "dominican republic", "east timor", "ecuador", "egypt", "el salvador",
    "equatorial guinea", "eritrea", "estonia", "ethiopia", "fiji", "finland",
    "france", "gabon", "gambia", "georgia", "germany", "ghana", "greece",
    "grenada", "guatemala", "guinea", "guinea bissau", "guyana", "haiti",
    "holy see", "honduras", "hong kong", "hungary", "iceland", "india",
    "indonesia", "iran", "iraq", "ireland", "israel", "italy", "jamaica",
    "japan", "jordan", "kazakhstan", "kenya", "kiribati", "kosovo", "kuwait",
    "kyrgyzstan", "laos", "latvia", "lebanon", "lesotho", "liberia", "libya",
    "liechtenstein", "lithuania", "luxembourg", "macau", "macedonia",
    "madagascar", "malawi", "malaysia", "maldives", "mali", "malta",
    "marshall islands", "mauritania", "mauritius", "mexico", "micronesia",
    "moldova", "monaco", "mongolia", "montenegro", "morocco", "mozambique",
    "namibia", "nauru", "nepal", "netherlands", "netherlands antilles",
    "new zealand", "nicaragua", "niger", "nigeria", "north korea", "norway",
    "oman", "pakistan", "palau", "palestinian territories", "panama",
    "papua new guinea", "paraguay", "peru", "philippines", "poland", "portugal",
    "qatar", "republic of the congo", "romania", "russia", "rwanda",
    "saint kitts and nevis", "saint lucia", "saint vincent and the grenadines",
    "samoa", "san marino", "sao tome and principe", "saudi arabia", "senegal",
    "serbia", "seychelles", "sierra leone", "singapore", "sint maarten",
    "slovakia", "slovenia", "solomon islands", "somalia", "south africa",
    "south korea", "south sudan", "spain", "sri lanka", "sudan", "suriname",
    "swaziland", "sweden", "switzerland", "syria", "taiwan", "tajikistan",
    "tanzania", "thailand", "timor leste", "togo", "tonga", "trinidad and tobago",
    "tunisia", "turkey", "turkmenistan", "tuvalu", "uganda", "ukraine",
    "united arab emirates", "united kingdom", "united states", "uruguay",
    "uzbekistan", "vanuatu", "venezuela", "vietnam", "yemen", "zambia", "zimbabwe"
}


# ===============================
# Feature creation (updated)
# ===============================
def create_features(df):
    """Add the hand-crafted text features to ``df`` and return it.

    Every feature is derived from ``str(value)`` of the 'value' column:
    length, counts of digits / letters / whitespace, presence flags for a
    handful of punctuation characters, and whether the lowercased text is an
    exact member of ``COUNTRIES``. The columns are added to the frame that
    was passed in (it is modified in place), in the order listed below.
    """
    def contains_any(*symbols):
        # Build a predicate that is True when any of the symbols occurs.
        return lambda text: any(symbol in text for symbol in symbols)

    extractors = (
        ('text_length', len),
        ('num_digits', lambda text: sum(map(str.isdigit, text))),
        ('num_letters', lambda text: sum(map(str.isalpha, text))),
        ('num_spaces', lambda text: sum(map(str.isspace, text))),
        ('has_plus', contains_any('+')),
        ('has_paren', contains_any('(', ')')),
        ('has_hyphen', contains_any('-')),
        ('has_slash', contains_any('/')),
        ('has_dot', contains_any('.')),
        # Direct lookup against the country list.
        ('is_in_country_list', lambda text: text.lower() in COUNTRIES),
    )

    for column_name, extract in extractors:
        df[column_name] = df['value'].apply(lambda raw, extract=extract: extract(str(raw)))
    return df

# ===============================
# New classification function
# ===============================
def classify_column_ml(column: pd.Series, model=None):
    """Predict the semantic type of a column by majority vote over its rows.

    Missing values are dropped, features are built with ``create_features``,
    every remaining row is classified, and the most frequent label wins.

    Args:
        column: The values to classify.
        model: Fitted classifier exposing ``predict``. When omitted, the
            module-level ``model`` is used (the command-line entry point
            loads it), so existing one-argument calls keep working.

    Returns:
        ``{"prediction": label, "scores": {label: share_of_rows}}``. For a
        column without any non-missing value the result is
        ``{"prediction": "Other", "scores": {}}``; the "scores" key is always
        present so callers can read it unconditionally.

    Raises:
        NameError: If no model was passed and none is defined at module level.
    """
    df_to_predict = pd.DataFrame({'value': column.dropna()})
    if df_to_predict.empty:
        return {"prediction": "Other", "scores": {}}

    if model is None:
        # The parameter shadows the global of the same name, so the global
        # has to be fetched explicitly.
        model = globals().get("model")
        if model is None:
            raise NameError("name 'model' is not defined; pass model=... or load it at module level")

    df_features = create_features(df_to_predict)

    # Feature columns must match what the model was trained on.
    feature_cols = ['text_length', 'num_digits', 'num_letters', 'num_spaces',
                    'has_plus', 'has_paren', 'has_hyphen', 'has_slash', 'has_dot', 'is_in_country_list']
    X_predict = df_features[feature_cols]

    predictions = model.predict(X_predict)

    # The share of rows voting for the winning label doubles as its score.
    prediction_counts = pd.Series(predictions).value_counts(normalize=True)
    most_common_pred = prediction_counts.idxmax()
    confidence = prediction_counts.max()

    return {"prediction": most_common_pred, "scores": {most_common_pred: confidence}}

# ===============================
# Main Execution Logic (remains the same)
# ===============================
if __name__ == "__main__":
    # Command-line entry point: classify one named column of a CSV file with
    # the saved model and print the result.
    parser = argparse.ArgumentParser(
        description="Performs semantic classification using a trained ML model."
    )
    parser.add_argument(
        "--input",
        type=str,
        required=True,
        help="The path to the input CSV file."
    )
    parser.add_argument(
        "--column",
        type=str,
        required=True,
        help="The name of the column to classify."
    )

    args = parser.parse_args()

    if not os.path.exists(args.input):
        print(f"Error: The file '{args.input}' does not exist.")
        sys.exit(1)

    try:
        # Read every cell as text. The features are computed on the written
        # form of each value; type inference would turn compact phone numbers
        # into integers and drop a leading '+' or leading zeros.
        df = pd.read_csv(args.input, dtype=str)
    except Exception as e:
        print(f"Error: Failed to read the file. Details: {e}")
        sys.exit(1)

    if args.column not in df.columns:
        print(f"Error: The column '{args.column}' was not found in the file.")
        sys.exit(1)

    # Bind the trained model to the module-level name `model`, which
    # classify_column_ml reads.
    try:
        model = joblib.load('semantic_model.pkl')
    except FileNotFoundError:
        print("Error: Trained model 'semantic_model.pkl' not found. Please run the training steps first.")
        sys.exit(1)

    result = classify_column_ml(df[args.column])

    print(f"Input File: {args.input}")
    print(f"Column Name: {args.column}")
    print("--- Classification Result (ML) ---")
    print(f"Prediction: {result['prediction']}")
    print(f"Scores: {result['scores']}")

In [None]:
!python predict.py --input company.csv --column "company"

In [None]:
!python predict.py --input company.csv --column "company"

In [None]:
!python predict.py --input company.csv --column "company"

In [None]:
%%writefile predict.py
import pandas as pd
import re
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import os
import argparse
import sys

# Load data files
def load_term_set(path, default):
    """Read one term per line from ``path`` into a lowercase set.

    Lines are stripped and lowercased; blank lines are skipped.

    Args:
        path: Text file to read (UTF-8).
        default: Terms to use when the file does not exist.

    Returns:
        A set of terms from the file, or a copy of ``default`` (after
        printing a warning) when the file is missing.
    """
    try:
        with open(path, 'r', encoding='utf-8') as f:
            return {line.strip().lower() for line in f if line.strip()}
    except FileNotFoundError:
        print(f"Warning: Data file '{path}' not found. Using default list.")
        return set(default)


# Each file falls back on its own, so a missing legal.txt no longer throws
# away a countries.txt that was read successfully (and vice versa).
COUNTRIES = load_term_set('data/countries.txt', {"india", "united states", "united kingdom", "germany"})
LEGAL_TERMS = load_term_set('data/legal.txt', {"inc", "ltd", "gmbh"})

# Feature creation function
def create_features(df, legal_terms=None, countries=None):
    """Normalise the 'value' column and add the boolean features.

    The 'value' column is replaced by its stripped, lowercased string form
    (the frame is modified in place). Four boolean columns are then added,
    in this order: ``is_phone_format``, ``is_company_legal_term``,
    ``is_numeric`` and ``is_country``.

    Args:
        df: Frame with a 'value' column.
        legal_terms: Optional iterable of legal terms; defaults to the
            module-level ``LEGAL_TERMS``.
        countries: Optional container of lowercase country names; defaults
            to the module-level ``COUNTRIES``.

    Returns:
        The same frame, with the added columns.
    """
    if legal_terms is None:
        legal_terms = LEGAL_TERMS
    if countries is None:
        countries = COUNTRIES

    # A legal term only counts when it stands on its own. A plain substring
    # test would flag "princeton" for "inc" or "mexico" for "co". Longer
    # terms are listed first so they are preferred by the alternation.
    terms = sorted({t.lower() for t in legal_terms if t}, key=len, reverse=True)
    if terms:
        legal_pattern = re.compile(
            r'(?<!\w)(?:' + '|'.join(re.escape(t) for t in terms) + r')(?!\w)'
        )

        def has_legal_term(text):
            return legal_pattern.search(text) is not None
    else:
        def has_legal_term(text):
            return False

    df['value'] = df['value'].astype(str).str.strip().str.lower()
    df['is_phone_format'] = df['value'].apply(lambda x: bool(re.match(r'^[\s\(\)\+\-]*\d[\d\s\(\)\+\-]{6,20}$', x)))
    df['is_company_legal_term'] = df['value'].apply(has_legal_term)
    # Digits with at most one '.' removed, i.e. plain integers and decimals.
    df['is_numeric'] = df['value'].apply(lambda x: x.replace('.', '', 1).isdigit())
    df['is_country'] = df['value'].apply(lambda x: x in countries)
    return df

# A simple training dataset to make the model runnable
# NOTE: everything in this section sits at module level, outside the
# `if __name__ == "__main__"` guard further down, so it runs (and rewrites
# the three .pkl files) every time this file is executed or imported.

# Eight hand-written examples, two per label, just enough to fit a model.
data = {
    'value': [
        '+91 9876543210', '(212) 555-1234', 'The Coca-Cola Company', 'Google Inc.',
        '2023-01-15', 'Jan 20, 2024', 'India', 'United Kingdom'
    ],
    'label': [
        'Phone Number', 'Phone Number', 'Company Name', 'Company Name',
        'Date', 'Date', 'Country', 'Country'
    ]
}
train_df = pd.DataFrame(data)

# Create features and train the model
# create_features rewrites train_df['value'] (stripped, lowercased) and adds
# the boolean feature columns to the same frame.
train_df = create_features(train_df)
X = train_df.drop('label', axis=1)
y = train_df['label']

# TF-IDF is fitted on the normalised text; max_features=10 caps the
# vocabulary size passed to the model.
text_features = X['value']
text_vectorizer = TfidfVectorizer(max_features=10)
text_features_transformed = text_vectorizer.fit_transform(text_features)
# `feature_cols` here holds only the hand-crafted column names; the list
# saved to feature_cols.pkl below is the full combined column list.
X_features = X.drop('value', axis=1)
feature_cols = X_features.columns.tolist()

# Final training matrix: TF-IDF columns first (named tfidf_0, tfidf_1, ...),
# followed by the hand-crafted features, in that order.
X_combined = pd.concat([pd.DataFrame(text_features_transformed.toarray()), X_features.reset_index(drop=True)], axis=1)
X_combined.columns = [f'tfidf_{i}' for i in range(text_features_transformed.shape[1])] + feature_cols

model = RandomForestClassifier(n_estimators=10, random_state=42)
model.fit(X_combined, y)

# Save the model, the training column order and the fitted vectorizer for later use
joblib.dump(model, 'semantic_model.pkl')
joblib.dump(X_combined.columns.tolist(), 'feature_cols.pkl')
joblib.dump(text_vectorizer, 'text_vectorizer.pkl')

def classify_column_ml(column_series, model, feature_cols, text_vectorizer):
    """Predict the semantic type of a column and a confidence for it.

    Missing values are dropped, the remaining values get the same features
    as the training data (TF-IDF columns plus the hand-crafted ones), every
    row is classified, and the most frequent label is reported.

    Args:
        column_series: The values to classify.
        model: Fitted classifier exposing ``predict``, ``predict_proba`` and
            ``classes_``.
        feature_cols: Column names, in order, that the model was trained on.
        text_vectorizer: Fitted vectorizer whose ``transform`` output has a
            ``toarray`` method.

    Returns:
        ``{"prediction": label, "confidence": value}`` where the confidence
        is the model's probability for ``label`` averaged over all rows. A
        column without any non-missing value yields
        ``{"prediction": "Other", "confidence": 0.0}``.
    """
    # Missing cells would otherwise be classified as the literal text "nan".
    df_temp = pd.DataFrame(column_series).dropna()
    if df_temp.empty:
        return {"prediction": "Other", "confidence": 0.0}
    df_temp.columns = ['value']

    # Create the same features as during training
    df_features = create_features(df_temp)

    tfidf_matrix = text_vectorizer.transform(df_features['value'])
    # Name the TF-IDF columns exactly as in training. Left with the default
    # integer labels, none of them would match `feature_cols`, and the
    # reindex below would replace every TF-IDF value with 0.
    tfidf_frame = pd.DataFrame(
        tfidf_matrix.toarray(),
        index=df_features.index,
        columns=[f'tfidf_{i}' for i in range(tfidf_matrix.shape[1])],
    )
    X_combined = pd.concat([tfidf_frame, df_features.drop('value', axis=1)], axis=1)

    # Put the columns in training order; a column absent here is filled with 0.
    X_combined_reordered = X_combined.reindex(columns=feature_cols, fill_value=0)

    predictions = model.predict(X_combined_reordered)
    probabilities = model.predict_proba(X_combined_reordered)

    # Majority vote across rows.
    main_prediction = pd.Series(predictions).value_counts().idxmax()
    # Average over all rows rather than reading the first row only.
    class_position = list(model.classes_).index(main_prediction)
    confidence = float(probabilities[:, class_position].mean())

    return {"prediction": main_prediction, "confidence": confidence}

# Main script logic for command-line execution
# Main script logic for command-line execution
if __name__ == "__main__":
    # Classifies the first column of the given CSV and prints the predicted
    # type together with its confidence.
    parser = argparse.ArgumentParser(description="Classify the semantic type of a column using a trained ML model.")
    parser.add_argument('--input', type=str, required=True, help='The path to the CSV file to classify.')
    args = parser.parse_args()

    if not os.path.exists(args.input):
        print(f"Error: File '{args.input}' not found.")
        sys.exit(1)

    try:
        # Load the saved model components
        model = joblib.load('semantic_model.pkl')
        feature_cols = joblib.load('feature_cols.pkl')
        text_vectorizer = joblib.load('text_vectorizer.pkl')

        # Read the entire CSV and assume the first column is the one to classify.
        # header=None treats every line, including the first, as data.
        # dtype=str keeps each cell as written; type inference would turn
        # compact phone numbers into integers and drop a leading '+' or
        # leading zeros before the features are computed.
        df = pd.read_csv(args.input, header=None, dtype=str)
        column_to_classify = df.iloc[:, 0]

        # Call the classification function
        result = classify_column_ml(column_to_classify, model, feature_cols, text_vectorizer)
        print(f"The column's semantic type is: {result['prediction']}")
        print(f"Confidence score: {result['confidence']:.2f}")

    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        print("Please ensure the training steps have been run and the model files exist.")
        sys.exit(1)

In [None]:
!python predict.py --input company.csv