In [36]:
import pandas as pd
import numpy as np
import datetime

import pickle
from train import preprocess_features

In [37]:
# Load the pickled first-year datasets (classification and regression variants,
# judging by the file names) and the raw companies table stored as parquet.
# SECURITY: pickle.load can execute arbitrary code embedded in the file, so
# only load artifacts that were produced by a trusted pipeline.
with open("data/pkls/processed_dataset_classification_first_year.pkl", "rb") as f:
    df_data = pickle.load(f)
with open("data/pkls/processed_dataset_regression_first_year.pkl", "rb") as f:
    df_data_2 = pickle.load(f)
# df_raw is the frame the prediction cell looks OGRNs up in.
df_raw = pd.read_parquet("data/parquet/companies_feat_open.parquet")

In [38]:
# Load the pickled numeric scaler and categorical encoder; both are later
# passed to preprocess_features() together with the frame to transform.
# SECURITY: pickle.load can execute arbitrary code; load trusted files only.
with open("data/pkls/num_scaler_classification_first_year.pkl", "rb") as f:
    scaler = pickle.load(f)
with open("data/pkls/cat_enc_classification_first_year.pkl", "rb") as f:
    encoder = pickle.load(f)

In [39]:
# Load the pickled classifier used by predict() and predict_batch().
# SECURITY: pickle.load can execute arbitrary code; load trusted files only.
with open("data/models/xgb_classifier_first_year.pkl", "rb") as f:
    model = pickle.load(f)

In [40]:
# NOTE(review): this cell loads the same file into the same variable as the
# previous cell, so it is redundant as written. Confirm whether a different
# model file was intended here; otherwise the cell can be deleted.
with open("data/models/xgb_classifier_first_year.pkl", "rb") as f:
    model = pickle.load(f)

In [44]:
# Mode switch for this cell: True runs the interactive prompt that predicts one
# OGRN at a time; False runs predict_batch() over the whole df_raw frame.
predict_by_ogrn = True

def _leading_code(codes):
    """
    Returns the leading code of each string value, without a leading zero.

    Values starting with "0" keep only their second character
    ("05.10" -> "5"); every other value keeps its first two characters
    ("47.19" -> "47"). The result is still a Series of strings.
    """
    starts_with_zero = codes.str.startswith("0", na=False)
    return codes.str[:2].where(~starts_with_zero, codes.str[1:2])


def process_row(df):
    """
    Processes the DataFrame by dropping unnecessary columns, modifying specific columns,
    and converting string categories to numerical representations.

    The input frame is never modified; all work happens on new objects, so the
    function can be called repeatedly on the same frame (or on a slice of it)
    with the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "ОГРН", "Основной вид деятельности", "Регион",
        "Тип субъекта", "Вновь созданный", "Наличие лицензий" and "КатСубМСП".

    Returns
    -------
    tuple
        (processed frame, Series of "ОГРН" values for the rows that were kept).
    """
    # Define columns to drop
    columns_to_drop = [
        "Наименование / ФИО", "Дата включения в реестр", "ОГРН", "ИНН",
        "reg_date", "lifetime", "ogrn", "opf_id", "okved_id", "inn", "full_name"
    ]
    activity_col = "Основной вид деятельности"
    region_col = "Регион"

    # Rows whose activity is the literal "No" are excluded. Filtering with a
    # mask (instead of an in-place drop) leaves the caller's frame untouched.
    unexpected_activity = df[activity_col] == "No"
    if unexpected_activity.any():
        print("Some companies have not expected type of activity")
        df = df.loc[~unexpected_activity]

    # Keep the identifiers before the column is dropped below.
    ogrns = df["ОГРН"]
    # drop() returns a new frame, so the assignments below never write into df.
    df = df.drop(columns=columns_to_drop, errors="ignore")

    # Cut values in some columns and convert them to integers
    for code_col in (activity_col, region_col):
        df[code_col] = _leading_code(df[code_col]).astype(int)

    # Converting string categories into numerical representations
    df["Тип субъекта"] = (df["Тип субъекта"] == "Индивидуальный предприниматель").astype(
        int
    )
    df["Вновь созданный"] = (df["Вновь созданный"] == "Да").astype(int)
    df["Наличие лицензий"] = (df["Наличие лицензий"] == "Да").astype(int)
    df["КатСубМСП"] = df["КатСубМСП"].astype(int)

    return df, ogrns


def predict(row):
    """
    Runs the loaded model on already-preprocessed features.

    Returns a 2-tuple: the output of model.predict(row) followed by the
    output of model.predict_proba(row).
    """
    labels = model.predict(row)
    class_probabilities = model.predict_proba(row)
    return labels, class_probabilities

def predict_batch(df):
    """
    Applies prediction on a batch (entire DataFrame).

    NaN values are filled with 0, the frame is run through process_row() and
    preprocess_features(), and the model is then fed exactly the columns listed
    in column_order, in that order (the same selection the interactive
    single-OGRN path applies).

    Returns a DataFrame with the columns 'OGRN', 'Predicted Class' and
    'Probabilities' (probabilities formatted as a comma-separated string with
    two decimals). Returns an empty DataFrame when feature columns are missing
    or when any step raises; the reason is printed.
    """
    try:
        if df.isnull().values.any():
            print("Some features are NaN and will be filled with 0.")
            df = df.fillna(0)
        processed_df, ogrns = process_row(df)
        processed_df = preprocess_features(processed_df, scaler, encoder)[0]
        missing_columns = set(column_order) - set(processed_df.columns)
        if missing_columns:
            print(f"Missing features columns: {missing_columns}")
            return pd.DataFrame()  # Return empty DataFrame on failure

        # Select and order the features explicitly: extra or reordered columns
        # coming out of preprocessing must not reach the model.
        model_input = processed_df[column_order]
        predictions = model.predict(model_input)
        probabilities = model.predict_proba(model_input)
        results = pd.DataFrame({
            'OGRN': ogrns,
            'Predicted Class': predictions,
            'Probabilities': [', '.join([f'{p:.2f}' for p in prob]) for prob in probabilities]
        })
        return results
    except Exception as e:
        # Best-effort batch run: report the failure and signal it with an
        # empty frame, which the caller checks via .empty.
        print(f"An error occurred during batch prediction: {e}")
        return pd.DataFrame()


# Driver for this cell: either an interactive prompt that predicts one OGRN at
# a time, or a single batch run over the whole raw frame.
if predict_by_ogrn:
    while True:
        user_input = input(
            "Enter an OGRN(IP) to find in the processed Data or 'exit' to quit: "
        )

        if user_input.strip().lower() == "exit":
            print("Exiting the program.")
            break

        # Guard only the integer parse, so a ValueError raised later by
        # preprocessing or by the model is not misreported as bad user input.
        try:
            OGRN = int(user_input)
        except ValueError:
            print("Invalid input. Please enter a valid OGRN.")
            continue

        matches = df_raw["ОГРН"] == OGRN
        if not matches.any():
            print("OGRN not found in the DataFrame. Please try again.")
            continue

        # All rows with this OGRN are selected (only the first prediction is
        # reported below). Copy so nothing downstream writes into df_raw.
        row = df_raw[matches].copy()

        # Check for NaN values
        if row.isnull().values.any():
            print("Some features are NaN and will be filled with 0.")
            row = row.fillna(0)

        # Preprocess row; on failure re-prompt instead of predicting on the
        # unprocessed frame.
        try:
            row = process_row(row)[0]
            row = preprocess_features(row, scaler, encoder)[0]
        except Exception as e:
            print(f"An error occurred during processing: {e}")
            continue

        # process_row() can filter rows out; there is nothing to predict then.
        if row.empty:
            print("No rows left to predict after preprocessing. Please try again.")
            continue

        missing_columns = set(column_order) - set(row.columns)
        if missing_columns:
            # A missing feature column ends the session rather than re-prompting.
            print(f"Missing features columns: {missing_columns}")
            break
        processed_row = row[column_order]
        # predict
        prediction = predict(processed_row)
        predicted_class = prediction[0][0]
        probabilities_str = ', '.join([f'{p:.2f}' for p in prediction[1][0]])
        print(f"Prediction for the input {OGRN}: class {predicted_class}, probabilities [{probabilities_str}]")
else:
    print("Collecting predictions for the entire DataFrame.")
    prediction_results = predict_batch(df_raw)
    if not prediction_results.empty:
        print(prediction_results.head())
    else:
        print("Failed to generate predictions.")

Some features are NaN and will be filled with 0.
Prediction for the input 1232800000850: class 1, probabilities [0.15, 0.78, 0.01, 0.06, 0.00]
Invalid input. Please enter a valid OGRN.
Invalid input. Please enter a valid OGRN.
Invalid input. Please enter a valid OGRN.
OGRN not found in the DataFrame. Please try again.
Some features are NaN and will be filled with 0.
Prediction for the input 1232800000850: class 1, probabilities [0.15, 0.78, 0.01, 0.06, 0.00]
Some features are NaN and will be filled with 0.
Prediction for the input 1232800000850: class 1, probabilities [0.15, 0.78, 0.01, 0.06, 0.00]
Invalid input. Please enter a valid OGRN.
OGRN not found in the DataFrame. Please try again.
OGRN not found in the DataFrame. Please try again.
Some features are NaN and will be filled with 0.
Prediction for the input 1232800000850: class 1, probabilities [0.15, 0.78, 0.01, 0.06, 0.00]
OGRN not found in the DataFrame. Please try again.
OGRN not found in the DataFrame. Please try again.
Inval