# In [None]:
import pandas as pd
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import CategoricalNB
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

def convert_to_numeric(text):
    """Convert an amount such as ``"2 Crore+"`` or ``"5 Lac+"`` to a float.

    Recognised unit suffixes and their multipliers:
    ``Crore+`` (1e7), ``Lac+`` (1e5), ``Thousand+`` (1e3), ``Hund+`` (1e2).
    A value without a suffix is parsed as a plain number.

    Args:
        text: The raw cell value. Usually a string, but numbers, ``None`` and
            NaN are accepted too, because missing cells are commonly replaced
            by a numeric placeholder before this function is applied.

    Returns:
        The amount as a float, or ``0.0`` when the value is missing or
        cannot be parsed.
    """
    if text is None:
        return 0.0

    if not isinstance(text, str):
        # Already numeric (e.g. the 0 left behind by ``fillna(0)``).
        try:
            number = float(text)
        except (TypeError, ValueError):
            return 0.0
        # NaN is the only float that is not equal to itself.
        return 0.0 if number != number else number

    cleaned = text.strip()
    multiplier = 1.0
    unit_multipliers = (
        ('Crore+', 1e7),
        ('Lac+', 1e5),
        ('Thousand+', 1e3),
        ('Hund+', 1e2),
    )
    for suffix, factor in unit_multipliers:
        if cleaned.endswith(suffix):
            # Slice the suffix off instead of replacing " <suffix>", so a
            # missing space before the unit ("2Crore+") still parses.
            cleaned = cleaned[:-len(suffix)]
            multiplier = factor
            break

    try:
        # float() ignores surrounding whitespace, so "2 " parses as 2.0.
        return float(cleaned) * multiplier
    except ValueError:
        return 0.0

# Load the raw train/test tables from the working directory.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Replace missing cells with 0 so the encoders and the model see no NaN.
train_df.fillna(0, inplace=True)
test_df.fillna(0, inplace=True)

# fillna(0) leaves the integer 0 inside otherwise-textual amount columns.
# Casting to str first guarantees convert_to_numeric always receives text
# ("0" parses to 0.0) instead of failing on a non-string value.
for frame in (train_df, test_df):
    for amount_column in ('Total Assets', 'Liabilities'):
        frame[amount_column] = (
            frame[amount_column].astype(str).apply(convert_to_numeric)
        )

# Remember the boundary before the names are rebound below.
n_train_rows = len(train_df)
combined_df = pd.concat([train_df, test_df])

# Categorical features are encoded on train+test together so both parts
# share one integer mapping. Each column gets its own encoder rather than
# refitting (and silently overwriting) a shared one.
for feature_column in ('state', 'Party'):
    combined_df[feature_column] = LabelEncoder().fit_transform(
        combined_df[feature_column]
    )

# Positional split back into the two parts; copies so that the column
# assignment below does not write into a slice of combined_df.
train_df = combined_df.iloc[:n_train_rows].copy()
test_df = combined_df.iloc[n_train_rows:].copy()

# The target encoder is fit on the training rows only. The test rows carry
# no usable 'Education' values (the concat fills the column with NaN when
# test.csv lacks it), and fitting on them would either fail or add a
# spurious class. `label_encoder` is kept as the name of the target encoder
# because it is needed later to map predictions back to labels.
label_encoder = LabelEncoder()
train_df['Education'] = label_encoder.fit_transform(train_df['Education'])

# Identifier-like columns that are not used as model features.
columns_to_drop = ['ID', 'Candidate', 'Constituency ∇']
non_feature_columns = [*columns_to_drop, 'Education']

# Target first, then the feature matrices for both parts.
y_train = train_df['Education']
X_train = train_df.drop(columns=non_feature_columns)
x_test = test_df.drop(columns=non_feature_columns)

# NOTE(review): CategoricalNB is documented for ordinal-encoded categorical
# features; the amount columns look like large raw numbers here — confirm
# this is intended. Also confirm alpha=0.0 (no smoothing) is deliberate.
model = CategoricalNB(alpha=0.0, min_categories=1414)
model.fit(X_train, y_train)

# The evaluation subset is drawn from X_train, i.e. from rows the model was
# already fit on, so the report below describes training-data fit only.
X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=101,
)
y_pred = model.predict(X_test_split)
print("Classification Report on Training Data:")
print(classification_report(y_test_split, y_pred))

# Predict the test set and map the integer classes back to their labels.
prediction = model.predict(x_test)
predictions_labels = label_encoder.inverse_transform(prediction)

# Write the submission file: one row per test ID with its predicted label.
submission_df = pd.DataFrame(
    {'ID': test_df['ID'], 'Education': predictions_labels}
)
submission_df.to_csv("submission.csv", index=False)
