In [1]:
import pandas as pd
import numpy as np

train = pd.read_csv('train.csv')
X_train = train.iloc[0:2059, 3:8]
y_train = train.iloc[0:2059, 8]

test = pd.read_csv('test.csv')
X_test = test.iloc[0:2059, 3:8]

In [2]:
from scipy.stats import boxcox

def convert_to_numeric(value):
    if isinstance(value, float):
        return value
    elif value == '0':
        return 0.0
    elif 'Crore' in value:
        return float(value.replace(' Crore+', '')) * 10**7
    elif 'Lac' in value:
        return float(value.replace(' Lac+', '')) * 10**5
    elif 'Thou' in value:
        return float(value.replace(' Thou+', '')) * 10**3


X_train['Total Assets'] = X_train['Total Assets'].apply(convert_to_numeric)
X_train['Liabilities'] = X_train['Liabilities'].apply(convert_to_numeric)
X_train['Liabilities'].fillna(0, inplace=True)
X_test['Total Assets'] = X_test['Total Assets'].apply(convert_to_numeric)
X_test['Liabilities'] = X_test['Liabilities'].apply(convert_to_numeric)
X_test['Liabilities'].fillna(0, inplace=True)

X_train = pd.get_dummies(X_train, columns=['state', 'Party'], drop_first=True)
X_test = pd.get_dummies(X_test, columns=['state', 'Party'], drop_first=True)

def boxcox_transform(data):
    transformed_data, _ = boxcox(data + 1)
    return transformed_data

X_train['Total Assets'] = boxcox_transform(X_train['Total Assets'])
X_train['Liabilities'] = boxcox_transform(X_train['Liabilities'])
X_test['Liabilities'] = boxcox_transform(X_test['Liabilities'])
X_test['Total Assets'] = boxcox_transform(X_test['Total Assets'])

def clip_outliers(series, lower_percentile=1, upper_percentile=99):
    lower_limit = np.percentile(series, lower_percentile)
    upper_limit = np.percentile(series, upper_percentile)
    series = series.clip(lower=lower_limit)
    series = series.clip(upper=upper_limit)
    
    return series

X_train['Criminal Case'] = clip_outliers(X_train['Criminal Case'])
X_test['Criminal Case'] = clip_outliers(X_test['Criminal Case'])

X_train

Unnamed: 0,Criminal Case,Total Assets,Liabilities,state_ARUNACHAL PRADESH,state_ASSAM,state_BIHAR,state_CHHATTISGARH,state_DELHI,state_GOA,state_GUJARAT,...,Party_NCP,Party_NDPP,Party_NPP,Party_RJD,Party_SHS,Party_SP,Party_Sikkim Krantikari Morcha,Party_TDP,Party_Tipra Motha Party,Party_YSRCP
0,4,273.306731,52.157616,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,0,99.120454,0.000000,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,0,143.910728,38.400909,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,0,150.948339,38.878979,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,2,113.279896,44.320029,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2054,1,90.066712,34.282478,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2055,0,113.279896,33.183858,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2056,0,161.847669,46.402055,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2057,1,183.140071,47.049678,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False


In [3]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train[['Total Assets', 'Liabilities']] = scaler.fit_transform(X_train[['Total Assets', 'Liabilities']])
X_test[['Total Assets', 'Liabilities']] = scaler.transform(X_test[['Total Assets', 'Liabilities']])

X_train

Unnamed: 0,Criminal Case,Total Assets,Liabilities,state_ARUNACHAL PRADESH,state_ASSAM,state_BIHAR,state_CHHATTISGARH,state_DELHI,state_GOA,state_GUJARAT,...,Party_NCP,Party_NDPP,Party_NPP,Party_RJD,Party_SHS,Party_SP,Party_Sikkim Krantikari Morcha,Party_TDP,Party_Tipra Motha Party,Party_YSRCP
0,4,2.871486,0.939314,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,0,-0.342322,-1.547292,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,0,0.484077,0.283465,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,0,0.613924,0.306257,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,2,-0.081074,0.565658,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2054,1,-0.509367,0.087120,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2055,0,-0.081074,0.034743,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2056,0,0.815021,0.664919,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2057,1,1.207874,0.695794,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False


In [4]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

param_grid = {
    'alpha': [0.01, 0.1, 0.5, 1.0, 2.0, 5.0],
    'binarize': [0.0, 0.2, 0.4, 0.6, 0.8, 1.0],
    'fit_prior': [True, False],
    'class_prior': [None, [0.3, 0.7]]
}

bnb = BernoulliNB()
grid_search = GridSearchCV(bnb, param_grid, cv=5, scoring='f1_weighted')
grid_search.fit(X_train, y_train)

best_bnb = grid_search.best_estimator_
y_pred = best_bnb.predict(X_test)
# f1 = f1_score(y_train, y_pred, average='weighted')
# print(f1)

y_pred

360 fits failed out of a total of 720.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
360 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/nilay/Desktop/Intro-to-ML-and-DL/venv/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/nilay/Desktop/Intro-to-ML-and-DL/venv/lib/python3.9/site-packages/sklearn/naive_bayes.py", line 779, in fit
    self._update_class_log_prior(class_prior=class_prior)
  File "/Users/nilay/Desktop/Intro-to-ML-and-DL/venv/lib/python3.9/site-packages/sklearn/naive_bayes.py", line 595, in _update_class_log_prior
    raise ValueError("Number of priors must match number of

array(['10th Pass', '12th Pass', '10th Pass', ..., 'Graduate',
       '10th Pass', 'Post Graduate'], dtype='<U21')

In [5]:
result = pd.DataFrame({'ID': range(len(y_pred)), 'Education': y_pred})
result.to_csv('submission.csv', index=False)