In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV


In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer, accuracy_score, f1_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.neighbors import (NeighborhoodComponentsAnalysis,KNeighborsClassifier)
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler



In [4]:
test=pd.read_csv("test.csv")
train=pd.read_csv("train.csv")

In [7]:
train['Education'].value_counts()

Education
Graduate                 531
Post Graduate            432
12th Pass                349
Graduate Professional    339
10th Pass                227
8th Pass                  78
Doctorate                 52
Others                    28
Literate                  14
5th Pass                   9
Name: count, dtype: int64

In [8]:
education_mapping = {'Literate': 0, '5th Pass': 1, '8th Pass': 2, '10th Pass': 3, 
                     '12th Pass': 4, 'Graduate': 5, 'Post Graduate': 6, 
                     'Graduate Professional': 7, 'Doctorate': 8, 'Others': 9}

In [9]:
train.head()

Unnamed: 0,ID,Candidate,Constituency ∇,Party,Criminal Case,Total Assets,Liabilities,state,Education
0,0,M.K. Mohan,ANNA NAGAR,DMK,4,211 Crore+,2 Crore+,TAMIL NADU,8th Pass
1,1,Khatik Ramesh Prasad,KARERA (SC),BJP,0,1 Crore+,0,MADHYA PRADESH,12th Pass
2,2,Dr. Mantar Gowda,MADIKERI,INC,0,7 Crore+,22 Lac+,KARNATAKA,Post Graduate
3,3,Kundan Kumar,BEGUSARAI,BJP,0,9 Crore+,24 Lac+,BIHAR,Post Graduate
4,4,Swapan Majumder,BANGAON DAKSHIN (SC),BJP,2,2 Crore+,61 Lac+,WEST BENGAL,8th Pass


In [10]:
def PrepareData(data):
    data=data.drop(columns=['ID','Candidate','Constituency ∇'])
    data.iloc[:,[2,3]]=(data.iloc[:,[2,3]]).apply(lambda x: x.str.replace(' Crore+','00000'))
    data.iloc[:,[2,3]]=(data.iloc[:,[2,3]]).apply(lambda x: x.str.replace(' Lac+','000'))
    data.iloc[:,[2,3]]=(data.iloc[:,[2,3]]).apply(lambda x: x.str.replace(' Thou+','0'))
    data.iloc[:,[2,3]]=(data.iloc[:,[2,3]]).apply(lambda x: x.str.replace(' Hund+',''))
    return data

In [11]:
# Assuming X_train and y_train are your training features and labels
f1_scorer = make_scorer(f1_score, average='weighted')
accuracy_scorer=make_scorer(accuracy_score)

In [36]:
def Transform(data,eduEncoder,type='train'):
    scaler=StandardScaler()
    minmax=MinMaxScaler()
    encoder=OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    data[['Total Assets', 'Liabilities']] = scaler.fit_transform(data[['Total Assets', 'Liabilities']])
    # data=data.drop(columns=['Party'])
    # data[['Criminal Case']]=scaler.fit_transform(data[['Criminal Case']])
    # data['Party']=encoders[0].transform(data['Party'])
    # data['state']=encoders[1].transform(data['state'])
# Fit and transform the encoder
    encoded_features = encoder.fit_transform(data[['Party', 'state']])

    # Get the feature names from the encoder
    feature_names = encoder.get_feature_names_out(['Party', 'state'])

    # Create a DataFrame from the transformed data with the correct column names
    encoded_data = pd.DataFrame(encoded_features, columns=feature_names)

    data = pd.concat([data.drop(['Party', 'state'], axis=1), encoded_data], axis=1)
    data = data.astype({'Total Assets': float, 'Liabilities':float})
    data["Criminal Case"] = (data["Criminal Case"] > 0).astype(int) # binary variable
    data["Assets_liability"] =  (data['Total Assets']>data['Liabilities']).astype(int)# asset to liability ratio prop to education
    if(type=='test'):
        # data=pd.get_dummies(data).astype(int)
        # data = data.astype({'Party': int, 'Total Assets': float, 'Liabilities':float,'state':int})
        data=data.drop(columns=['Liabilities','Total Assets']) 
        return data
    # data['Education']=encoders[2].transform(data['Education'])
    data['Education']=data['Education'].map(eduEncoder)
    y=data['Education']
    # data = data.astype({'Total Assets': float, 'Liabilities':float,'Education':int})
    X=data.drop(columns=['Education','Liabilities','Total Assets']) 
    return X,y

In [37]:
def createSubmission(education_mapping,model,X_test):
    y_test=model.predict(X_test)
    df = pd.read_csv('test.csv')
    df = df[['ID']]
    # df['Education']=encoderEducation.classes_[y_test]
    # df['Education'] = [list(education_mapping.keys())[9] if i > 8 else list(education_mapping.keys())[i] for i in y_test]
    df['Education'] = [list(education_mapping.keys())[i] for i in y_test]

    # Save the DataFrame to a new CSV file
    df.to_csv('output.csv', index=False)
    return y_test

In [38]:
def getF1Score(model,X_val,y_val):
    y_pred = model.predict(X_val)
    f1 = f1_score(y_val, y_pred,average='weighted')
    print("F1 Score:", f1)  

In [39]:
def optimiseModel(model,X,y,param_grid):

    # Initialize GridSearchCV with F1 score as the scoring metric
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring=f1_scorer, n_jobs=-1)
    # grid_search = RandomizedSearchCV(estimator = model, param_distributions = param_grid, cv = 5, scoring=f1_scorer, n_jobs = -1)
    # Perform grid search to find the best parameters
    grid_search.fit(X ,y)

    # Get the best parameters and best score
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    print("Best Parameters:", best_params)
    print("Best F1 Score:", best_score)
    return best_params

    

In [40]:
test_data=PrepareData(test)
train_data=PrepareData(train)

In [41]:
train_data

Unnamed: 0,Party,Criminal Case,Total Assets,Liabilities,state,Education
0,DMK,4,21100000,200000,TAMIL NADU,8th Pass
1,BJP,0,100000,0,MADHYA PRADESH,12th Pass
2,INC,0,700000,22000,KARNATAKA,Post Graduate
3,BJP,0,900000,24000,BIHAR,Post Graduate
4,BJP,2,200000,61000,WEST BENGAL,8th Pass
...,...,...,...,...,...,...
2054,CPI,1,61000,10000,KERALA,Graduate Professional
2055,INC,0,200000,8000,RAJASTHAN,10th Pass
2056,BJP,0,1300000,85000,UTTAR PRADESH,Graduate
2057,NCP,1,2500000,94000,MAHARASHTRA,12th Pass


In [42]:
# test=pd.read_csv("test.csv")
# train=pd.read_csv("train.csv")

In [43]:
# train=PrepareData(train)
# test=PrepareData(test)

In [44]:
# train['Party'] = train['Party'].astype('category').cat.codes
# train['Education'] = train['Education'].astype('category').cat.codes
# train['state'] = train['state'].astype('category').cat.codes

# train = train.astype({'Party': int, 'Total Assets': float, 'Liabilities':float,'state':int,'Education':int})
# train.dtypes

In [45]:
# import pandas as pd
# from scipy.stats import chi2_contingency

# # Assume 'data' is your DataFrame with 'Party' and 'Education level' columns

# # Create a contingency table (cross-tabulation)
# contingency_table = pd.crosstab(train['Criminal Case'], train['Education'])


# # Perform the chi-squared test
# chi2_stat, p_val, dof, expected = chi2_contingency(contingency_table)

# # Print the results
# print("Chi-squared statistic:", chi2_stat)
# print("p-value:", p_val)

In [46]:
encoderParty=LabelEncoder()
encoderState=LabelEncoder()
encoderEducation=LabelEncoder()
encoderParty.fit(train['Party'])
encoderState.fit(train['state'])
encoderEducation.fit(train['Education'])

In [47]:
X,y=Transform(train_data,education_mapping)

In [48]:
X

Unnamed: 0,Criminal Case,Party_AAP,Party_AIADMK,Party_AITC,Party_BJD,Party_BJP,Party_CPI,Party_CPI(M),Party_DMK,Party_INC,...,state_PUDUCHERRY,state_PUNJAB,state_RAJASTHAN,state_SIKKIM,state_TAMIL NADU,state_TRIPURA,state_UTTAR PRADESH,state_UTTARAKHAND,state_WEST BENGAL,Assets_liability
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1
1,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2054,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2055,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2056,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
2057,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [49]:
X

Unnamed: 0,Criminal Case,Party_AAP,Party_AIADMK,Party_AITC,Party_BJD,Party_BJP,Party_CPI,Party_CPI(M),Party_DMK,Party_INC,...,state_PUDUCHERRY,state_PUNJAB,state_RAJASTHAN,state_SIKKIM,state_TAMIL NADU,state_TRIPURA,state_UTTAR PRADESH,state_UTTARAKHAND,state_WEST BENGAL,Assets_liability
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1
1,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2054,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2055,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2056,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
2057,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [50]:
from imblearn.over_sampling import SMOTE

def oversample(X,y,sampling_strategy):
    # sampling_strategy = {
    #     5: 531,   # Keep the majority class unchanged
    #     6: 450, # 432/531
    #     4: 450, # 349/531
    #     7: 450, # 339/531
    #     3: 300, # 227/531
    #     2: 100, # 78/531
    #     8: 100, # 52/531
    #     9: 100, # 28/531
    #     0: 100, # 14/531
    #     1: 100, # 9/531
    # }

    smote = SMOTE(random_state=69,sampling_strategy=sampling_strategy)

    # Apply SMOTE to generate synthetic samples
    X_resampled, y_resampled = smote.fit_resample(X,y)

    return X_resampled,y_resampled



In [51]:
X_test=Transform(test_data,np.array((encoderParty,encoderState,encoderEducation)),'test')

In [52]:
X_train,X_val,y_train,y_val=train_test_split(X,y,test_size=0.15,random_state=69,stratify=y)

In [53]:
y_train.value_counts()

Education
5    451
6    367
4    297
7    288
3    193
2     66
8     44
9     24
0     12
1      8
Name: count, dtype: int64

In [54]:
# sampling_strategy = {
#     5: 451,   # Keep the majority class unchanged
#     6: 367, # 432/531
#     4: 297, # 349/531
#     7: 288, # 339/531
#     3: 250, # 227/531
#     2: 150, # 78/531
#     8: 125, # 52/531
#     9: 70, # 28/531
#     0: 50, # 14/531
#     1: 50, # 9/531
# }

In [55]:
sampling_strategy= {
    5: 451,   # Keep the majority class unchanged
    6: 367, # 432/531
    4: 297, # 349/531
    7: 288, # 339/531
    3: 250, # 227/531
    2: 150, # 78/531
    8: 125, # 52/531
    9: 75, # 28/531
    0: 50, # 14/531
    1: 50, # 9/531
}

In [56]:
X_resampled,y_resampled=oversample(X_train,y_train,sampling_strategy)

In [57]:
classifier = DecisionTreeClassifier()

# Fit the classifier to the training data
classifier.fit(X_train, y_train)

# # Evaluate the classifier on the testing data
# accuracy = classifier.score(X_val, y_val)
# print("Accuracy:", accuracy)

# Heirarchy

In [331]:
def heirarchialModel(heirarchy,X,y):

    broad_trained_models = {}
    y1=y.copy()

    # Dictionary to store trained models for predicting finer classes within each broad class
    finer_trained_models = {}
    # Train classification models for broad classes
    for broad_level, broad_classes in heirarchy.items():
        broad_subset_indices = y.isin(broad_classes)
        y1[broad_subset_indices]=broad_level
        # print(broad_classes,broad_subset_indices)
        X_train_broad = X[broad_subset_indices]
        y_train_broad = y[broad_subset_indices]
        # X_train, X_test, y_train, y_test = train_test_split(broad_subset_X, broad_subset_y, test_size=0.2, random_state=69, stratify=broad_subset_y)
        # clf = RandomForestClassifier(random_state=69)
        clf=optimiseRandomForest(X_train_broad,y_train_broad)
        clf.fit(X_train_broad, y_train_broad)
        broad_trained_models[broad_level] = clf

    BroadModel = optimiseGboost(X,y1)
    BroadModel.fit(X, y1)
    return BroadModel,broad_trained_models

In [344]:
def hierarchical_classification(X_val,y_val,BroadModel,broad_trained_models):
    y_predict1=BroadModel.predict(X_val)
    y_predict=np.zeros((len(y_predict1)))
    for i in range(0,len(y_predict1)):
        y_predict[i]=broad_trained_models[y_predict1[i]].predict(X_val.iloc[[i]])[0]
    f1 = f1_score(y_val, y_predict,average='weighted')
    print(y_predict)
    print("F1 Score:", f1)  
    return y_predict



In [342]:
heirarchy = {
    0: [1, 2, 3],
    1: [4],
    2: [5, 6, 7, 8],
    3: [0, 9]
}
BroadModel,broad_trained_models=heirarchialModel(heirarchy,X_train,y_train)
hierarchical_classification(X_val,y_val,BroadModel,broad_trained_models)


Best Parameters: {'max_leaf_nodes': 200, 'min_samples_split': 2, 'n_estimators': 150}
Best F1 Score: 0.6341748879358635
Best Parameters: {'max_leaf_nodes': 200, 'min_samples_split': 2, 'n_estimators': 50}
Best F1 Score: 1.0
Best Parameters: {'max_leaf_nodes': 200, 'min_samples_split': 2, 'n_estimators': 100}
Best F1 Score: 0.3904616823342922
Best Parameters: {'max_leaf_nodes': 200, 'min_samples_split': 2, 'n_estimators': 150}
Best F1 Score: 0.5686147186147187
Best Parameters: {'learning_rate': 0.1, 'max_leaf_nodes': 200, 'n_estimators': 200}
Best F1 Score: 0.5307902964190554
F1 Score: 0.21980086607772314


array([5., 5., 4., 5., 5., 5., 5., 7., 5., 7., 7., 6., 5., 7., 7., 5., 6.,
       6., 6., 5., 8., 5., 7., 5., 5., 8., 5., 5., 7., 6., 7., 5., 6., 6.,
       6., 5., 5., 6., 6., 5., 6., 5., 5., 5., 5., 6., 5., 5., 5., 5., 5.,
       7., 6., 6., 6., 6., 2., 5., 0., 5., 5., 6., 5., 5., 6., 6., 7., 5.,
       5., 5., 6., 6., 5., 5., 7., 5., 6., 7., 6., 7., 4., 5., 7., 6., 6.,
       6., 5., 6., 4., 6., 6., 5., 7., 7., 7., 5., 7., 6., 5., 5., 7., 5.,
       7., 5., 5., 6., 7., 7., 5., 7., 5., 6., 7., 5., 6., 4., 6., 5., 5.,
       5., 7., 7., 5., 5., 5., 5., 5., 6., 6., 8., 6., 7., 5., 5., 6., 6.,
       5., 5., 6., 4., 5., 6., 5., 5., 6., 5., 6., 6., 6., 6., 5., 3., 6.,
       5., 6., 7., 6., 3., 6., 5., 5., 6., 7., 5., 5., 2., 5., 6., 5., 9.,
       4., 7., 5., 6., 7., 5., 6., 6., 6., 7., 5., 7., 6., 6., 5., 5., 6.,
       5., 6., 6., 3., 6., 6., 7., 7., 3., 5., 5., 6., 5., 2., 6., 7., 7.,
       7., 6., 4., 7., 6., 6., 5., 5., 5., 5., 6., 4., 7., 7., 3., 6., 5.,
       5., 6., 6., 6., 5.

In [350]:
y_t=np.ones(len(X_test))
y_test=hierarchical_classification(X_test,y_t,BroadModel,broad_trained_models)
y_test=y_test.astype(int)
df = pd.read_csv('test.csv')
df = df[['ID']]
df['Education'] = [list(education_mapping.keys())[i] for i in y_test]

# Save the DataFrame to a new CSV file
df.to_csv('output.csv', index=False)

[5. 7. 4. ... 5. 8. 5.]
F1 Score: 0.004357298474945534


# Decision Tree Classifier


In [195]:


dt_classifier=DecisionTreeClassifier(random_state=1)
dt_scoresA = cross_val_score(dt_classifier, X_train, y_train, cv=5,scoring=f1_scorer)
dt_scoresF = cross_val_score(dt_classifier, X_resampled, y_resampled, cv=5,scoring=f1_scorer)
print("Decision Tree Classifier Cross-Validation Scores, A:", dt_scoresA.mean()," B: ",dt_scoresF.mean() )


Decision Tree Classifier Cross-Validation Scores, A: 0.21117148875251415  B:  0.24014656713212693


In [183]:
def optimiseDecisionTree(X,y):
    # Define the parameter grid to search
    param_grid = {
        # 'criterion': ['gini', 'entropy'],
        # 'max_depth': [1, 2, 5, 10, 15],
        'min_samples_split': [2, 3, 5, 10],
        'min_samples_leaf': [1, 2,5, 10, 15],
        # 'max_features': ['auto', 'sqrt', 'log2'],
        'max_leaf_nodes':[200, 500,700,1000,1500,2000]
    }
    dt_classifier=DecisionTreeClassifier(random_state=1)
    best_params=optimiseModel(dt_classifier,X,y,param_grid)
    # Initialize GridSearchCV with F1 score as the scoring metric
    best_dt_classifier = DecisionTreeClassifier(random_state=1,**best_params)
    return best_dt_classifier

In [273]:
best_dt_classifier=optimiseDecisionTree(X_resampled,y_resampled)
best_dt_classifier.fit(X_resampled, y_resampled)
getF1Score(best_dt_classifier,X_val,y_val)

Best Parameters: {'max_leaf_nodes': 700, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best F1 Score: 0.2535454168806068
F1 Score: 0.19831961914027646


In [None]:
train=createSubmission(education_mapping,best_dt_classifier,X_test)


In [None]:
best_dt_classifier.fit(X, y)
full=createSubmission(education_mapping,best_dt_classifier,X_test)

# Random Forest Classifier

In [204]:
rf_classifier = RandomForestClassifier(random_state=69,oob_score=True)
rf_scoresA = cross_val_score(rf_classifier, X_train, y_train, cv=5,scoring=f1_scorer)
rf_scoresF = cross_val_score(rf_classifier, X_resampled, y_resampled, cv=5, scoring=f1_scorer)
print("Random Forest Classifier Cross-Validation Scores, A:", rf_scoresA.mean()," B: " ,rf_scoresF.mean())

Random Forest Classifier Cross-Validation Scores, A: 0.19813724271775218  B:  0.2642110347666645


In [272]:
def optimiseRandomForest(X,y) :
  # Define the parameter grid to search
  # param_grid = {
  #     'n_estimators': [100,500,1000],
  #     # 'criterion': ['gini', 'entropy'],
  #     # 'max_depth': [None, 10, 20],
  #     'min_samples_split': [5,7,9,11],
  #     # 'min_samples_leaf': [1, 2, 5,7,9],
  #     'max_leaf_nodes':[200,500,1000,2000]
  # }

  #     # 'max_features': ['auto', 'sqrt', 'log2']
  # # }
  param_grid = {
  'n_estimators': [50, 100, 150,200],
#   Consider both Gini impurity and entropy for split criterion
  # 'max_depth': [2, 5, 10, 20, 100],  # Allow the tree to grow deeper or limit its depth
  'min_samples_split': [2,5,7,9],  # Vary the minimum number of samples required to split an internal node
  'max_leaf_nodes':[200,500,700,1000]
  # 'min_samples_leaf': [1, 2, 5], 
  # 'min_impurity_decrease' : [0.001, 0.005, 0.0001] # Vary the minimum number of samples required to be a leaf node
  # 'max_features': ['auto', 'sqrt', 'log2'],  # Consider different ways of selecting features for splitting
  # 'max_leaf_nodes': [10, 100, 200, 500]  # Limit the maximum number of leaf nodes
  }
  # param_grid = {'n_estimators':[100,500,1000,1500],
  #               'max_leaf_nodes':[200,500,1000],
  #               'min_samples_split':[2,5,7,10,11]
  #             #   'max_depth': [None,2,5,10]
  #               }
  rf_classifier=RandomForestClassifier(random_state=69)
  best_params=optimiseModel(rf_classifier,X,y,param_grid)
  best_rf_classifier = RandomForestClassifier(random_state=69,**best_params)
  return best_rf_classifier

  

In [351]:
best_rf_classifier = optimiseRandomForest(X_resampled,y_resampled)
best_rf_classifier.fit(X_resampled, y_resampled)
getF1Score(best_rf_classifier,X_val,y_val)


Best Parameters: {'max_leaf_nodes': 200, 'min_samples_split': 2, 'n_estimators': 100}
Best F1 Score: 0.28235528237771435
F1 Score: 0.23230705742940352


In [357]:
getF1Score(best_rf_classifier,X_val,y_val)


F1 Score: 0.5844660736615426


In [352]:
best_rf_classifier.fit(X, y)
createSubmission(education_mapping,best_rf_classifier,X_test)

array([5, 6, 5, ..., 5, 5, 6])

In [None]:
getF1Score(best_rf_classifier,X_val,y_val)


F1 Score: 0.18562478265451318


# Logistic Regression

In [276]:

# Logistic Regression
logistic_classifier = LogisticRegression(max_iter=10000)
logistic_scoresA = cross_val_score(logistic_classifier, X_train, y_train, cv=5,scoring=f1_scorer)
logistic_scoresF = cross_val_score(logistic_classifier, X_resampled, y_resampled, cv=5,scoring=f1_scorer)
print("Logistic Regression Cross-Validation Scores, A:", logistic_scoresA.mean()," B: ",logistic_scoresF.mean() )

Logistic Regression Cross-Validation Scores, A: 0.20275621636921418  B:  0.18389284739598938


In [281]:


# Define the parameter grid to search
param_grid = {
    'penalty': ['l1', "l2"],  # Regularization penalty
    'C': [5, 10, 100, 500, 600, 1000], # Inverse of regularization strength,
    "solver" : ["liblinear", "lbfgs"]
}


# Initialize GridSearchCV with F1 score as the scoring metric
grid_search = GridSearchCV(estimator=logistic_classifier, param_grid=param_grid, cv=5, scoring=f1_scorer, n_jobs=-1)

# Perform grid search to find the best parameters
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best F1 Score:", best_score)


Best Parameters: {'C': 5, 'penalty': 'l2', 'solver': 'lbfgs'}
Best F1 Score: 0.2051587348632707


30 fits failed out of a total of 120.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py", line 1169, in fit
    solv

In [282]:
# grid_search.fit(X_resampled, y_resampled)
getF1Score(grid_search,X_val,y_val)

F1 Score: 0.19813331177489987


# Support Vector Machine

In [300]:



# Support Vector Machine
svm_classifier = SVC(kernel='linear')
svm_scoresA = cross_val_score(svm_classifier, X_resampled, y_resampled, cv=5,scoring=accuracy_scorer)
svm_scoresF = cross_val_score(svm_classifier, X_train, y_train, cv=5,scoring=f1_scorer)
print("Support Vector Machine Cross-Validation Scores, A:", svm_scoresA.mean()," B: ",svm_scoresF.mean() )

Support Vector Machine Cross-Validation Scores, A: 0.23016400859631264  B:  0.19895867643669268


In [301]:
def optimiseSVM(X,y):
        
    param_grid = {
        'C': [0.1, 1, 10, 100],  
        'gamma': [1, 0.1, 0.01, 0.001], 
        'kernel': ['rbf', 'linear', 'poly']
    }

    # Initialize K-Nearest Neighbors Classifier
    svm_classifier = SVC(random_state=42)
    
    # Initialize GridSearchCV with F1 score as the scoring metric
    best_params=optimiseModel(svm_classifier,X,y,param_grid)
    best_knn_classifier = SVC(random_state=42,**best_params)

    return best_knn_classifier

In [304]:
best_svm_classifier = optimiseSVM(X_resampled,y_resampled)

best_svm_classifier.fit(X_train, y_train)
getF1Score(best_svm_classifier,X_val,y_val)



Best Parameters: {'C': 100, 'gamma': 0.1, 'kernel': 'rbf'}
Best F1 Score: 0.2544695560370561
F1 Score: 0.2192395834962284


# K nearest neighbour

In [283]:
# KNN
def optimiseKNN(X,y):
    param_grid = {
        'n_neighbors': [3,4, 5,6, 9,11,15],  # Number of neighbors
        'weights': ['uniform', 'distance'],  # Weight function used in prediction
    }

    # Initialize K-Nearest Neighbors Classifier
    knn_classifier = KNeighborsClassifier()
    
    # Initialize GridSearchCV with F1 score as the scoring metric
    best_params=optimiseModel(knn_classifier,X,y,param_grid)
    best_knn_classifier = KNeighborsClassifier(**best_params)

    return best_knn_classifier


In [287]:
best_knn_classifier = optimiseKNN(X_resampled,y_resampled)

best_knn_classifier.fit(X_resampled, y_resampled)
getF1Score(best_knn_classifier,X_val,y_val)


Best Parameters: {'n_neighbors': 3, 'weights': 'distance'}
Best F1 Score: 0.2491321779105605
F1 Score: 0.21067885474632797


In [None]:
best_knn_classifier.fit(X_resampled, y_resampled)
createSubmission(education_mapping,best_knn_classifier,X_test)

array([4, 5, 4, ..., 5, 3, 5])

In [292]:
nca = NeighborhoodComponentsAnalysis(random_state=42)
knn = KNeighborsClassifier(n_neighbors=13)
nca_pipe = Pipeline([('nca', nca), ('knn', knn)])
nca_pipe.fit(X_resampled, y_resampled)
print(nca_pipe.score(X_val, y_val))

0.16828478964401294


In [295]:

# Define the parameter grid to search
param_grid = {
    'nca__n_components': [1,2,3],  # Number of components for NCA
    'knn__n_neighbors': [5,7,9,11],  # Number of neighbors for 
    'knn__p':[1,2],
    'knn__weights': ['uniform', 'distance'],  # Weight function used in prediction for KNN
}

# Create pipeline
nca = NeighborhoodComponentsAnalysis(random_state=42)
knn = KNeighborsClassifier()
nca_pipe = Pipeline([('nca', nca), ('knn', knn)])

# Initialize GridSearchCV with F1 score as the scoring metric
grid_search = GridSearchCV(estimator=nca_pipe, param_grid=param_grid, cv=5, scoring=f1_scorer, n_jobs=-1)

# Perform grid search to find the best parameters
grid_search.fit(X_resampled, y_resampled)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best F1 Score:", best_score)


Best Parameters: {'knn__n_neighbors': 5, 'knn__p': 2, 'knn__weights': 'distance', 'nca__n_components': 3}
Best F1 Score: 0.22587835015826863


In [298]:
# Define parameters for NCA
nca_params = {'n_components': 3, 'random_state': 42}

# Define parameters for KNN
knn_params = {'n_neighbors': 5, 'weights': 'distance','p':2}

# Create the pipeline with parameters
nca_pipe = Pipeline([
    ('nca', NeighborhoodComponentsAnalysis(**nca_params)),
    ('knn', KNeighborsClassifier(**knn_params))
])
best_knn_classifier = nca_pipe
best_knn_classifier.fit(X_resampled, y_resampled)
getF1Score(best_knn_classifier,X_val,y_val)


F1 Score: 0.21284737526579203


In [None]:
best_knn_classifier.fit(X_resampled, y_resampled)
createSubmission(education_mapping,best_knn_classifier,X_test)

array([3, 5, 3, ..., 5, 3, 6])

# Gradient Boosting Classifier

In [58]:
# Gradient Boosting Classifier
gb_classifier = GradientBoostingClassifier(random_state=69)
gb_scoresA = cross_val_score(gb_classifier, X_train, y_train, cv=5,scoring=f1_scorer)
gb_scoresF = cross_val_score(gb_classifier, X_resampled, y_resampled, cv=5,scoring=f1_scorer)
print("Gradient Boosting Classifier Cross-Validation Scores, A:", gb_scoresA.mean()," B: ",gb_scoresF.mean() )

Gradient Boosting Classifier Cross-Validation Scores, A: 0.19882844129312943  B:  0.24478875758451446


In [59]:
def optimiseGboost(X,y):
# Define the parameter grid to search
    param_grid = {
        'n_estimators': [100, 150,200],
        'learning_rate': [0.05, 0.01, 0.1, 0.2, 0.3],
        # 'max_depth': [1, 3, 5, 10, 20],
        # 'min_samples_split': [1, 5, 10],
        # 'min_samples_leaf': [2, 5, 10, 50],
        'max_leaf_nodes':[200,500,1000],
    }
   # Initialize K-Nearest Neighbors Classifier
    gb_classifier = GradientBoostingClassifier(random_state=69)
    
    # Initialize GridSearchCV with F1 score as the scoring metric
    best_params=optimiseModel(gb_classifier,X,y,param_grid)
    best_gb_classifier = GradientBoostingClassifier(random_state=69,**best_params)

    return best_gb_classifier

# Initialize GridSearchCV with F1 score as the scoring metric



In [60]:
best_gb_classifier = optimiseGboost(X_train,y_train)
best_gb_classifier.fit(X_train, y_train)
getF1Score(best_gb_classifier,X_val,y_val)

Best Parameters: {'learning_rate': 0.2, 'max_leaf_nodes': 200, 'n_estimators': 100}
Best F1 Score: 0.21468203688696635
F1 Score: 0.22363661916400157


In [62]:
best_gb_classifier.fit(X, y)
createSubmission(education_mapping,best_gb_classifier,X_test)

array([4, 4, 3, ..., 5, 8, 6])