In [289]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


trainData=pd.read_csv("./train.csv",header=0)
testData=pd.read_csv("./test.csv",header=0)

### converting liabilities and assets to int

In [290]:
# Function to convert the 'Total Assets' column to float in Crore
def convert_to_crore(value):
    if 'Crore+' in value:
        return float(value.replace(' Crore+', ''))
    elif 'Lac+' in value:
        return float(value.replace(' Lac+', '')) / 100
    elif 'Thou+' in value:
        return float(value.replace(' Thou+', '')) / 10000
    elif 'Hund+' in value:
        return float(value.replace(' Hund+', '')) / 100000
    else:
        return float(value)
    
# Apply the conversion function to the 'Total Assets' column
trainData['Total Assets'] = trainData['Total Assets'].apply(convert_to_crore)
testData['Total Assets'] = testData['Total Assets'].apply(convert_to_crore)

trainData['Liabilities'] = trainData['Liabilities'].apply(convert_to_crore)
testData['Liabilities'] = testData['Liabilities'].apply(convert_to_crore)

In [291]:
trainData["Criminal Case"]=trainData["Criminal Case"]/trainData["Criminal Case"].max()
testData["Criminal Case"]=testData["Criminal Case"]/testData["Criminal Case"].max()

trainData["Total Assets"]=trainData["Total Assets"]/trainData["Total Assets"].max()
testData["Total Assets"]=testData["Total Assets"]/testData["Total Assets"].max()

trainData["Liabilities"]=trainData["Liabilities"]/trainData["Liabilities"].max()
testData["Liabilities"]=testData["Liabilities"]/testData["Liabilities"].max()

trainData.head()

Unnamed: 0,ID,Candidate,Constituency ∇,Party,Criminal Case,Total Assets,Liabilities,state,Education
0,0,M.K. Mohan,ANNA NAGAR,DMK,0.045977,0.166535,0.00227,TAMIL NADU,8th Pass
1,1,Khatik Ramesh Prasad,KARERA (SC),BJP,0.0,0.000789,0.0,MADHYA PRADESH,12th Pass
2,2,Dr. Mantar Gowda,MADIKERI,INC,0.0,0.005525,0.00025,KARNATAKA,Post Graduate
3,3,Kundan Kumar,BEGUSARAI,BJP,0.0,0.007103,0.000272,BIHAR,Post Graduate
4,4,Swapan Majumder,BANGAON DAKSHIN (SC),BJP,0.022989,0.001579,0.000692,WEST BENGAL,8th Pass


### Encoding states and parties 

In [292]:

total_states = trainData["state"].unique()
total_parties = trainData["Party"].unique()

# Create total_states columns
for state in total_states:
    trainData[state] = (trainData["state"] == state).astype(bool)
    testData[state] = (testData["state"] == state).astype(bool)

# Create total_parties columns
for party in total_parties:
    trainData[party] = (trainData["Party"] == party).astype(bool)
    testData[party] = (testData["Party"] == party).astype(bool)


trainData.head()


Unnamed: 0,ID,Candidate,Constituency ∇,Party,Criminal Case,Total Assets,Liabilities,state,Education,TAMIL NADU,...,CPI(M),NCP,TDP,NDPP,CPI,Sikkim Krantikari Morcha,JD(U),JMM,JD(S),Tipra Motha Party
0,0,M.K. Mohan,ANNA NAGAR,DMK,0.045977,0.166535,0.00227,TAMIL NADU,8th Pass,True,...,False,False,False,False,False,False,False,False,False,False
1,1,Khatik Ramesh Prasad,KARERA (SC),BJP,0.0,0.000789,0.0,MADHYA PRADESH,12th Pass,False,...,False,False,False,False,False,False,False,False,False,False
2,2,Dr. Mantar Gowda,MADIKERI,INC,0.0,0.005525,0.00025,KARNATAKA,Post Graduate,False,...,False,False,False,False,False,False,False,False,False,False
3,3,Kundan Kumar,BEGUSARAI,BJP,0.0,0.007103,0.000272,BIHAR,Post Graduate,False,...,False,False,False,False,False,False,False,False,False,False
4,4,Swapan Majumder,BANGAON DAKSHIN (SC),BJP,0.022989,0.001579,0.000692,WEST BENGAL,8th Pass,False,...,False,False,False,False,False,False,False,False,False,False


### Encoding Education column of trainData


In [293]:

mapper = {'Others':0 ,'Literate': 1, '5th Pass': 2, '8th Pass': 3, '10th Pass': 4, '12th Pass': 5, 'Graduate': 6, 'Post Graduate': 7, 'Graduate Professional': 8, 'Doctorate': 9}
reverse_mapper = {v: k for k, v in mapper.items()}

trainData['Education'] = trainData['Education'].map(mapper)

In [294]:
# Get unique entries in 'Education' column
unique_entries = sorted(trainData['Education'].unique())

# Create new columns based on unique entries
for i, entry in enumerate(unique_entries[:-1]):  # We exclude the last entry as there's no greater value
    trainData[f'Education_gt_{entry}'] = (trainData['Education'] > entry).astype(int)

trainData.head()

Unnamed: 0,ID,Candidate,Constituency ∇,Party,Criminal Case,Total Assets,Liabilities,state,Education,TAMIL NADU,...,Tipra Motha Party,Education_gt_0,Education_gt_1,Education_gt_2,Education_gt_3,Education_gt_4,Education_gt_5,Education_gt_6,Education_gt_7,Education_gt_8
0,0,M.K. Mohan,ANNA NAGAR,DMK,0.045977,0.166535,0.00227,TAMIL NADU,3,True,...,False,1,1,1,0,0,0,0,0,0
1,1,Khatik Ramesh Prasad,KARERA (SC),BJP,0.0,0.000789,0.0,MADHYA PRADESH,5,False,...,False,1,1,1,1,1,0,0,0,0
2,2,Dr. Mantar Gowda,MADIKERI,INC,0.0,0.005525,0.00025,KARNATAKA,7,False,...,False,1,1,1,1,1,1,1,0,0
3,3,Kundan Kumar,BEGUSARAI,BJP,0.0,0.007103,0.000272,BIHAR,7,False,...,False,1,1,1,1,1,1,1,0,0
4,4,Swapan Majumder,BANGAON DAKSHIN (SC),BJP,0.022989,0.001579,0.000692,WEST BENGAL,3,False,...,False,1,1,1,0,0,0,0,0,0


### Training a model for each created classification using LogisticRegression

In [295]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report

# Selecting features and target variable
features = trainData.copy()

temparr1 = [f'Education_gt_{entry}' for entry in unique_entries[:-1]]
temparr2 = temparr1 + ['ID','Candidate','Constituency ∇', 'Party', 'state', 'Education']

features.drop(temparr2, axis=1, inplace=True)

target = trainData[temparr1]

In [296]:

# List to store models
models = []
f1_scores = []

# Create and train a model for each 'Education_gt_' column
for i, entry in enumerate(unique_entries[:-1]):
    # Get target column
    target_col = f'Education_gt_{entry}'

    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(features, target[target_col], test_size=0.1, random_state=42)


    # Create a logistic regression model
    model = LogisticRegression(max_iter=1000)

    # Fit the model
    model.fit(features, target[target_col])

    # Make predictions
    y_pred = model.predict(X_test)

    # store the f1 score
    f1_scores.append(f1_score(y_test, y_pred))

    # Store the model
    models.append(model)

print(f1_scores)

[0.9975669099756691, 0.9901960784313726, 0.9877149877149877, 0.9622166246851386, 0.8954423592493298, 0.7678018575851393, 0.24761904761904763, 0.0, 0.0]


In [297]:
testData.head()

Unnamed: 0,ID,Candidate,Constituency ∇,Party,Criminal Case,Total Assets,Liabilities,state,TAMIL NADU,MADHYA PRADESH,...,CPI(M),NCP,TDP,NDPP,CPI,Sikkim Krantikari Morcha,JD(U),JMM,JD(S),Tipra Motha Party
0,0,Geeta Bharat Jain,MEERA BHAYANDAR,IND,0.011561,0.04954,0.038869,MAHARASHTRA,False,False,...,False,False,False,False,False,False,False,False,False,False
1,1,Becharam Manna,SINGUR,AITC,0.00578,0.001415,0.000459,WEST BENGAL,False,False,...,False,False,False,False,False,False,False,False,False,False
2,2,Sunil Vijay Tingre,VADGAON SHERI,NCP,0.017341,0.034678,0.003534,MAHARASHTRA,False,False,...,False,True,False,False,False,False,False,False,False,False
3,3,Asit Mazumder (Tapan),CHUNCHURA,AITC,0.00578,0.001415,0.0,WEST BENGAL,False,False,...,False,False,False,False,False,False,False,False,False,False
4,4,Hriday Narayan Singh Patel,SAGRI,SP,0.0,0.011323,0.007067,UTTAR PRADESH,False,False,...,False,False,False,False,False,False,False,False,False,False


### Predicting the probabilities for each Classification

In [298]:
features = testData.copy()

temparr3 = ['ID','Candidate','Constituency ∇', 'Party', 'state']

features.drop(temparr3, axis=1, inplace=True)

features.head()

Unnamed: 0,Criminal Case,Total Assets,Liabilities,TAMIL NADU,MADHYA PRADESH,KARNATAKA,BIHAR,WEST BENGAL,UTTAR PRADESH,PUNJAB,...,CPI(M),NCP,TDP,NDPP,CPI,Sikkim Krantikari Morcha,JD(U),JMM,JD(S),Tipra Motha Party
0,0.011561,0.04954,0.038869,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,0.00578,0.001415,0.000459,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
2,0.017341,0.034678,0.003534,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
3,0.00578,0.001415,0.0,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
4,0.0,0.011323,0.007067,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False


In [299]:
# Create a DataFrame to store probabilities
probabilities = pd.DataFrame()

# Predict probabilities for each model
for i, model in enumerate(models):
    # feature names from the data used for predictions
    X_final = features

    # Predict probabilities
    proba = model.predict_proba(X_final)

    # Get the probability of the positive class
    proba = proba[:, 1]

    # Store the probabilities in the DataFrame
    probabilities[f'Education_gt_{unique_entries[i]}'] = proba

probabilities.head()

Unnamed: 0,Education_gt_0,Education_gt_1,Education_gt_2,Education_gt_3,Education_gt_4,Education_gt_5,Education_gt_6,Education_gt_7,Education_gt_8
0,0.994839,0.994247,0.991913,0.979038,0.785092,0.550997,0.343892,0.137773,0.010802
1,0.996634,0.996926,0.996936,0.948478,0.853159,0.631902,0.382654,0.18562,0.019809
2,0.994542,0.993968,0.991883,0.978846,0.798083,0.572906,0.305315,0.175801,0.010504
3,0.996634,0.996926,0.996936,0.948476,0.853163,0.631896,0.382634,0.185638,0.01981
4,0.997524,0.996769,0.99704,0.990566,0.933511,0.765933,0.513721,0.198188,0.008913


In [300]:
# Create a new DataFrame 'final' with one more column than 'probabilities'
final_cols = list(probabilities.columns) + [f'Education_gt_{unique_entries[-1]}']
final = pd.DataFrame(index=probabilities.index, columns=final_cols)

# Set values for the first column
final.iloc[:, 0] = 1 - probabilities.iloc[:, 0]

# Set values for the intermediate columns
for i in range(1, len(probabilities.columns)):
    final.iloc[:, i] = probabilities.iloc[:, i-1] - probabilities.iloc[:, i]

# Set values for the last column
final.iloc[:, -1] = probabilities.iloc[:, -1]

final.head()


Unnamed: 0,Education_gt_0,Education_gt_1,Education_gt_2,Education_gt_3,Education_gt_4,Education_gt_5,Education_gt_6,Education_gt_7,Education_gt_8,Education_gt_9
0,0.005161,0.000592,0.002334,0.012875,0.193945,0.234095,0.207105,0.206119,0.126971,0.010802
1,0.003366,-0.000292,-1e-05,0.048457,0.095319,0.221257,0.249248,0.197033,0.165811,0.019809
2,0.005458,0.000574,0.002085,0.013036,0.180763,0.225177,0.267591,0.129514,0.165297,0.010504
3,0.003366,-0.000292,-1e-05,0.048459,0.095314,0.221267,0.249263,0.196995,0.165828,0.01981
4,0.002476,0.000754,-0.000271,0.006474,0.057056,0.167578,0.252212,0.315533,0.189275,0.008913


In [301]:
# Create a DataFrame with a single column 'Education'
final_df=pd.DataFrame()
final_df["ID"]=testData["ID"] # changed
final_df["Education"] = final.idxmax(axis=1).to_frame()

final_df.head()

Unnamed: 0,ID,Education
0,0,Education_gt_5
1,1,Education_gt_6
2,2,Education_gt_6
3,3,Education_gt_6
4,4,Education_gt_7


In [302]:
# Reverse the mapper dictionary
reverse_mapper = {v: k for k, v in mapper.items()}

# Modify the keys in the reverse_mapper dictionary
modified_mapper = {f'Education_gt_{k}': v for k, v in reverse_mapper.items()}

# Map the entries of the 'Education' column using the modified mapper
final_df['Education'] = final_df['Education'].map(modified_mapper)

final_df.head()

Unnamed: 0,ID,Education
0,0,12th Pass
1,1,Graduate
2,2,Graduate
3,3,Graduate
4,4,Post Graduate


In [303]:
comp_submission = pd.read_csv("../answers.csv",header=0)

final_df[final_df["Education"]==comp_submission["Education"]].shape

(390, 2)

In [304]:
final_df.to_csv('submission.csv', index=False)