In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import math
import os

In [30]:
# setting path for the relevant locations

current_dir = os.getcwd()                      # current working directory
parent_dir = os.path.dirname(current_dir)      # parent directory
data_set = parent_dir+'/data'                  # setting the path for data directory

In [31]:
#Reading the datasets and removing the 'Cost of Construction' feature as it doesn't give well separated classes
train = pd.read_csv(data_set + '/ClassificationTrain.csv').drop(['Cost'], axis = 1)
test = pd.read_csv(data_set + '/ClassificationTest.csv').drop(['Cost'], axis = 1)

In [32]:
#Including the 'Cost of Maintainence' feature which gives well separated classes
df = pd.read_csv(data_set + '/Cost_of_maintainence.csv')
costMaintainence = df.groupby('District')['No. of Dugwells according to the annual cost of maintenance - Upto Rs. 1000',
       'No. of Dugwells according to the annual cost of maintenance - Rs.1000 to 10000 ',
       'No. of Dugwells according to the annual cost of maintenance - Rs.10000 to 50000',
       'No. of Dugwells according to the annual cost of maintenance - Rs.50000 to 100000',
       'No. of Dugwells according to the annual cost of maintenance - More than Rs. 100000'].sum()
costMaintainence['total'] = costMaintainence['No. of Dugwells according to the annual cost of maintenance - Upto Rs. 1000'] + costMaintainence['No. of Dugwells according to the annual cost of maintenance - Rs.1000 to 10000 '] + costMaintainence['No. of Dugwells according to the annual cost of maintenance - Rs.10000 to 50000'] + costMaintainence['No. of Dugwells according to the annual cost of maintenance - Rs.50000 to 100000'] + costMaintainence['No. of Dugwells according to the annual cost of maintenance - More than Rs. 100000']  
costMaintainence['total_cost'] = 1000*costMaintainence['No. of Dugwells according to the annual cost of maintenance - Upto Rs. 1000'] + 5500*costMaintainence['No. of Dugwells according to the annual cost of maintenance - Rs.1000 to 10000 '] + 30000*costMaintainence['No. of Dugwells according to the annual cost of maintenance - Rs.10000 to 50000'] + 75000*costMaintainence['No. of Dugwells according to the annual cost of maintenance - Rs.50000 to 100000'] + 100000*costMaintainence['No. of Dugwells according to the annual cost of maintenance - More than Rs. 100000']
costMaintainence['avg_cost'] = costMaintainence['total_cost']/costMaintainence['total']

In [33]:
#Including the 'Mechanical Break Down - No.' feature to get better classes
df1 = pd.read_csv(data_set + '/Constraints.csv')
breakdown = df1.groupby('District')['Mechanical Break Down - No.'].sum()

In [34]:
#Merging the train antest datasets with the new features
trainBreakdown = train.merge(breakdown, on = 'District')
trainUpdated = trainBreakdown.merge(costMaintainence['avg_cost'], on = 'District')
testBreakdown = test.merge(breakdown, on = 'District')
testUpdated = testBreakdown.merge(costMaintainence['avg_cost'], on = 'District')

In [35]:
#Normalizing the 'Mechanical Break Down - No.' feature
max_breakdown = trainUpdated['Mechanical Break Down - No.'].max()
min_breakdown = trainUpdated['Mechanical Break Down - No.'].min()
diff = max_breakdown - min_breakdown
trainUpdated['Mechanical Break Down - No.'] = (trainUpdated['Mechanical Break Down - No.']-min_breakdown)/diff
max_breakdown = testUpdated['Mechanical Break Down - No.'].max()
min_breakdown = testUpdated['Mechanical Break Down - No.'].min()
diff = max_breakdown - min_breakdown
testUpdated['Mechanical Break Down - No.'] = (testUpdated['Mechanical Break Down - No.']-min_breakdown)/diff

In [36]:
#Binning the 'Cost of maintainence feature' into 2 classes: <7500 and >7500 i.e. low cost and high cost
bins = [0, 7500]
names = ['<7500', '>7500']
d = dict(enumerate(names, 1))
trainUpdated['Cost_Class'] = np.vectorize(d.get)(np.digitize(trainUpdated['avg_cost'], bins))
testUpdated['Cost_Class'] = np.vectorize(d.get)(np.digitize(testUpdated['avg_cost'], bins))

In [37]:
#Labelling low cost class as 0 and high cost class as 1
trainFinal = trainUpdated.replace('<7500',0).replace('>7500',1).drop(['avg_cost'], axis = 1)
testFinal = testUpdated.replace('<7500',0).replace('>7500',1).drop(['avg_cost'], axis = 1)

In [43]:
#Calculating the mean and standard deviation for each feature in each class
train_0 = trainFinal.loc[trainFinal['Cost_Class'] == 0].drop(['Cost_Class','District'], axis = 1)
train_1 = trainFinal.loc[trainFinal['Cost_Class'] == 1].drop(['Cost_Class','District'], axis = 1)
stats_0 = []
stats_1 = []
for (columnName, columnData) in train_0.iteritems():
    column_stat = [np.mean(columnData), np.std(columnData)]
    stats_0.append(column_stat)
for (columnName, columnData) in train_1.iteritems():
    column_stat = [np.mean(columnData), np.std(columnData)]
    stats_1.append(column_stat)

TypeError: Could not convert HIGHMEDIUMHIGHMEDIUMHIGHHIGHMEDIUMLOWLOWMEDIUMHIGHHIGHMEDIUMHIGHMEDIUMHIGHMEDIUMHIGHLOWLOWMEDIUMMEDIUMHIGHLOWLOWLOWMEDIUMMEDIUMHIGHMEDIUMMEDIUMHIGHMEDIUMHIGHHIGHHIGHLOWHIGHMEDIUMLOWHIGHMEDIUMHIGHMEDIUMMEDIUMMEDIUMLOWHIGHHIGHMEDIUMLOWLOWMEDIUMMEDIUMMEDIUMMEDIUMHIGHMEDIUMHIGHHIGHLOWHIGHMEDIUMMEDIUMMEDIUMMEDIUMLOWMEDIUMLOWMEDIUMMEDIUMHIGHLOWMEDIUMMEDIUMMEDIUMHIGHLOWMEDIUMLOWHIGHMEDIUMMEDIUMHIGHLOWMEDIUMMEDIUMLOWMEDIUMMEDIUMMEDIUMHIGHMEDIUMLOWMEDIUMMEDIUMHIGHMEDIUMMEDIUMLOWLOWLOWHIGHHIGHHIGHHIGHLOWMEDIUMMEDIUMMEDIUMMEDIUMHIGHMEDIUMHIGHHIGHLOWMEDIUMMEDIUMLOWHIGHMEDIUMMEDIUMMEDIUMMEDIUMLOWMEDIUMHIGHMEDIUMHIGHLOWLOWMEDIUMMEDIUMMEDIUMLOWLOWMEDIUMMEDIUMHIGHMEDIUMMEDIUMHIGHMEDIUMHIGHHIGHMEDIUMMEDIUMHIGHMEDIUMMEDIUMMEDIUMHIGHMEDIUMMEDIUMHIGHHIGHMEDIUMHIGHHIGHLOWHIGHLOWMEDIUMLOWHIGHHIGHHIGHHIGHHIGH to numeric

In [39]:
#Calcualting probablility of class 0 and class 1
p_0 = float(len(train_0))/len(trainFinal)
p_1 = float(len(train_1))/len(trainFinal)

In [40]:
#Function to find the probabilty of a test point belonging to class 0
def p0_x(stats_0, x, p_0):
    px_0 = 1
    j=0
    for i in stats_0:
        px_0 *= stats.norm.pdf(x[j], i[0], i[1])
        j+=1
    px_0 *= p_0
    return px_0

In [41]:
##Function to find the probabilty of a test point belonging to class 1
def p1_x(stats_1, x, p_1):
    px_1 = 1
    j=0
    for i in stats_1:
        px_1 *= stats.norm.pdf(x[j], i[0], i[1])
        j+=1
    px_1 *= p_1
    return px_1

In [42]:
#Testing the developed model on test data. F1 score is used as a measure of accuracy of the classifer. 
testData = testFinal.drop(['Cost_Class','District'],axis = 1).to_numpy()
testDataClass = testFinal['Cost_Class'].copy().to_numpy()
truePredicted = 0
truePositive = 0
totalTruePositive = 0
for i in range(len(testData)):
    p0 = p0_x(stats_0, testData[i], p_0)
    p1 = p1_x(stats_1, testData[i], p_1)
    if p0>p1:
        predictedClass = 0
    else:
        predictedClass = 1
    if predictedClass==testDataClass[i]:
        truePredicted +=1
    if predictedClass== testDataClass[i] and predictedClass ==0:
        truePositive +=1
    if testDataClass[i]==0:
        totalTruePositive +=1
precision = float(truePositive)/truePredicted
recall = float(truePositive)/totalTruePositive
f1_score = 2*precision*recall/(precision+recall)
print("Precision = %f" %precision)
print("Recall = %f" %recall)
print("F1 score = %f" %f1_score)

Precision = 1.000000
Recall = 1.000000
F1 score = 1.000000
