# EDA & Prediction on Data Of India General Election 2019 
Data Source: (https://www.kaggle.com/datasets/prakrutchauhan/indian-candidates-for-general-election-2019)



### Installing required Python lib...

In [1]:
!pip install numpy
!pip install pandas



In [2]:
!pip install matplotlib
!pip install seaborn
!pip install plotly



In [4]:
!pip install sklearn
!pip install xgboost

Collecting xgboost
  Downloading xgboost-1.6.2-py3-none-win_amd64.whl (125.4 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.6.2


### Importing required lib...

In [None]:
import sklearn as skl
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

from plotly.offline import init_notebook_mode, iplot 
import plotly.graph_objects as go
import plotly.offline as py
import plotly.express as px

py.init_notebook_mode(connected=True)
sns.set()

# Import for scaling the data
#from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder, LabelEncoder

# Import model evaluation metrics
#from sklearn.metrics import classification_report, confusion_matrix, plot_roc_curve
#from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
#from sklearn.model_selection import train_test_split, cross_val_score
#from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Plot should appear inside the jupyter notebook
%matplotlib inline


### 1.0) Importing & Analysing Data 

In [None]:
# Read the candidate dataset (path is relative to the notebook's working directory).
elections_csv = "Dataset/IndianElections/LS_2.0.csv"
df_elec = pd.read_csv(elections_csv)
df_elec.head()

In [None]:
df_elec.columns

In [None]:
# Several source headers contain embedded newlines; map them to snake_case names.
column_renames = {
    "CRIMINAL\nCASES": "criminal_cases",
    "GENERAL\nVOTES": "general_votes",
    "POSTAL\nVOTES": "postal_votes",
    "TOTAL\nVOTES": "total_votes",
    "OVER TOTAL ELECTORS \nIN CONSTITUENCY": "pct_over_total_electors_in_const",
    "OVER TOTAL VOTES POLLED \nIN CONSTITUENCY": "pct_over_total_vote_poll_in_const",
}
df_elec = df_elec.rename(columns=column_renames)
df_elec.info()

In [None]:
df_elec.isnull().sum()

==> After looking at the columns, we can see that about 245 rows have NULL values in 8 columns. We need to check whether there is a pattern.

In [None]:
# Which candidate names appear on the rows that have no SYMBOL?
symbol_missing = df_elec['SYMBOL'].isnull()
df_elec.loc[symbol_missing, 'NAME'].unique()

==> NOTA (None of The Above) is an option that lets voters exercise their right to vote even when they do not like any candidate on the list: they can cast their vote for NOTA.

Since this is NOTA, you won't see candidate details such as assets, gender, age, education, etc.

In [None]:
# SYMBOL is missing on the NOTA rows; put a placeholder value there.
is_nota = df_elec['NAME'] == 'NOTA'
df_elec.loc[is_nota, 'SYMBOL'] = 'NoNoNo'

In [None]:
df_elec.loc[df_elec.NAME=='NOTA','SYMBOL']

In [None]:
df_elec[df_elec.NAME=='NOTA']

In [None]:
df_elec[df_elec.NAME!='NOTA']

In [None]:
# NOTA rows have blank candidate details; fill each such column with a default.
nota_rows = df_elec['NAME'] == 'NOTA'
nota_defaults = {
    'GENDER': 'N/A',
    'criminal_cases': 0,
    'AGE': 0,
    'CATEGORY': 'N/A',
    'EDUCATION': 'N/A',
    'ASSETS': 0,
    'LIABILITIES': 0,
}
for column, default in nota_defaults.items():
    df_elec.loc[nota_rows, column] = default
df_elec[nota_rows]

In [None]:
df_elec.info()

In [None]:
df_elec.isnull().sum()

In [None]:
df_elec

### Use of Function & RegEx

In [None]:
# Let's clear the Asset & Liabilities data
# Remove Rs , \n and text
# Should only contain number
# Rs 1,28,78,51,556\n ~ 128 Crore+ => 1287851556
#

def clean_data(x):
    """Convert a rupee amount string to an integer number of rupees.

    Only the text before the first newline is used, and every non-digit
    character in it is dropped, e.g.
    ``"Rs 1,28,78,51,556\\n ~ 128 Crore+"`` -> ``1287851556``.

    Parameters
    ----------
    x : str, number or None
        Raw cell value. A value that is already numeric is returned as ``int``,
        so applying this function to an already-cleaned column leaves it
        unchanged instead of zeroing it.

    Returns
    -------
    int
        The parsed amount, or 0 when the text contains no digits (e.g. ``"Nil"``,
        ``"Not Available"``) or the value is missing (``None`` / NaN).
    """
    if not isinstance(x, str):
        try:
            return int(x)
        except (TypeError, ValueError, OverflowError):
            # None, NaN, infinity or some other non-numeric object.
            return 0
    digits = re.sub(r"\D", "", x.split('\n')[0].strip())
    # An empty digit string means there was no amount to parse.
    return int(digits) if digits else 0
    
clean_data("Rs 1,28,78,51,556\n ~ 128 Crore+")

In [None]:
# Turn both monetary columns into plain integers.
for money_col in ('ASSETS', 'LIABILITIES'):
    df_elec[money_col] = df_elec[money_col].apply(clean_data)
df_elec.head()

In [None]:
#Let's understand Education data
df_elec['EDUCATION'].unique()

In [None]:
df_elec['EDUCATION'].value_counts()

### Data Manipulation  

In [None]:
# Normalise the EDUCATION labels in a single pass. The result is assigned back to
# the column: calling replace(..., inplace=True) on `df_elec.EDUCATION` acts on an
# intermediate Series and does not update df_elec under pandas Copy-on-Write.
education_fixes = {
    'Post Graduate\n': 'Post Graduate',    # stray trailing newline
    '5th Pass': 'Illiterate',              # any level below 8th pass is treated as illiterate
    'Graduate Professional': 'Graduate',   # professional graduates are graduates
    'Literate': '8th Pass',                # 'Literate' is treated as 8th pass
    'N/A': 'Not Available',
}
df_elec['EDUCATION'] = df_elec['EDUCATION'].replace(education_fixes)

df_elec.EDUCATION.unique()

In [None]:
#Let's understand category data
df_elec['CATEGORY'].unique()

### Checking for Duplicates and remove if required! 

In [None]:
# A party should field a single candidate per seat, so look for repeated
# (STATE, CONSTITUENCY, PARTY) combinations.
party_seat_key = ['STATE', 'CONSTITUENCY', 'PARTY']
repeated_rows = df_elec.duplicated(party_seat_key)
duplicate = df_elec[repeated_rows]
duplicate
# Repeats do appear, because several independent candidates (PARTY == 'IND')
# can contest the same seat, which is a valid scenario.
# duplicate.PARTY.unique() lists the parties involved.

In [None]:
# Repeat the (STATE, CONSTITUENCY, PARTY) duplicate check with independent
# candidates excluded: each party should have one candidate per constituency.
not_independent = df_elec['PARTY'] != 'IND'
dups_chk = df_elec[not_independent]
repeated_party_rows = dups_chk.duplicated(['STATE', 'CONSTITUENCY', 'PARTY'])
duplicate = dups_chk[repeated_party_rows]
duplicate

=> This shows there are no duplicates based on the key once independent (IND) candidates are excluded!

### Use of Agg. Functions and adding columns in DF 

In [None]:
# Sum total_votes within each (STATE, CONSTITUENCY) group and write that sum
# onto every row of the group.
votes_by_seat = df_elec.groupby(["STATE", "CONSTITUENCY"])["total_votes"]
df_elec["tot_vote_casted"] = votes_by_seat.transform('sum')
df_elec

In [None]:
# Fixing criminal_cases column: 'Not Available' is counted as zero cases, then the
# column is cast to int. The cleaned Series is assigned back to the frame, because
# replace(..., inplace=True) on a column selection does not reliably update df_elec
# (it is a no-op under pandas Copy-on-Write).
df_elec['criminal_cases'] = (
    df_elec['criminal_cases']
    .replace('Not Available', '0')
    .astype(int)
)
# Sum of criminal cases within each (STATE, CONSTITUENCY, PARTY) group.
df_elec["criminal_cases_by_party"] = df_elec.groupby(["STATE","CONSTITUENCY","PARTY"])["criminal_cases"].transform('sum')

df_elec.info()

### I think we have good clean data now, Let's try to find some insights

In [None]:
# Share of winning candidates per state.
winners = df_elec[df_elec['WINNER'] == 1]

#pie(x, explode, labels, colors, autopct, pctdistance, shadow, labeldistance, startangle, radius, 
#counterclock, wedgeprops, textprops, center, frame, rotatelabels, normalize, data)

# value_counts() is sorted by count while unique() is in order of first appearance,
# so the labels must come from the counts' own index to line up with the wedges.
winners_by_state = winners['STATE'].value_counts()
chart_labels = winners_by_state.index
plt.pie(winners_by_state, labels=chart_labels, radius=7)
plt.show()

In [None]:
#For Criminal Cases

# .copy() makes an independent frame, so adding a column below does not write into
# a slice of df_elec (which triggers SettingWithCopyWarning).
df_criminal_cases = df_elec.loc[
    (df_elec['criminal_cases'].notnull()) & (df_elec['criminal_cases'] != 'Not Available')
].copy()

def criminal_cases(row):
    """Return 'No' when the row's criminal_cases value is 0, otherwise 'Yes'."""
    if row['criminal_cases'] == 0:
        return 'No'
    else:
        return 'Yes'

df_criminal_cases['HAS CRIMINAL CASE'] = df_criminal_cases.apply(criminal_cases, axis=1)
# Number of rows for every (has criminal case, winner) combination.
df_criminal_cases_count = (
    df_criminal_cases
    .groupby(['HAS CRIMINAL CASE', 'WINNER'])
    .size()
    .reset_index(name='COUNT')
)

fig = px.bar(df_criminal_cases_count, x="HAS CRIMINAL CASE", y="COUNT", color='WINNER')
fig.show()

==> This shows voters do not like candidates with criminal cases, but in many cases such candidates have still won!

In [None]:
# Candidate counts per education level, split by the WINNER flag.
plt.figure(figsize=(19, 10))
sns.countplot(data=df_elec, x='EDUCATION', hue='WINNER')

==> This shows voters like educated representatives!

### 2.0) Labeling and Preparing Data for model

In [None]:
# Let's try to understand if there is correlation between different features.
# Only numeric columns are passed on: DataFrame.corr() raises on text columns in
# pandas >= 2.0, where non-numeric columns are no longer dropped silently.
numeric_features = df_elec.select_dtypes(include='number')
dataplot = sns.heatmap(numeric_features.corr(), cmap="YlGnBu", annot=True)

# displaying heatmap
plt.show()

In [None]:
# Label parties with 10 or fewer candidate rows as 'Other'.
party_sizes = df_elec['PARTY'].value_counts()
smallcnt_candidates = party_sizes[party_sizes <= 10].index.tolist()
# Set copy of the list so the per-row membership test does not scan the whole list.
_small_party_lookup = frozenset(smallcnt_candidates)

def other_party(data):
    """Return 'Other' if `data` is one of the small parties, else `data` unchanged."""
    if data in _small_party_lookup:
        return 'Other'
    return data

df_elec['PARTY'] = df_elec['PARTY'].apply(other_party)
df_elec['PARTY'].value_counts()

In [None]:
df_elec.info()

In [None]:
# Columns kept for modelling: the WINNER target plus text and numeric features.
# NOTE(review): total_votes is itself an election outcome, so using it as a
# feature presumably leaks the target - confirm this is intended.
txt_cols = ['STATE', 'CONSTITUENCY', 'GENDER', 'CATEGORY', 'EDUCATION']
num_cols = ['criminal_cases', 'AGE', 'total_votes', 'TOTAL ELECTORS', 'ASSETS', 'LIABILITIES']
model_columns = [
    'STATE', 'CONSTITUENCY', 'WINNER', 'GENDER', 'criminal_cases', 'AGE',
    'CATEGORY', 'EDUCATION', 'total_votes', 'TOTAL ELECTORS', 'ASSETS', 'LIABILITIES',
]
df1 = df_elec[model_columns]

In [None]:
# Pie chart of how many rows carry each WINNER value.
winner_counts = df1['WINNER'].value_counts()
df_winner = winner_counts.reset_index()
df_winner.columns = ['RESULT', 'COUNT']

layout = go.Layout(title='Total Candidates vs Winners')
pie = go.Pie(labels=df_winner['RESULT'], values=df_winner['COUNT'])
fig = go.Figure(data=[pie], layout=layout)
py.iplot(fig)

### Data Scaling & Up-Sampling

In [None]:
from sklearn.preprocessing import StandardScaler

# One-hot encode the text columns, then standardise the numeric columns in place.
dataset = pd.get_dummies(df1, columns=txt_cols)
standardScaler = StandardScaler()
columns_to_scale = num_cols
scaled_values = standardScaler.fit_transform(dataset[columns_to_scale])
dataset[columns_to_scale] = scaled_values
dataset.head()

In [None]:
from sklearn.utils import resample

# Split the rows by class, then draw WINNER == 1 rows with replacement and
# stack them under the WINNER == 0 rows.
# NOTE(review): n_samples=1452 is hard-coded; presumably it was meant to match the
# number of WINNER == 0 rows - confirm against the value counts printed below.
# NOTE(review): resampling before cross-validation puts copies of the same row in
# different folds, which likely inflates the CV scores - confirm.
df_majority = dataset.loc[dataset['WINNER'] == 0]
df_minority = dataset.loc[dataset['WINNER'] == 1]
df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=1452,
    random_state=0,
)
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
df_upsampled['WINNER'].value_counts()

### Choosing an appropriate model 

### Model 

In [None]:
# Separate the feature matrix from the WINNER target.
X = df_upsampled.drop(columns=['WINNER'])
y = df_upsampled['WINNER']

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Mean 10-fold cross-validation score for forests of 1 to 20 trees.
rfc_scores = []
for k in range(1, 21):
    randomforest_classifier = RandomForestClassifier(n_estimators=k, random_state=0)
    score = cross_val_score(randomforest_classifier, X, y, cv=10)
    rfc_scores.append(score.mean())

# Plot the mean score against the number of trees and annotate every point.
estimator_counts = list(range(1, 21))
plt.figure(figsize=(20, 7))
plt.plot(estimator_counts, rfc_scores, color='red')
for i, mean_score in zip(estimator_counts, rfc_scores):
    plt.text(i, mean_score, (i, round(mean_score, 3)))
plt.xticks(estimator_counts)
plt.xlabel('Number of Estimators (K)')
plt.ylabel('Scores')
plt.title('Random Forest Classifier scores for different K values')

In [None]:
# Per the graph, accuracy is highest at k = 14/16, so n_estimators=14 is used here.
randomforest_classifier = RandomForestClassifier(n_estimators=14, random_state=0)
score = cross_val_score(randomforest_classifier, X, y, cv=10)
mean_accuracy_pct = round(score.mean() * 100, 4)
print('% Accuracy :', mean_accuracy_pct)
