In [None]:
import pandas as pd
from math import radians, sin, cos, sqrt, atan2
import numpy as np
from sklearn.cluster import DBSCAN
import os
import seaborn as sns
import matplotlib.pyplot as plt
import folium

In [None]:

# Load the input CSV into the working DataFrame.
# NOTE(review): the path is absolute and specific to one user's machine; it has
# to be edited before this notebook can run anywhere else.
df = pd.read_csv('/Users/carlsonoranu/Downloads/Election.csv')

In [None]:

# Columns that are not used by any later step of this notebook.
unused_columns = ['State', 'PU-Code', 'Results_Found', 'Result_Sheet_Unclear', 'Results_File', 'Result_Sheet_Unsigned']

# `df` is rebound in place, so re-running this cell used to raise KeyError
# (the columns were already gone). errors='ignore' makes the cell idempotent:
# columns that are present are dropped, columns already removed are skipped.
df = df.drop(columns=unused_columns, errors='ignore')

print(df.head(5))

In [None]:

# (n, 2) array of [latitude, longitude] rows, in DataFrame row order.
coords = df[['Latitude', 'Longitude']].to_numpy()

In [None]:
# Haversine formula
def haversine(coord1, coord2):
    """Return the great-circle distance between two points, in kilometres.

    Parameters
    ----------
    coord1, coord2 : sequence of two numbers
        ``(latitude, longitude)`` pairs in decimal degrees.

    Returns
    -------
    float
        Haversine distance on a sphere of radius 6371 km.
    """
    lat1, lon1 = coord1
    lat2, lon2 = coord2
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    # Floating-point rounding can leave `a` a hair above 1 for (near-)antipodal
    # points, which would make sqrt(1 - a) raise ValueError. Only the upper
    # bound is clamped, and with a comparison rather than min(), so NaN inputs
    # still propagate as NaN and invalid latitudes still fail loudly.
    if a > 1.0:
        a = 1.0
    c = 2 * atan2(sqrt(a), sqrt(1-a))
    r = 6371  # Radius of Earth in kilometers
    return r * c

n = len(coords)
dist_matrix = np.zeros((n, n))

# Pairwise haversine distances in kilometres (same formula and 6371 km radius
# as haversine()). Each row is computed with one vectorised NumPy expression
# instead of n scalar Python calls, so the Python-level loop is O(n) rather
# than O(n^2) and no temporary n x n arrays are allocated.
coords_rad = np.radians(np.asarray(coords, dtype=float))
lat_rad, lon_rad = coords_rad.T
cos_lat = np.cos(lat_rad)

for i in range(n):
    dlat = lat_rad - lat_rad[i]
    dlon = lon_rad - lon_rad[i]
    a = np.sin(dlat / 2) ** 2 + cos_lat[i] * cos_lat * np.sin(dlon / 2) ** 2
    # Guard against rounding pushing `a` marginally outside [0, 1];
    # np.clip leaves NaN untouched, so missing coordinates stay visible.
    a = np.clip(a, 0.0, 1.0)
    # Row i equals column i (the expression is symmetric in i and j) and the
    # diagonal is exactly 0, matching the matrix the double loop produced.
    dist_matrix[i] = 6371 * 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

In [None]:

# DataFrame view of the pairwise distance matrix (default integer row and
# column labels). Not referenced by any later cell visible in this notebook.
dist_matrix_df = pd.DataFrame(dist_matrix)

In [None]:
# Configure DBSCAN to read distances straight from the precomputed matrix,
# so eps=1 is expressed in the same units as dist_matrix.
db = DBSCAN(metric='precomputed', eps=1, min_samples=2)

# Fit once, then read the per-row cluster labels from the fitted estimator.
db.fit(dist_matrix)
labels = db.labels_

df['Cluster'] = labels

In [None]:
# Per-cluster z-score style outlier scores for one party's vote column
def calculate_stats(df, party):
    """Score each row of ``party`` against the other rows in its cluster.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Cluster' column and the column named by ``party``.
    party : str
        Name of the column to score.

    Returns
    -------
    tuple of pandas.Series
        ``(outlier_scores, means, stds)``, each aligned with ``df``'s index.
        ``means`` and ``stds`` are the cluster mean and standard deviation
        broadcast back to every row; ``outlier_scores`` is
        ``(value - mean) / std`` with NaN replaced by 0.

    NOTE(review): every distinct 'Cluster' value is treated as a group. If the
    labels come from DBSCAN, its noise label would be pooled into one group
    as well - confirm that is intended.
    """
    per_cluster = df.groupby('Cluster')[party]
    means = per_cluster.transform('mean')
    stds = per_cluster.transform('std')

    # NaN arises when std is NaN (single-row cluster) or from 0/0 (all values
    # in the cluster identical); both are reported as "not an outlier" (0).
    outlier_scores = df[party].sub(means).div(stds).fillna(0)
    return outlier_scores, means, stds

In [None]:

# Attach three derived columns per party: outlier score, cluster mean, cluster std.
for party in ['APC', 'LP', 'PDP', 'NNPP']:
    party_scores, party_means, party_stds = calculate_stats(df, party)
    df[f'{party}_Outlier_Score'] = party_scores
    df[f'{party}_Mean'] = party_means
    df[f'{party}_Std'] = party_stds

In [None]:

# One copy of df per party, ordered from highest to lowest outlier score.
sorted_dfs = {
    name: df.sort_values(by=f'{name}_Outlier_Score', ascending=False)
    for name in ['APC', 'LP', 'PDP', 'NNPP']
}

In [None]:
# Create a directory to save the cluster files
# NOTE(review): absolute, user-specific path - adjust before running elsewhere.
clusters_dir = '/Users/carlsonoranu/Downloads/clusters'
# exist_ok=True: no error if the directory already exists, so the cell can be re-run.
os.makedirs(clusters_dir, exist_ok=True)

In [None]:

# Write one workbook with a sheet per party (sheet name = party key).
output_file = os.path.join(clusters_dir, 'sorted_clusters.xlsx')
with pd.ExcelWriter(output_file) as writer:
    for party in sorted_dfs:
        sorted_dfs[party].to_excel(writer, sheet_name=party, index=False)

In [None]:

# Box plot for All Political Parties
# Store the cluster labels as a categorical column.
cluster_categories = df['Cluster'].astype('category')
df['Cluster'] = cluster_categories

# Bucket cluster ids into groups of ten via floor division
# (0-9 -> 0, 10-19 -> 10, ...; a label of -1 lands in -10).
df['Cluster_Group'] = cluster_categories.astype(int).floordiv(10).mul(10)

def plot_votes_by_cluster(party):
    """Show a box plot of one party's votes per cluster group.

    Reads the notebook-level ``df``: ``party`` names the column plotted on
    the y-axis, with 'Cluster_Group' on the x-axis. Displays the figure and
    returns nothing.
    """
    plt.figure(figsize=(14, 7))
    # seaborn draws on the current figure and hands back the Axes it used.
    ax = sns.boxplot(data=df, x='Cluster_Group', y=party, palette="Set3")
    ax.set(
        title=f'Distribution of {party} Votes by Cluster Group',
        xlabel='Cluster Group',
        ylabel=f'{party} Votes',
    )
    plt.xticks(rotation=90)
    plt.show()

# One box plot per party, in the order APC, LP, PDP, NNPP
for party in ['APC', 'LP', 'PDP', 'NNPP']:
    plot_votes_by_cluster(party)


In [None]:
import folium
from folium.plugins import MarkerCluster

# Initialize a map centred on the mean latitude/longitude of all rows
map_center = [df['Latitude'].mean(), df['Longitude'].mean()]
m = folium.Map(location=map_center, zoom_start=6)

# Marker-cluster layer that receives every point
marker_cluster = MarkerCluster().add_to(m)

outlier_score_columns = ['APC_Outlier_Score', 'LP_Outlier_Score', 'PDP_Outlier_Score', 'NNPP_Outlier_Score']

# One circle marker per row: red when any party's |outlier score| exceeds 2,
# blue otherwise
for idx, row in df.iterrows():
    is_outlier = any(abs(row[column]) > 2 for column in outlier_score_columns)
    color = 'red' if is_outlier else 'blue'
    marker = folium.CircleMarker(
        location=[row['Latitude'], row['Longitude']],
        radius=5,
        color=color,
        fill=True,
        fill_color=color,
        fill_opacity=0.7,
        popup=f'Cluster: {row["Cluster"]}<br>APC: {row["APC"]}<br>LP: {row["LP"]}<br>PDP: {row["PDP"]}<br>NNPP: {row["NNPP"]}'
    )
    marker.add_to(marker_cluster)

# Save the map to an HTML file (relative path, i.e. the current working directory)
m.save('clusters_and_outliers_map.html')



In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Mean votes per party within each cluster
party_columns = ['APC', 'LP', 'PDP', 'NNPP']
mean_votes = df.groupby('Cluster')[party_columns].mean().reset_index()

# Long format (one row per cluster/party pair) so seaborn can use Party as hue
melted_votes = pd.melt(mean_votes, id_vars='Cluster', var_name='Party', value_name='Votes')

# Grouped bar chart: clusters on the x-axis, one bar per party
plt.figure(figsize=(14, 7))
ax = sns.barplot(data=melted_votes, x='Cluster', y='Votes', hue='Party', palette="Set3")
ax.set(title='Average Votes by Cluster', xlabel='Cluster', ylabel='Average Votes')
plt.xticks(rotation=90)
plt.show()


In [None]:
# Data Distribution: stack the four outlier-score columns into long format
z_score_columns = ['APC_Outlier_Score', 'LP_Outlier_Score', 'PDP_Outlier_Score', 'NNPP_Outlier_Score']
melted_z_scores = pd.melt(
    df,
    id_vars='Cluster',
    value_vars=z_score_columns,
    var_name='Party',
    value_name='Outlier_Score',
)

# Violin plot of the score distribution, one violin per score column
plt.figure(figsize=(14, 7))
ax = sns.violinplot(data=melted_z_scores, x='Party', y='Outlier_Score', palette="Set3")
ax.set(title='Outlier Score Distribution by Party', xlabel='Party', ylabel='Outlier_Score')
plt.show()
