In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.cluster import KMeans
import numpy as np

# Data preparation: one-hot encode the three categorical columns used for clustering.
encoder = OneHotEncoder()
encoded_features = encoder.fit_transform(data[['Category', 'Workplace', 'Type']])

# Elbow method: fit K-Means once per candidate k and record the fitted model's
# inertia, then look for the "elbow" in the resulting curve.
# K-Means rejects n_clusters > n_samples, so the upper bound of 10 is capped at
# the number of encoded rows.
inertia = []
k_values = range(1, min(10, encoded_features.shape[0]) + 1)

for k in k_values:
    # n_init is pinned explicitly: scikit-learn's default changed between
    # releases (10 -> 'auto'), so leaving it implicit makes the curve depend on
    # the installed version even with a fixed random_state.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(encoded_features)
    inertia.append(kmeans.inertia_)

# Plot inertia against k; one tick per candidate k keeps the x-axis readable.
plt.figure(figsize=(10, 6))
plt.plot(k_values, inertia, '-o')
plt.title('Elbow Method to Determine Optimal k')
plt.xlabel('Number of Clusters, k')
plt.ylabel('Inertia')
plt.xticks(k_values)
plt.grid(True)
plt.show()

In [ ]:
# Final K-Means fit with k=4.
# n_init is pinned explicitly: scikit-learn's default changed between releases
# (10 -> 'auto'), so leaving it implicit makes the labels version-dependent.
kmeans_final = KMeans(n_clusters=4, n_init=10, random_state=42)
kmeans_final.fit(encoded_features)

# Attach the cluster label of each row to the original dataset for analysis.
data['Cluster'] = kmeans_final.labels_


def _most_common(values):
    """Return the most frequent non-null value, or None if there is none."""
    modes = values.mode()
    # mode() is empty when every value in the group is missing.
    return modes.iloc[0] if not modes.empty else None


# Per-cluster profile: dominant value of each categorical feature plus the
# number of listings in the cluster.
listings_by_cluster = data.groupby('Cluster')
cluster_summary = listings_by_cluster[['Category', 'Workplace', 'Type']].agg(_most_common)
# size() counts rows; counting a specific column would silently skip rows
# where that column is missing.
cluster_summary['Count'] = listings_by_cluster.size()

cluster_summary

In [ ]:
# One grouped count plot per categorical feature, with bars split by cluster.
# Each entry is (column, palette, figure title, x-axis label):
#   - distribution of job categories within each cluster
#   - workplace preferences within each cluster
#   - employment types within each cluster
cluster_plot_specs = [
    ('Category', 'Set1', 'Distribution of Job Categories Within Clusters', 'Job Category'),
    ('Workplace', 'Set2', 'Workplace Preferences Within Clusters', 'Workplace Preference'),
    ('Type', 'Set3', 'Employment Types Within Clusters', 'Employment Type'),
]

for feature, palette_name, plot_title, axis_label in cluster_plot_specs:
    plt.figure(figsize=(14, 8))
    sns.countplot(data=data, x=feature, hue='Cluster', palette=palette_name)
    plt.title(plot_title)
    plt.xlabel(axis_label)
    plt.ylabel('Number of Listings')
    plt.legend(title='Cluster')
    plt.xticks(rotation=45)  # tilt the category labels
    plt.show()

In [ ]:
from scipy.stats import chi2_contingency
import numpy as np

def cramers_v(x, y):
    """Return the bias-corrected Cramér's V association between two categoricals.

    The statistic is computed from the contingency table of ``x`` and ``y``:
    the chi-square statistic is divided by the sample size, then both that
    value and the table dimensions are shrunk by the small-sample correction
    terms before taking the square root.

    Args:
        x: Categorical values (anything ``pd.crosstab`` accepts as rows).
        y: Categorical values of the same length (used as columns).

    Returns:
        A float in ``[0, 1]``. ``0.0`` is returned when the statistic is
        undefined because the corrected denominator is not positive, e.g. one
        variable has a single level or as many levels as observations.

    Raises:
        Errors raised by ``pd.crosstab`` or ``chi2_contingency`` (for example
        on empty input) propagate unchanged.
    """
    contingency = pd.crosstab(x, y)
    # correction=False: Yates' continuity correction (scipy's default for 2x2
    # tables) shrinks chi-square and would bias V low, so that a binary
    # variable compared with itself would not score 1.
    chi2 = chi2_contingency(contingency, correction=False)[0]
    n = contingency.sum().sum()
    r, k = contingency.shape
    if n < 2:
        # The correction terms divide by (n - 1).
        return 0.0

    phi2 = chi2 / n
    phi2_corr = max(0.0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    r_corr = r - ((r - 1) ** 2) / (n - 1)
    k_corr = k - ((k - 1) ** 2) / (n - 1)

    denominator = min(k_corr - 1, r_corr - 1)
    if denominator <= 0:
        # Degenerate table: avoid a 0/0 division that would yield NaN.
        return 0.0
    return float(np.sqrt(phi2_corr / denominator))

# Pairwise Cramér's V between the categorical variables, laid out as a square
# table with one row and one column per variable.
variables = ['Category', 'Workplace', 'Type']
cramers_v_matrix = pd.DataFrame(
    [
        [cramers_v(data[row_var], data[col_var]) for col_var in variables]
        for row_var in variables
    ],
    index=variables,
    columns=variables,
)

# NOTE(review): a bare expression is only echoed by an interactive session when
# it is the last statement of a cell; kept here as in the original.
cramers_v_matrix
# Heatmap of the matrix, with the colour scale fixed to the [0, 1] range.
plt.figure(figsize=(10, 8))
sns.heatmap(cramers_v_matrix, annot=True, cmap='coolwarm', vmin=0, vmax=1)
plt.title("Cramér's V Correlation Matrix")
plt.show()