In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn.impute import SimpleImputer

In [2]:
data=pd.read_excel("career_pred.xlsx")

In [3]:
def convert_to_int(remain_df, col_name):
    new_df = remain_df.copy()  # create a copy of the original dataframe
    new_df[col_name] = pd.to_numeric(new_df[col_name], errors='coerce').astype('Int64')  # convert the specified column to int64
    return new_df

In [4]:
df = convert_to_int(data, 'learning_style')
print(df.dtypes)

sslc                     int64
hsc                      int64
cgpa                     int64
school_type              int64
no_of_miniprojects       int64
no_of_projects           int64
coresub_skill            int64
aptitude_skill           int64
problemsolving_skill     int64
programming_skill        int64
abstractthink_skill      int64
design_skill             int64
first_computer           int64
first_program            int64
lab_programs             int64
ds_coding                int64
technology_used          int64
sympos_attend            int64
sympos_won               int64
extracurricular          int64
learning_style           Int64
college_bench            int64
clg_teachers_know        int64
college_performence      int64
college_skills           int64
ROLE                    object
dtype: object


In [5]:
labeled_data = df[:50]
unlabeled_data = df[50:]


In [6]:
unlabeled_data = unlabeled_data.drop('ROLE', axis=1)
unlabeled_data

Unnamed: 0,sslc,hsc,cgpa,school_type,no_of_miniprojects,no_of_projects,coresub_skill,aptitude_skill,problemsolving_skill,programming_skill,...,ds_coding,technology_used,sympos_attend,sympos_won,extracurricular,learning_style,college_bench,clg_teachers_know,college_performence,college_skills
50,3,4,1,1,3,1,1,1,3,3,...,1,1,4,1,1,2,5,3,3,4
51,4,4,2,1,1,1,2,2,2,2,...,1,1,3,5,3,4,4,3,3,5
52,4,4,4,1,2,1,1,2,3,3,...,3,1,3,3,1,1,3,4,4,3
53,3,4,1,2,1,1,2,2,4,3,...,4,1,1,2,1,2,3,4,3,1
54,1,3,1,2,1,1,2,1,2,2,...,1,1,1,2,4,2,3,4,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7520,3,2,1,2,1,1,2,2,3,3,...,4,1,4,2,2,1,4,4,3,3
7521,2,3,3,3,4,3,2,1,3,3,...,3,4,2,3,2,4,1,3,3,2
7522,3,2,1,2,1,1,1,2,2,3,...,2,1,3,1,1,3,5,3,3,4
7523,2,1,1,1,2,1,1,2,2,2,...,2,1,3,2,1,2,4,4,3,4


In [7]:
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(unlabeled_data)

In [8]:
scaler = StandardScaler()
unlabeled_data = scaler.fit_transform(X_imputed)
unlabeled_data

array([[ 0.10176264,  1.06459192, -1.41601526, ..., -1.05834323,
        -0.66834868,  0.7135949 ],
       [ 1.16713477,  1.06459192, -0.17002184, ..., -1.05834323,
        -0.66834868,  1.42310832],
       [ 1.16713477,  1.06459192,  2.32196499, ...,  0.71068979,
         1.24432604,  0.00408148],
       ...,
       [ 0.10176264, -1.15702299, -1.41601526, ..., -1.05834323,
        -0.66834868,  0.7135949 ],
       [-0.9636095 , -2.26783045, -1.41601526, ...,  0.71068979,
        -0.66834868,  0.7135949 ],
       [-0.9636095 , -1.15702299, -0.17002184, ..., -1.05834323,
        -0.66834868,  1.42310832]])

In [10]:
scores = []
eps_range = np.arange(2.0,2.1)

for eps in eps_range:
    dbscan = DBSCAN(eps=eps, min_samples=5)
    dbscan.fit(unlabeled_data)
    score = silhouette_score(unlabeled_data, dbscan.labels_)
    scores.append(score)

best_eps_index = np.argmax(scores)
best_eps = eps_range[best_eps_index]


In [11]:
dbscan = DBSCAN(eps=best_eps, min_samples=5)
dbscan.fit(unlabeled_data)


In [12]:
labeled_data

Unnamed: 0,sslc,hsc,cgpa,school_type,no_of_miniprojects,no_of_projects,coresub_skill,aptitude_skill,problemsolving_skill,programming_skill,...,technology_used,sympos_attend,sympos_won,extracurricular,learning_style,college_bench,clg_teachers_know,college_performence,college_skills,ROLE
0,2,3,2,3,1,1,1,1,2,4,...,1,3,1,2,2,4,3,4,5,Technical Support
1,3,1,2,2,2,1,1,3,2,4,...,2,3,2,3,2,5,3,3,4,Software Developer
2,3,4,2,2,1,1,1,2,3,3,...,1,4,3,1,4,3,3,3,4,UI/UX Designer
3,3,4,2,1,1,1,1,1,3,2,...,1,3,2,1,2,1,4,4,3,Technical Support
4,2,3,2,4,1,1,1,4,3,3,...,1,2,1,3,1,5,4,3,1,Data Analyst
5,1,1,1,1,1,1,2,2,3,2,...,1,3,2,1,2,5,3,3,5,Technical Writer
6,3,3,2,2,2,1,2,4,3,3,...,1,3,2,1,1,5,4,4,4,Web Developer
7,3,3,2,2,1,1,2,1,2,2,...,1,3,3,3,4,5,4,3,3,Technical Support
8,3,2,3,3,1,1,1,2,3,5,...,1,3,3,3,2,5,4,3,3,Technical Writer
9,4,4,2,2,1,1,2,2,3,3,...,1,1,2,2,3,5,4,4,5,Software Tester


In [13]:
cluster_labels = {}
for label in np.unique(dbscan.labels_):
    if label != -1:
        cluster_data = unlabeled_data[dbscan.labels_ == label]
        closest_labeled_data = labeled_data.drop('ROLE', axis=1).iloc[np.argmin(np.linalg.norm(cluster_data - labeled_data.drop('ROLE', axis=1).values, axis=1))]
        cluster_labels[label] = closest_labeled_data['ROLE']


ValueError: operands could not be broadcast together with shapes (22,25) (50,25) 

In [14]:
labeled_data['cluster'] = -1
for label, role in cluster_labels.items():
    labeled_data.loc[dbscan.labels_ == label, 'cluster'] = role


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_data['cluster'] = -1


In [15]:
labeled_data

Unnamed: 0,sslc,hsc,cgpa,school_type,no_of_miniprojects,no_of_projects,coresub_skill,aptitude_skill,problemsolving_skill,programming_skill,...,sympos_attend,sympos_won,extracurricular,learning_style,college_bench,clg_teachers_know,college_performence,college_skills,ROLE,cluster
0,2,3,2,3,1,1,1,1,2,4,...,3,1,2,2,4,3,4,5,Technical Support,-1
1,3,1,2,2,2,1,1,3,2,4,...,3,2,3,2,5,3,3,4,Software Developer,-1
2,3,4,2,2,1,1,1,2,3,3,...,4,3,1,4,3,3,3,4,UI/UX Designer,-1
3,3,4,2,1,1,1,1,1,3,2,...,3,2,1,2,1,4,4,3,Technical Support,-1
4,2,3,2,4,1,1,1,4,3,3,...,2,1,3,1,5,4,3,1,Data Analyst,-1
5,1,1,1,1,1,1,2,2,3,2,...,3,2,1,2,5,3,3,5,Technical Writer,-1
6,3,3,2,2,2,1,2,4,3,3,...,3,2,1,1,5,4,4,4,Web Developer,-1
7,3,3,2,2,1,1,2,1,2,2,...,3,3,3,4,5,4,3,3,Technical Support,-1
8,3,2,3,3,1,1,1,2,3,5,...,3,3,3,2,5,4,3,3,Technical Writer,-1
9,4,4,2,2,1,1,2,2,3,3,...,1,2,2,3,5,4,4,5,Software Tester,-1
