<a href="https://colab.research.google.com/github/lukeeees/NLP-Project/blob/main/skills.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

#for text pre-processing
import re, string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# Download the NLTK resources one by one: the punkt tokenizer models, the
# averaged-perceptron POS tagger and the WordNet corpus.
for nltk_resource in ('punkt', 'averaged_perceptron_tagger', 'wordnet'):
    nltk.download(nltk_resource)

#for model-building
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score

# bag of words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

#for word embedding
import gensim
from gensim.models import Word2Vec #Word2Vec is mostly used for huge datasets

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Load the skills table from the CSV file in the working directory.
df_train = pd.read_csv(filepath_or_buffer='skills.csv')

In [4]:
# Preview the first rows of the loaded table to check its columns and values.
df_train.head()

Unnamed: 0,ID,Name
0,nextjs,NextJs
1,writer,Writer
2,global_market_analysis,Global Market Analysis
3,crm,CRM
4,typescript,TypeScript


In [5]:
# (rows, columns) of the loaded table.
df_train.shape

(27721, 2)

In [8]:
# Summary statistics per column; the recorded output reports count, unique,
# top and freq for the ID and Name columns.
df_train.describe()

Unnamed: 0,ID,Name
count,27721,27721
unique,27721,27721
top,nextjs,NextJs
freq,1,1


In [9]:

# Split every skill name into a list of word tokens with NLTK's tokenizer.
# The raw 'Name' text is used as-is (no cleaning step precedes this cell).
df_train['clean_text_tok'] = df_train['Name'].apply(nltk.word_tokenize)

In [10]:
# Inspect the first ten rows to eyeball the new token-list column.
df_train.head(10)

Unnamed: 0,ID,Name,clean_text_tok
0,nextjs,NextJs,[NextJs]
1,writer,Writer,[Writer]
2,global_market_analysis,Global Market Analysis,"[Global, Market, Analysis]"
3,crm,CRM,[CRM]
4,typescript,TypeScript,[TypeScript]
5,business_operations,Business Operations,"[Business, Operations]"
6,budget_management,Budget Management,"[Budget, Management]"
7,front_end_developer,Front End Developer,"[Front, End, Developer]"
8,back_end_developer,Back End Developer,"[Back, End, Developer]"
9,ui_ux_designer,UI/UX Designer,"[UI/UX, Designer]"


In [31]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

# Cluster the skill names: token lists -> strings -> TF-IDF features ->
# K-Means labels -> readable cluster names.

# Re-join each token list into a single space-separated string, which is the
# input format TfidfVectorizer works on.
df_train['clean_text_str'] = df_train['clean_text_tok'].map(' '.join)

# TF-IDF vectorization of the skill strings (vectorizer defaults).
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df_train['clean_text_str'])

# K-Means clustering.
num_clusters = 10  # You can adjust this to your desired number of clusters
random_seed = 42   # Fixed seed: without it, cluster ids and sizes change on every run
# n_init is passed explicitly because its default differs between scikit-learn
# releases; pinning it keeps results comparable across environments.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=random_seed)
df_train['cluster'] = kmeans.fit_predict(tfidf_matrix)

# Map each integer cluster label to a readable name, e.g. 3 -> 'Cluster 3'.
cluster_mapping = {i: f'Cluster {i}' for i in range(num_clusters)}

# Add a new column 'cluster_name' to the original DataFrame
df_train['cluster_name'] = df_train['cluster'].map(cluster_mapping)

# Display the DataFrame with the cluster information added.
df_train



Unnamed: 0,ID,Name,clean_text_tok,word_vectors,clean_text_str,cluster,cluster_name
0,nextjs,NextJs,[NextJs],"[[0.00013693214, -0.009278379, 0.006605464, -0...",NextJs,0,Cluster 0
1,writer,Writer,[Writer],"[[-0.07784393, 0.138686, 0.112783454, 0.044111...",Writer,0,Cluster 0
2,global_market_analysis,Global Market Analysis,"[Global, Market, Analysis]","[[-0.12882166, 0.23406425, 0.16847256, 0.07086...",Global Market Analysis,5,Cluster 5
3,crm,CRM,[CRM],"[[-0.006000885, -0.0017096949, 0.009852063, 0....",CRM,0,Cluster 0
4,typescript,TypeScript,[TypeScript],"[[0.0068701827, -0.00517951, 0.006524663, 0.00...",TypeScript,0,Cluster 0
...,...,...,...,...,...,...,...
27716,ckful8q9io6s71cutgng,Heavy Startup Experiences,"[Heavy, Startup, Experiences]","[[-0.010662459, 0.028743176, 0.010143599, 0.01...",Heavy Startup Experiences,0,Cluster 0
27717,ckful8q9io6s71cutgp0,Net/C,[Net/C],"[[0.006260116, -0.0044959653, -0.009769324, -0...",Net/C,0,Cluster 0
27718,ckful8q9io6s71cutgq0,Dev Team Building Up,"[Dev, Team, Building, Up]","[[-0.020172419, 0.04206197, 0.049786653, 0.010...",Dev Team Building Up,0,Cluster 0
27719,ckful8q9io6s71cutgqg,High Performance System Development,"[High, Performance, System, Development]","[[-0.10343271, 0.20939901, 0.15569292, 0.06437...",High Performance System Development,1,Cluster 1




In [37]:
# For each cluster, concatenate all of its skill strings into one
# newline-separated text blob (one row per cluster_name).
skills_per_cluster = df_train.groupby('cluster_name')['clean_text_str']
skills_per_cluster.apply('\n'.join)

cluster_name
Cluster 0    NextJs\nWriter\nCRM\nTypeScript\nFront End Dev...
Cluster 1    Professional Development Coach\nStrategy\nBran...
Cluster 2    Business Operations\nBusiness Intelligence\nTe...
Cluster 3    Product And Digital Design\nFraud Prevention A...
Cluster 4    Financial Reporting\nSocial Media Reporting\nR...
Cluster 5    Global Market Analysis\nMarket Research\nGrowt...
Cluster 6    Customer Research\nCustomer Success\nWork Ethi...
Cluster 7    Budget Management\nBrand Development & Managem...
Cluster 8    Marketing Effectiveness\nSocial Media Marketin...
Cluster 9    Gtm Planning\nFinancial Analysis Or Planning\n...
Name: clean_text_str, dtype: object

In [41]:
# Number of rows assigned to each cluster; useful for spotting a dominant
# cluster (the recorded output shows 'Cluster 0' holding most rows).
df_train.groupby('cluster_name').size()

cluster_name
Cluster 0    19031
Cluster 1     1195
Cluster 2      875
Cluster 3     2593
Cluster 4      186
Cluster 5      424
Cluster 6      513
Cluster 7     1506
Cluster 8      884
Cluster 9      514
dtype: int64

In [42]:
# Persist the table with its cluster columns to a new CSV, leaving out the
# row index. Every column currently on df_train is written.
# NOTE(review): the recorded clustering output lists a 'word_vectors' column
# that no cell visible in this notebook creates, so a fresh Restart & Run All
# would presumably export a file without it -- confirm which is intended.
df_train.to_csv(path_or_buf='skills2.csv', index=False)