In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import mysql.connector



In [2]:
# Load the labelled skills used to fit the classifier. Later cells read the
# 'skill_name' (text) and 'skill_group' (label) columns from this frame.
df_train = pd.read_csv(
    'C:/Python/Marketing-Job-Data-Scraping/Skill Classification/training_data.csv'
)

# Last expression of the cell: render the frame for a quick visual check
df_train

Unnamed: 0,skill_name,skill_group
0,Sales Pitching,Sales
1,Negotiation,Sales
2,Relationship Building,Sales
3,Closing Deals,Sales
4,Customer Relationship Management (CRM),Sales
...,...,...
1798,Audio Conferencing,Graphic Design/Visual
1799,Outdoor Advertising,Sales
1800,Negotiations,Sales
1801,Mobile App Development,Technology


In [3]:
# Fit the skill-group classifier: TF-IDF features on the skill names feed a
# linear-kernel Support Vector Machine.

# Labels - the group assigned to each training skill
y_train = df_train['skill_group']

# Features - learn the vocabulary and IDF weights from the training skill
# names; the fitted vectorizer is reused by later cells via .transform()
tfidf_vectorizer = TfidfVectorizer()
X_train = tfidf_vectorizer.fit_transform(df_train['skill_name'])

# Model - linear SVM trained on the sparse TF-IDF matrix
classifier = SVC(kernel='linear')
classifier.fit(X_train, y_train)

In [4]:
# Smoke test: run a handful of hand-picked skills through the fitted
# vectorizer and classifier and print the predicted group for each one.
new_skills = [
    'Networking',
    'Financial Analysis',
    'Mobile App Development',
    'Marketing Strategy',
    'Data Visualization',
    'Creative Writing',
    'Project Planning',
    'Digital Marketing',
    'Data Science',
]

# Use transform (not fit_transform) so the training vocabulary is reused
X_new_skills = tfidf_vectorizer.transform(new_skills)
predicted_categories = classifier.predict(X_new_skills)

# Predictions come back in the same order as the input skills
for position, skill in enumerate(new_skills):
    category = predicted_categories[position]
    print(f"Skill: {skill} --> Category: {category}")

Skill: Networking --> Category: Communication
Skill: Financial Analysis --> Category: Quantitative/Analytics
Skill: Mobile App Development --> Category: Technology
Skill: Marketing Strategy --> Category: Strategy/Planning/Management
Skill: Data Visualization --> Category: Quantitative/Analytics
Skill: Creative Writing --> Category: Content/Writing
Skill: Project Planning --> Category: Strategy/Planning/Management
Skill: Digital Marketing --> Category: Graphic Design/Visual
Skill: Data Science --> Category: Quantitative/Analytics


In [5]:
# Classify the scraped LinkedIn skills with the model fitted earlier.
df = pd.read_csv(
    'C:/Python/Marketing-Job-Data-Scraping/Skill Classification/marketing_skill_linkedin.csv'
)

# Vectorize the skill names with the already-fitted TF-IDF vocabulary
new_skills = df['skill_name']
X1 = tfidf_vectorizer.transform(new_skills)

# One predicted skill group per row of df, in row order
predicted_categories = classifier.predict(X1)

# NOTE(review): no accuracy is computed here. An earlier draft referenced a
# label vector `Y1`, which is not defined anywhere in the visible notebook;
# accuracy_score would need ground-truth groups for these skills.

In [6]:
# Attach the predicted group to every skill row (adds or replaces the column)
df['skill_group'] = predicted_categories

# Persist the result. Note that this writes back to the same CSV the frame
# was loaded from, so the input file is overwritten with the labelled version.
df.to_csv(
    path_or_buf='C:/Python/Marketing-Job-Data-Scraping/Skill Classification/marketing_skill_linkedin.csv',
    index=False,
)

In [7]:
# Display the frame, now including the predicted 'skill_group' column
df

Unnamed: 0,skill_id,skill_name,appearance_in_description,appearance_in_skill,skill_group
0,1,Ad Serving,1,8,Social Media/Digital
1,2,Campaigns,262,71,Strategy/Planning/Management
2,3,Digital Marketing,83,42,Graphic Design/Visual
3,4,Digital Media,12,3,Graphic Design/Visual
4,5,Marketing,597,192,Strategy/Planning/Management
...,...,...,...,...,...
750,2477,Americans with Disabilities Act,5,1,Other
751,2484,Sales Contracts,0,1,Sales
752,2485,Sales Negotiation,0,1,Sales
753,2489,Management,486,82,Strategy/Planning/Management


In [8]:
# Write the predicted skill groups back to the `skills` table of the
# `job_scraping` MySQL database, matching rows on skill_id.
#
# NOTE(review): the credentials below are hardcoded (root with an empty
# password). Consider reading them from environment variables before sharing
# this notebook.
mydb = mysql.connector.connect(
    host="localhost",
    user="root",
    password="",
    database="job_scraping"
)

try:
    cursor = mydb.cursor()
    try:
        update_query = "UPDATE skills SET skill_group = %s WHERE skill_id = %s"

        # Build all (skill_group, skill_id) parameter pairs up front, coerced
        # to built-in str/int so the driver never receives NumPy scalars.
        update_values = [
            (str(skill_group), int(skill_id))
            for skill_group, skill_id in zip(df["skill_group"], df["skill_id"])
        ]

        # Skip the round trip entirely when there is nothing to update
        if update_values:
            cursor.executemany(update_query, update_values)

        # Commit only after every update succeeded; if anything above raises,
        # the connection is closed without committing.
        mydb.commit()
    finally:
        cursor.close()
finally:
    # Always release the connection, even when an update or the commit fails
    mydb.close()