In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [1]:
posting = pd.read_csv('postings.csv', encoding='utf-8')
posting

NameError: name 'pd' is not defined

In [None]:
duplicates = posting['job_id'].duplicated()
num_duplicates = duplicates.sum()
num_duplicates

np.int64(0)

In [None]:
# Hàm tiền xử lý văn bản
def clean_text(text):
    if pd.isna(text): return ""  # Xử lý giá trị NaN
    text = text.lower()  # Chuyển thành chữ thường
    text = re.sub(r'\W+', ' ', text)  # Loại bỏ ký tự đặc biệt
    text = re.sub(r'\s+', ' ', text).strip()  # Xóa khoảng trắng thừa
    return text

# Áp dụng tiền xử lý lên cột title và description
posting['title'] = posting['title'].apply(clean_text)

In [None]:
# Từ khóa IT để gán nhãn
it_keywords = ['python', 'java', 'c++', 'c#', 'javascript', 'typescript', 'html', 'css', 'react', 'angular',
    'vue', 'nodejs', 'express', 'django', 'flask', 'sql', 'mysql', 'postgresql', 'mongodb',
    'git', 'docker', 'kubernetes', 'aws', 'azure', 'gcp', 'tensorflow', 'pytorch', 'machine learning',
    'deep learning', 'ai', 'nlp', 'computer vision', 'data science', 'big data', 'hadoop', 'spark',
    'linux', 'bash', 'shell scripting', 'api', 'rest', 'graphql', 'cybersecurity', 'penetration testing',
    'blockchain', 'smart contracts', 'devops', 'agile', 'scrum', 'software development', 'backend',
    'frontend', 'fullstack', 'data analysis', 'cloud computing', 'networking', 'system administration']

# Hàm gán nhãn IT (1) hoặc Non-IT (0)
def label_it_job(text):
    return 1 if any(keyword in text for keyword in it_keywords) else 0

# Gán nhãn vào cột mới
posting['is_it'] = posting['title'].apply(label_it_job)

# Kiểm tra số lượng bài đăng IT và Non-IT
print(posting['is_it'].value_counts())

is_it
0    110299
1     13550
Name: count, dtype: int64


In [None]:
# Sử dụng TF-Iposting để trích xuất đặc trưng
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(posting['title'])

# Lấy nhãn
y = posting['is_it']

In [None]:
# Chia dữ liệu thành tập huấn luyện (80%) và kiểm tra (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Khởi tạo mô hình Logistic Regression
model = LogisticRegression()
# model = RandomForestClassifier(n_estimators=100)  # Thử Random Forest nếu muốn

# Huấn luyện mô hình
model.fit(X_train, y_train)

# Dự đoán trên tập kiểm tra
y_pred = model.predict(X_test)

In [None]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9870811465482439
Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99     22003
           1       1.00      0.89      0.94      2767

    accuracy                           0.99     24770
   macro avg       0.99      0.94      0.97     24770
weighted avg       0.99      0.99      0.99     24770



In [None]:
# Dự đoán trên toàn bộ dữ liệu
posting['is_it_predicted'] = model.predict(X)

# Lọc các bài đăng IT
posting_it = posting[posting['is_it_predicted'] == 1]

In [None]:
# Xuất file mới chứa danh sách việc làm IT
posting_it.to_csv("filtered_it_jobs.csv", index=False, encoding='utf-8')
print("Lưu danh sách việc làm IT vào filtered_it_jobs.csv")

Lưu danh sách việc làm IT vào filtered_it_jobs.csv


In [None]:
skill_data = pd.read_csv('filtered_it_jobs.csv', encoding='utf-8')

In [None]:
skill_data['description'] = skill_data['description'].str.split(r'\s*[,;.]\s*')
skill_data = skill_data.explode('description')
skill_data

Unnamed: 0,job_id,company_name,title,description,max_salary,pay_period,location,company_id,views,med_salary,...,posting_domain,sponsored,work_type,currency,compensation_type,normalized_salary,zip_code,fips,is_it,is_it_predicted
0,1829192,,mental health therapist counselor,At Aspen Therapy and Wellness,50.0,HOURLY,"Fort Collins, CO",,1.0,,...,,0,FULL_TIME,USD,BASE_SALARY,83200.0,80521.0,8069.0,1,1
0,1829192,,mental health therapist counselor,we are committed to serving clients with best ...,50.0,HOURLY,"Fort Collins, CO",,1.0,,...,,0,FULL_TIME,USD,BASE_SALARY,83200.0,80521.0,8069.0,1,1
0,1829192,,mental health therapist counselor,improvements and better quality of life,50.0,HOURLY,"Fort Collins, CO",,1.0,,...,,0,FULL_TIME,USD,BASE_SALARY,83200.0,80521.0,8069.0,1,1
0,1829192,,mental health therapist counselor,We believe in providing a secure,50.0,HOURLY,"Fort Collins, CO",,1.0,,...,,0,FULL_TIME,USD,BASE_SALARY,83200.0,80521.0,8069.0,1,1
0,1829192,,mental health therapist counselor,supportive environment to grow as a clinician ...,50.0,HOURLY,"Fort Collins, CO",,1.0,,...,,0,FULL_TIME,USD,BASE_SALARY,83200.0,80521.0,8069.0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12070,3906266217,The Dyrt,senior frontend app developer,is venture-backed,,,United States,6404239.0,1.0,,...,,0,FULL_TIME,,,,,,1,1
12070,3906266217,The Dyrt,senior frontend app developer,and has 27 employees working virtually around ...,,,United States,6404239.0,1.0,,...,,0,FULL_TIME,,,,,,1,1
12070,3906266217,The Dyrt,senior frontend app developer,S,,,United States,6404239.0,1.0,,...,,0,FULL_TIME,,,,,,1,1
12070,3906266217,The Dyrt,senior frontend app developer,Interested candidates should submit a cover le...,,,United States,6404239.0,1.0,,...,,0,FULL_TIME,,,,,,1,1


In [None]:
def contains_it_keywords(desc):
    return any(keyword in desc for keyword in it_keywords)
skill_data = skill_data[skill_data['description'].apply(contains_it_keywords)]
skill_data

Unnamed: 0,job_id,company_name,title,description,max_salary,pay_period,location,company_id,views,med_salary,...,posting_domain,sponsored,work_type,currency,compensation_type,normalized_salary,zip_code,fips,is_it,is_it_predicted
0,1829192,,mental health therapist counselor,Our supervisors are trained in EMDR and utiliz...,50.0,HOURLY,"Fort Collins, CO",,1.0,,...,,0,FULL_TIME,USD,BASE_SALARY,83200.0,80521.0,8069.0,1,1
0,1829192,,mental health therapist counselor,We are actively looking to hire a therapist in...,50.0,HOURLY,"Fort Collins, CO",,1.0,,...,,0,FULL_TIME,USD,BASE_SALARY,83200.0,80521.0,8069.0,1,1
0,1829192,,mental health therapist counselor,Position Requirement Possibilities:A graduate ...,50.0,HOURLY,"Fort Collins, CO",,1.0,,...,,0,FULL_TIME,USD,BASE_SALARY,83200.0,80521.0,8069.0,1,1
0,1829192,,mental health therapist counselor,Building and maintaining an active caseload wi...,50.0,HOURLY,"Fort Collins, CO",,1.0,,...,,0,FULL_TIME,USD,BASE_SALARY,83200.0,80521.0,8069.0,1,1
0,1829192,,mental health therapist counselor,Maintaining all clinical documentation in acco...,50.0,HOURLY,"Fort Collins, CO",,1.0,,...,,0,FULL_TIME,USD,BASE_SALARY,83200.0,80521.0,8069.0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12070,3906266217,The Dyrt,senior frontend app developer,and automationHave strong writing and communic...,,,United States,6404239.0,1.0,,...,,0,FULL_TIME,,,,,,1,1
12070,3906266217,The Dyrt,senior frontend app developer,Sentry or Crashlytics: Experience debugging us...,,,United States,6404239.0,1.0,,...,,0,FULL_TIME,,,,,,1,1
12070,3906266217,The Dyrt,senior frontend app developer,and ORMCan do performance tuning of app and da...,,,United States,6404239.0,1.0,,...,,0,FULL_TIME,,,,,,1,1
12070,3906266217,The Dyrt,senior frontend app developer,and celebrates all people interested in camping,,,United States,6404239.0,1.0,,...,,0,FULL_TIME,,,,,,1,1


In [None]:
# Hàm làm sạch: Xóa các ký tự đặc biệt, chỉ giữ lại chữ cái, số và khoảng trắng
def clean_description(text):
    if isinstance(text, str):
        return re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Xóa mọi thứ trừ chữ cái, số và khoảng trắng
    return text  # Nếu không phải chuỗi, giữ nguyên

# Áp dụng hàm làm sạch vào cột 'description'
skill_data['description'] = skill_data['description'].apply(clean_description)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  skill_data['description'] = skill_data['description'].apply(clean_description)


In [None]:
choosen_column = [
    'job_id',
    'company_name',
    'title',
    'description',
    'max_salary',
]
skill_data = skill_data[choosen_column]
skill_data

Unnamed: 0,job_id,company_name,title,description,max_salary
0,1829192,,mental health therapist counselor,Our supervisors are trained in EMDR and utiliz...,50.0
0,1829192,,mental health therapist counselor,We are actively looking to hire a therapist in...,50.0
0,1829192,,mental health therapist counselor,Position Requirement PossibilitiesA graduate l...,50.0
0,1829192,,mental health therapist counselor,Building and maintaining an active caseload wi...,50.0
0,1829192,,mental health therapist counselor,Maintaining all clinical documentation in acco...,50.0
...,...,...,...,...,...
12070,3906266217,The Dyrt,senior frontend app developer,and automationHave strong writing and communic...,
12070,3906266217,The Dyrt,senior frontend app developer,Sentry or Crashlytics Experience debugging usi...,
12070,3906266217,The Dyrt,senior frontend app developer,and ORMCan do performance tuning of app and da...,
12070,3906266217,The Dyrt,senior frontend app developer,and celebrates all people interested in camping,


In [None]:
skill_data.to_csv('filtered_it_skills.csv', index=False)