In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [2]:
posting = pd.read_csv('postings.csv', encoding='utf-8')
posting

Unnamed: 0,job_id,company_name,title,description,max_salary,pay_period,location,company_id,views,med_salary,...,skills_desc,listed_time,posting_domain,sponsored,work_type,currency,compensation_type,normalized_salary,zip_code,fips
0,921716,Corcoran Sawyer Smith,Marketing Coordinator,Job descriptionA leading real estate firm in N...,20.0,HOURLY,"Princeton, NJ",2774458.0,20.0,,...,Requirements: \n\nWe are seeking a College or ...,1.713398e+12,,0,FULL_TIME,USD,BASE_SALARY,38480.0,8540.0,34021.0
1,1829192,,Mental Health Therapist/Counselor,"At Aspen Therapy and Wellness , we are committ...",50.0,HOURLY,"Fort Collins, CO",,1.0,,...,,1.712858e+12,,0,FULL_TIME,USD,BASE_SALARY,83200.0,80521.0,8069.0
2,10998357,The National Exemplar,Assitant Restaurant Manager,The National Exemplar is accepting application...,65000.0,YEARLY,"Cincinnati, OH",64896719.0,8.0,,...,We are currently accepting resumes for FOH - A...,1.713278e+12,,0,FULL_TIME,USD,BASE_SALARY,55000.0,45202.0,39061.0
3,23221523,"Abrams Fensterman, LLP",Senior Elder Law / Trusts and Estates Associat...,Senior Associate Attorney - Elder Law / Trusts...,175000.0,YEARLY,"New Hyde Park, NY",766262.0,16.0,,...,This position requires a baseline understandin...,1.712896e+12,,0,FULL_TIME,USD,BASE_SALARY,157500.0,11040.0,36059.0
4,35982263,,Service Technician,Looking for HVAC service tech with experience ...,80000.0,YEARLY,"Burlington, IA",,3.0,,...,,1.713452e+12,,0,FULL_TIME,USD,BASE_SALARY,70000.0,52601.0,19057.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123844,3906267117,Lozano Smith,Title IX/Investigations Attorney,Our Walnut Creek office is currently seeking a...,195000.0,YEARLY,"Walnut Creek, CA",56120.0,1.0,,...,,1.713571e+12,,0,FULL_TIME,USD,BASE_SALARY,157500.0,94595.0,6013.0
123845,3906267126,Pinterest,"Staff Software Engineer, ML Serving Platform",About Pinterest:\n\nMillions of people across ...,,,United States,1124131.0,3.0,,...,,1.713572e+12,www.pinterestcareers.com,0,FULL_TIME,,,,,
123846,3906267131,EPS Learning,"Account Executive, Oregon/Washington",Company Overview\n\nEPS Learning is a leading ...,,,"Spokane, WA",90552133.0,3.0,,...,,1.713572e+12,epsoperations.bamboohr.com,0,FULL_TIME,,,,99201.0,53063.0
123847,3906267195,Trelleborg Applied Technologies,Business Development Manager,The Business Development Manager is a 'hunter'...,,,"Texas, United States",2793699.0,4.0,,...,,1.713573e+12,,0,FULL_TIME,,,,,


In [3]:
posting['description'] = posting['description'].str.split(r'\s*,\s*')
posting = posting.explode('description')

In [4]:
duplicates = posting['job_id'].duplicated()
num_duplicates = duplicates.sum()
num_duplicates

np.int64(4763082)

In [5]:
# Hàm tiền xử lý văn bản
def clean_text(text):
    if pd.isna(text): return ""  # Xử lý giá trị NaN
    text = text.lower()  # Chuyển thành chữ thường
    text = re.sub(r'\W+', ' ', text)  # Loại bỏ ký tự đặc biệt
    text = re.sub(r'\s+', ' ', text).strip()  # Xóa khoảng trắng thừa
    return text

# Áp dụng tiền xử lý lên cột title và description
posting['title'] = posting['title'].apply(clean_text)
posting['description'] = posting['description'].apply(clean_text)

In [None]:
# Từ khóa IT để gán nhãn
it_keywords = ['software', 'developer', 'data',
               'python', 'java', 'backend', 'frontend', 'fullstack', 
               'cybersecurity', 'cloud', 'devops', 'computer', 
               'system administrator', 'programmer', 'web', 'network']

# Hàm gán nhãn IT (1) hoặc Non-IT (0)
def label_it_job(text):
    return 1 if any(keyword in text for keyword in it_keywords) else 0

# Gán nhãn vào cột mới
posting['is_it'] = posting['title'].apply(label_it_job)

# Kiểm tra số lượng bài đăng IT và Non-IT
print(posting['title'].value_counts())

is_it
0    4626296
1     260635
Name: count, dtype: int64


In [7]:
# Sử dụng TF-Iposting để trích xuất đặc trưng
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(posting['description'])

# Lấy nhãn
y = posting['is_it']

In [8]:
# Chia dữ liệu thành tập huấn luyện (80%) và kiểm tra (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Khởi tạo mô hình Logistic Regression
model = LogisticRegression()
# model = RandomForestClassifier(n_estimators=100)  # Thử Random Forest nếu muốn

# Huấn luyện mô hình
model.fit(X_train, y_train)

# Dự đoán trên tập kiểm tra
y_pred = model.predict(X_test)

In [9]:
duplicates = posting['description'].duplicated()
num_duplicates = duplicates.sum()
num_duplicates

np.int64(3279898)

In [10]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9963842367455266
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    925252
           1       1.00      0.93      0.96     52135

    accuracy                           1.00    977387
   macro avg       1.00      0.97      0.98    977387
weighted avg       1.00      1.00      1.00    977387



In [11]:
# Dự đoán trên toàn bộ dữ liệu
posting['is_it_predicted'] = model.predict(X)

# Lọc các bài đăng IT
posting_it = posting[posting['is_it_predicted'] == 1]

# Xuất file mới chứa danh sách việc làm IT
posting_it.to_csv("filtered_it_jobs.csv", index=False, encoding='utf-8')
print("Lưu danh sách việc làm IT vào filtered_it_jobs.csv")

Lưu danh sách việc làm IT vào filtered_it_jobs.csv
