In [91]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [92]:
data = pd.read_csv('resume_data.csv', encoding='utf-8')
data

Unnamed: 0,address,career_objective,skills,educational_institution_name,degree_names,passing_years,educational_results,result_types,major_field_of_studies,professional_company_names,...,online_links,issue_dates,expiry_dates,job_position_name,educationaL_requirements,experiencere_requirement,age_requirement,responsibilities.1,skills_required,matched_score
0,,Big data analytics working and database wareho...,"['Big Data', 'Hadoop', 'Hive', 'Python', 'Mapr...",['The Amity School of Engineering & Technology...,['B.Tech'],['2019'],['N/A'],[None],['Electronics'],['Coca-COla'],...,,,,Senior Software Engineer,B.Sc in Computer Science & Engineering from a ...,At least 1 year,,Technical Support\nTroubleshooting\nCollaborat...,,0.850000
1,,Fresher looking to join as a data analyst and ...,"['Data Analysis', 'Data Analytics', 'Business ...","['Delhi University - Hansraj College', 'Delhi ...","['B.Sc (Maths)', 'M.Sc (Science) (Statistics)']","['2015', '2018']","['N/A', 'N/A']","['N/A', 'N/A']","['Mathematics', 'Statistics']",['BIB Consultancy'],...,,,,Machine Learning (ML) Engineer,M.Sc in Computer Science & Engineering or in a...,At least 5 year(s),,Machine Learning Leadership\nCross-Functional ...,,0.750000
2,,,"['Software Development', 'Machine Learning', '...","['Birla Institute of Technology (BIT), Ranchi']",['B.Tech'],['2018'],['N/A'],['N/A'],['Electronics/Telecommunication'],['Axis Bank Limited'],...,,,,"Executive/ Senior Executive- Trade Marketing, ...",Master of Business Administration (MBA),At least 3 years,,"Trade Marketing Executive\nBrand Visibility, S...",Brand Promotion\nCampaign Management\nField Su...,0.416667
3,,To obtain a position in a fast-paced business ...,"['accounts payables', 'accounts receivables', ...","['Martinez Adult Education, Business Training ...",['Computer Applications Specialist Certificate...,['2008'],[None],[None],['Computer Applications'],"['Company Name ï¼ City , State', 'Company Name...",...,,,,Business Development Executive,Bachelor/Honors,1 to 3 years,Age 22 to 30 years,Apparel Sourcing\nQuality Garment Sourcing\nRe...,Fast typing skill\nIELTSInternet browsing & on...,0.760000
4,,Professional accountant with an outstanding wo...,"['Analytical reasoning', 'Compliance testing k...",['Kent State University'],['Bachelor of Business Administration'],[None],['3.84'],[None],['Accounting'],"['Company Name', 'Company Name', 'Company Name...",...,[None],[None],"['February 15, 2021']",Senior iOS Engineer,Bachelor of Science (BSc) in Computer Science,At least 4 years,,iOS Lifecycle\nRequirement Analysis\nNative Fr...,iOS\niOS App Developer\niOS Application Develo...,0.650000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9539,,,"['Mathematical modelling', 'Machine Learning',...",['Sanghvi College of Engineering'],['B.Tech'],['2019'],['N/A'],['N/A'],['N/A'],['BPM Foundation'],...,,,,Data Engineer,Bachelor of Science (BSc),5 to 8 years,,Data Platform Design\nData Pipeline Developmen...,Azure\nBig Data\nData Analytics\nETL Tools\nPo...,0.683333
9540,,Expertise EDA modeler. I like to learn what my...,"['Data Analysis', 'Business Analysis', 'Machin...","['KVoCT, Pune', 'KVoCT, Pune']","['B.CA', 'M.CA']","['2018', '2020']","[None, None]","[None, None]","[None, None]",['Passionate Solution'],...,,,,Executive/ Sr. Executive -IT,Bachelor of Science (BSc) in Computer Science ...,3 to 5 years,Age at most 40 years,Hardware & Software Installation\nSystem Monit...,,0.650000
9541,,Looking for roles related to application devel...,"['Business Analyst', 'Data Analytics', 'Data C...",['PGG College Mysore'],['B.BA'],['2019'],['N/A'],['N/A'],['N/A'],['ZigSAW'],...,,,,Executive - VAT,BBA in Accounting and Finance,1 to 3 years,,Mushak Forms Maintenance\nVAT Software & MS Of...,VAT and Tax,0.650000
9542,,,"['Machine Learning', 'Natural Language Process...","['Rajiv Gandhi Memorial University, Delhi']",['B.TECH'],['2020'],['N/A'],['N/A'],['Electrical'],['Zynta Labs'],...,[None],[None],[None],Asst. Manager/ Manger (Administrative),Bachelor/Honors,At least 5 years,Age at least 28 years,Administrative Support\nScheduling\nFiling & D...,•Administration\n•Health Safety and Environmen...,0.650000


In [93]:
# Hàm tiền xử lý văn bản
def clean_words(text):
    if pd.isna(text): return ""  # Xử lý giá trị NaN
    text = text.lower()
    text = re.sub(r'[\[\]]', '', str(text))
    # Loại bỏ dấu ' nếu có
    text = re.sub(r"'", '', text)  # Chuyển thành chữ thường
    return text

# Áp dụng tiền xử lý lên cột title và description
data['skills'] = data['skills'].apply(clean_words)
data['positions'] = data['positions'].apply(clean_words)
data['job_position_name'] = data['job_position_name'].apply(clean_words)
data['major_field_of_studies'] = data['major_field_of_studies'].apply(clean_words)
data['career_objective'] = data['career_objective'].apply(clean_words)
data['responsibilities'] = data['responsibilities'].apply(clean_words)
data['related_skils_in_job'] = data['related_skils_in_job'].apply(clean_words)

In [94]:
def extract_years(experience_str):
    """Hàm trích xuất số năm từ chuỗi kinh nghiệm"""
    if pd.isna(experience_str) or not isinstance(experience_str, str):
        return None
    
    # Tìm tất cả các số trong chuỗi
    numbers = re.findall(r'\d+\.?\d*', experience_str)
    numbers = [float(num) for num in numbers]
    
    if not numbers:
        return None
    
    # Nếu có dạng "X to Y years" -> lấy trung bình
    if 'to' in experience_str.lower() and len(numbers) >= 2:
        return (numbers[0] + numbers[1]) / 2
    # Nếu có dạng "At least X years" -> lấy X
    elif 'at least' in experience_str.lower():
        return numbers[0]
    # Mặc định lấy số đầu tiên
    else:
        return numbers[0]

def determine_level(experience_str):
    """Hàm phân loại cấp độ dựa trên chuỗi kinh nghiệm"""
    years = extract_years(experience_str)
    
    if years is None:
        return "Unknown"
    
    if years < 1: 
        return "Fresher/Intern"
    elif 1 <= years < 3: 
        return "Junior"
    elif 3 <= years < 5: 
        return "Middle"
    elif 5 <= years < 8: 
        return "Senior"
    else: 
        return "Lead/Architect"

# Áp dụng cho cột kinh nghiệm trong DataFrame
data['experiencere_requirement'] = data['experiencere_requirement'].apply(determine_level)

In [None]:
data.rename(columns={
    'experiencere_requirement': 'experience_requirement'
    }, inplace=True)
data['job_position_name'] = data['job_position_name'].str.replace(r'[^\w\s]', ',', regex=True)

In [96]:
# Hiển thị phân bổ cấp độ
print(data['experience_requirement'].value_counts())

experience_requirement
Senior            2727
Middle            2725
Junior            2387
Unknown           1364
Lead/Architect     341
Name: count, dtype: int64


In [97]:
# Từ khóa IT để gán nhãn
it_keywords = ['python', 'java', 'c++', 'c#', 'javascript', 'typescript', 'html', 'css', 'react', 'angular',
    'vue', 'nodejs', 'express', 'django', 'flask', 'sql', 'mysql', 'postgresql', 'mongodb',
    'git', 'docker', 'kubernetes', 'aws', 'azure', 'gcp', 'tensorflow', 'pytorch', 'machine learning',
    'deep learning', 'ai', 'nlp', 'computer vision', 'data science', 'big data', 'hadoop', 'spark',
    'linux', 'bash', 'shell scripting', 'api', 'rest', 'graphql', 'cybersecurity', 'penetration testing',
    'blockchain', 'smart contracts', 'devops', 'agile', 'scrum', 'software development', 'backend',
    'frontend', 'fullstack', 'data analysis', 'cloud computing', 'networking', 'system administration']


In [98]:
def contains_it_keyword(text):
    if pd.isnull(text): return False
    text = str(text).lower()
    return any(kw in text for kw in it_keywords)

# Áp dụng lọc với 3 cột
filtered_data = data[
    data['skills'].apply(contains_it_keyword) |
    data['related_skils_in_job'].apply(contains_it_keyword) |
    data['career_objective'].apply(contains_it_keyword)
]


In [99]:
# Gán nhãn IT (1) hoặc Non-IT (0) cho từng dòng
filtered_data['is_it'] = 1  # Tất cả các dòng trong filtered_data đều là IT

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['is_it'] = 1  # Tất cả các dòng trong filtered_data đều là IT


In [100]:
data['skills'] = data['skills'].fillna("")
data['related_skils_in_job'] = data['related_skils_in_job'].fillna("")
data['career_objective'] = data['career_objective'].fillna("")

# Kết hợp ba cột thành một để huấn luyện mô hình
data['combined_text'] = data['career_objective'] + " " + data['skills'] + " " + data['related_skils_in_job']

# Sử dụng TF-IDF để trích xuất đặc trưng
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(data['combined_text'])

# Gán nhãn cho toàn bộ dữ liệu
data['is_it'] = data['combined_text'].apply(lambda x: 1 if contains_it_keyword(x) else 0)

# Chia dữ liệu thành tập huấn luyện và kiểm tra
X_train, X_test, y_train, y_test = train_test_split(X, data['is_it'], test_size=0.2, random_state=42)

# Khởi tạo và huấn luyện mô hình Logistic Regression
model = LogisticRegression()
model.fit(X_train, y_train)

# Dự đoán trên tập kiểm tra
y_pred = model.predict(X_test)

# Đánh giá mô hình
print("\nĐánh giá mô hình phân loại:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Dự đoán trên toàn bộ dữ liệu
data['is_it_predicted'] = model.predict(X)

# Lọc các bài đăng IT dựa trên dự đoán của mô hình
data_it_predicted = data[data['is_it_predicted'] == 1]

choosen_column = [
    'career_objective',
    'skills',
    'major_field_of_studies',
    'related_skils_in_job',
    'positions',
    'job_position_name',
    'responsibilities',
    'experience_requirement',

]
data_it_predicted = data_it_predicted[choosen_column]
data_it_predicted
# Lưu kết quả dự đoán
data_it_predicted.to_csv("predicted_it_jobs.csv", index=False, encoding='utf-8')
print("Lưu danh sách việc làm IT dự đoán vào predicted_it_jobs.csv")


Đánh giá mô hình phân loại:
Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       206
           1       1.00      1.00      1.00      1703

    accuracy                           1.00      1909
   macro avg       1.00      1.00      1.00      1909
weighted avg       1.00      1.00      1.00      1909

Lưu danh sách việc làm IT dự đoán vào predicted_it_jobs.csv
