# 90:10(10-test)

In [None]:
import os
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# 파일에서 데이터 읽어오기
def read_data(file_path):
    """Read the file at ``file_path`` as UTF-8 text and return all of it.

    Args:
        file_path: Path of the file to open.

    Returns:
        The entire file contents as a single ``str``.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        return handle.read()

# 키워드 기반 라벨 할당
def assign_label(content, keywords):
    """Return the keyword that occurs most often in ``content``.

    Occurrences are counted with ``str.count``, i.e. case-sensitive,
    non-overlapping substring matches. When several keywords share the
    highest count, the one listed first in ``keywords`` wins.

    Args:
        content: Text to label.
        keywords: Iterable of candidate label strings.

    Returns:
        The most frequent keyword, or ``'unknown'`` when no keyword occurs
        in ``content`` or when ``keywords`` is empty.
    """
    keyword_counts = {keyword: content.count(keyword) for keyword in keywords}
    # Guard the empty case explicitly: falling back to 'unknown' through
    # max(..., default=...) and then indexing keyword_counts with it would
    # raise KeyError, because 'unknown' is never a key of the dict.
    if not keyword_counts:
        return 'unknown'
    assigned_label = max(keyword_counts, key=keyword_counts.get)
    return assigned_label if keyword_counts[assigned_label] > 0 else 'unknown'

# 파일 경로 설정
folder_path = r'C:/Users/User/Python_data_3000'
# Sort the listing: os.listdir returns entries in arbitrary order, and the
# seeded split below is only reproducible when the input order is stable.
file_paths = sorted(
    os.path.join(folder_path, f)
    for f in os.listdir(folder_path)
    if f.endswith('.txt')
)

keywords = ['error', 'debugging', 'algorithm', 'optimization']

# Read every file once; the raw text is the sample and the keyword-derived
# label is the target.
# NOTE(review): labels are computed from the same text the model sees, so the
# scores below measure how well the SVM reproduces the keyword heuristic.
corpus = [read_data(file_path) for file_path in file_paths]
labels = [assign_label(content, keywords) for content in corpus]

# Split the raw documents BEFORE vectorizing so the TF-IDF vocabulary and IDF
# weights are learned from the training portion only (no test-set leakage).
# stratify keeps the label proportions the same in both partitions.
corpus_train, corpus_test, y_train, y_test = train_test_split(
    corpus, labels, test_size=0.1, random_state=42, stratify=labels
)

# Extract TF-IDF features: fit on train, only transform test.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(corpus_train)
X_test = vectorizer.transform(corpus_test)

# Train the SVM model.
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

# Predict on the held-out test data.
y_pred = svm_model.predict(X_test)

# Report accuracy, precision, recall and F1-score (weighted by class support;
# zero_division=0 scores classes that are never predicted as 0).
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1-score: 1.0


# 80:20(20-test)

In [None]:
import os
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# 파일에서 데이터 읽어오기
def read_data(file_path):
    """Read the file at ``file_path`` as UTF-8 text and return all of it.

    Args:
        file_path: Path of the file to open.

    Returns:
        The entire file contents as a single ``str``.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        return handle.read()

# 키워드 기반 라벨 할당
def assign_label(content, keywords):
    """Return the keyword that occurs most often in ``content``.

    Occurrences are counted with ``str.count``, i.e. case-sensitive,
    non-overlapping substring matches. When several keywords share the
    highest count, the one listed first in ``keywords`` wins.

    Args:
        content: Text to label.
        keywords: Iterable of candidate label strings.

    Returns:
        The most frequent keyword, or ``'unknown'`` when no keyword occurs
        in ``content`` or when ``keywords`` is empty.
    """
    keyword_counts = {keyword: content.count(keyword) for keyword in keywords}
    # Guard the empty case explicitly: falling back to 'unknown' through
    # max(..., default=...) and then indexing keyword_counts with it would
    # raise KeyError, because 'unknown' is never a key of the dict.
    if not keyword_counts:
        return 'unknown'
    assigned_label = max(keyword_counts, key=keyword_counts.get)
    return assigned_label if keyword_counts[assigned_label] > 0 else 'unknown'

# 파일 경로 설정
folder_path = r'C:/Users/User/Python_data_3000'
# Sort the listing: os.listdir returns entries in arbitrary order, and the
# seeded split below is only reproducible when the input order is stable.
file_paths = sorted(
    os.path.join(folder_path, f)
    for f in os.listdir(folder_path)
    if f.endswith('.txt')
)

keywords = ['error', 'debugging', 'algorithm', 'optimization']

# Read every file once; the raw text is the sample and the keyword-derived
# label is the target.
# NOTE(review): labels are computed from the same text the model sees, so the
# scores below measure how well the SVM reproduces the keyword heuristic.
corpus = [read_data(file_path) for file_path in file_paths]
labels = [assign_label(content, keywords) for content in corpus]

# Split the raw documents BEFORE vectorizing so the TF-IDF vocabulary and IDF
# weights are learned from the training portion only (no test-set leakage).
# stratify keeps the label proportions the same in both partitions.
corpus_train, corpus_test, y_train, y_test = train_test_split(
    corpus, labels, test_size=0.2, random_state=42, stratify=labels
)

# Extract TF-IDF features: fit on train, only transform test.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(corpus_train)
X_test = vectorizer.transform(corpus_test)

# Train the SVM model.
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

# Predict on the held-out test data.
y_pred = svm_model.predict(X_test)

# Report accuracy, precision, recall and F1-score (weighted by class support;
# zero_division=0 scores classes that are never predicted as 0).
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Accuracy: 0.9983333333333333
Precision: 0.9966694444444445
Recall: 0.9983333333333333
F1-score: 0.9975006950236309


# 70:30(30-test)

In [None]:
import os
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# 파일에서 데이터 읽어오기
def read_data(file_path):
    """Read the file at ``file_path`` as UTF-8 text and return all of it.

    Args:
        file_path: Path of the file to open.

    Returns:
        The entire file contents as a single ``str``.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        return handle.read()

# 키워드 기반 라벨 할당
def assign_label(content, keywords):
    """Return the keyword that occurs most often in ``content``.

    Occurrences are counted with ``str.count``, i.e. case-sensitive,
    non-overlapping substring matches. When several keywords share the
    highest count, the one listed first in ``keywords`` wins.

    Args:
        content: Text to label.
        keywords: Iterable of candidate label strings.

    Returns:
        The most frequent keyword, or ``'unknown'`` when no keyword occurs
        in ``content`` or when ``keywords`` is empty.
    """
    keyword_counts = {keyword: content.count(keyword) for keyword in keywords}
    # Guard the empty case explicitly: falling back to 'unknown' through
    # max(..., default=...) and then indexing keyword_counts with it would
    # raise KeyError, because 'unknown' is never a key of the dict.
    if not keyword_counts:
        return 'unknown'
    assigned_label = max(keyword_counts, key=keyword_counts.get)
    return assigned_label if keyword_counts[assigned_label] > 0 else 'unknown'

# 파일 경로 설정
folder_path = r'C:/Users/User/Python_data_3000'
# Sort the listing: os.listdir returns entries in arbitrary order, and the
# seeded split below is only reproducible when the input order is stable.
file_paths = sorted(
    os.path.join(folder_path, f)
    for f in os.listdir(folder_path)
    if f.endswith('.txt')
)

keywords = ['error', 'debugging', 'algorithm', 'optimization']

# Read every file once; the raw text is the sample and the keyword-derived
# label is the target.
# NOTE(review): labels are computed from the same text the model sees, so the
# scores below measure how well the SVM reproduces the keyword heuristic.
corpus = [read_data(file_path) for file_path in file_paths]
labels = [assign_label(content, keywords) for content in corpus]

# Split the raw documents BEFORE vectorizing so the TF-IDF vocabulary and IDF
# weights are learned from the training portion only (no test-set leakage).
# stratify keeps the label proportions the same in both partitions, matching
# the split configuration used for the other train/test ratios in this file.
corpus_train, corpus_test, y_train, y_test = train_test_split(
    corpus, labels, test_size=0.3, random_state=42, stratify=labels
)

# Extract TF-IDF features: fit on train, only transform test.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(corpus_train)
X_test = vectorizer.transform(corpus_test)

# Train the SVM model.
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

# Predict on the held-out test data.
y_pred = svm_model.predict(X_test)

# Report accuracy, precision, recall and F1-score (weighted by class support;
# zero_division=0 scores classes that are never predicted as 0).
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1-score: 1.0


# 60:40(40-test)

In [None]:
import os
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# 파일에서 데이터 읽어오기
def read_data(file_path):
    """Read the file at ``file_path`` as UTF-8 text and return all of it.

    Args:
        file_path: Path of the file to open.

    Returns:
        The entire file contents as a single ``str``.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        return handle.read()

# 키워드 기반 라벨 할당
def assign_label(content, keywords):
    """Return the keyword that occurs most often in ``content``.

    Occurrences are counted with ``str.count``, i.e. case-sensitive,
    non-overlapping substring matches. When several keywords share the
    highest count, the one listed first in ``keywords`` wins.

    Args:
        content: Text to label.
        keywords: Iterable of candidate label strings.

    Returns:
        The most frequent keyword, or ``'unknown'`` when no keyword occurs
        in ``content`` or when ``keywords`` is empty.
    """
    keyword_counts = {keyword: content.count(keyword) for keyword in keywords}
    # Guard the empty case explicitly: falling back to 'unknown' through
    # max(..., default=...) and then indexing keyword_counts with it would
    # raise KeyError, because 'unknown' is never a key of the dict.
    if not keyword_counts:
        return 'unknown'
    assigned_label = max(keyword_counts, key=keyword_counts.get)
    return assigned_label if keyword_counts[assigned_label] > 0 else 'unknown'

# 파일 경로 설정
folder_path = r'C:/Users/User/Python_data_3000'
# Sort the listing: os.listdir returns entries in arbitrary order, and the
# seeded split below is only reproducible when the input order is stable.
file_paths = sorted(
    os.path.join(folder_path, f)
    for f in os.listdir(folder_path)
    if f.endswith('.txt')
)

keywords = ['error', 'debugging', 'algorithm', 'optimization']

# Read every file once; the raw text is the sample and the keyword-derived
# label is the target.
# NOTE(review): labels are computed from the same text the model sees, so the
# scores below measure how well the SVM reproduces the keyword heuristic.
corpus = [read_data(file_path) for file_path in file_paths]
labels = [assign_label(content, keywords) for content in corpus]

# Split the raw documents BEFORE vectorizing so the TF-IDF vocabulary and IDF
# weights are learned from the training portion only (no test-set leakage).
# stratify keeps the label proportions the same in both partitions.
corpus_train, corpus_test, y_train, y_test = train_test_split(
    corpus, labels, test_size=0.4, random_state=42, stratify=labels
)

# Extract TF-IDF features: fit on train, only transform test.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(corpus_train)
X_test = vectorizer.transform(corpus_test)

# Train the SVM model.
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

# Predict on the held-out test data.
y_pred = svm_model.predict(X_test)

# Report accuracy, precision, recall and F1-score (weighted by class support;
# zero_division=0 scores classes that are never predicted as 0).
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Accuracy: 0.9991666666666666
Precision: 0.9983340277777777
Recall: 0.9991666666666666
F1-score: 0.9987501736834793


# 50:50(50-test)

In [None]:
import os
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# 파일에서 데이터 읽어오기
def read_data(file_path):
    """Read the file at ``file_path`` as UTF-8 text and return all of it.

    Args:
        file_path: Path of the file to open.

    Returns:
        The entire file contents as a single ``str``.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        return handle.read()

# 키워드 기반 라벨 할당
def assign_label(content, keywords):
    """Return the keyword that occurs most often in ``content``.

    Occurrences are counted with ``str.count``, i.e. case-sensitive,
    non-overlapping substring matches. When several keywords share the
    highest count, the one listed first in ``keywords`` wins.

    Args:
        content: Text to label.
        keywords: Iterable of candidate label strings.

    Returns:
        The most frequent keyword, or ``'unknown'`` when no keyword occurs
        in ``content`` or when ``keywords`` is empty.
    """
    keyword_counts = {keyword: content.count(keyword) for keyword in keywords}
    # Guard the empty case explicitly: falling back to 'unknown' through
    # max(..., default=...) and then indexing keyword_counts with it would
    # raise KeyError, because 'unknown' is never a key of the dict.
    if not keyword_counts:
        return 'unknown'
    assigned_label = max(keyword_counts, key=keyword_counts.get)
    return assigned_label if keyword_counts[assigned_label] > 0 else 'unknown'

# 파일 경로 설정
folder_path = r'C:/Users/User/Python_data_3000'
# Sort the listing: os.listdir returns entries in arbitrary order, and the
# seeded split below is only reproducible when the input order is stable.
file_paths = sorted(
    os.path.join(folder_path, f)
    for f in os.listdir(folder_path)
    if f.endswith('.txt')
)

keywords = ['error', 'debugging', 'algorithm', 'optimization']

# Read every file once; the raw text is the sample and the keyword-derived
# label is the target.
# NOTE(review): labels are computed from the same text the model sees, so the
# scores below measure how well the SVM reproduces the keyword heuristic.
corpus = [read_data(file_path) for file_path in file_paths]
labels = [assign_label(content, keywords) for content in corpus]

# Split the raw documents BEFORE vectorizing so the TF-IDF vocabulary and IDF
# weights are learned from the training portion only (no test-set leakage).
# stratify keeps the label proportions the same in both partitions.
corpus_train, corpus_test, y_train, y_test = train_test_split(
    corpus, labels, test_size=0.5, random_state=42, stratify=labels
)

# Extract TF-IDF features: fit on train, only transform test.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(corpus_train)
X_test = vectorizer.transform(corpus_test)

# Train the SVM model.
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

# Predict on the held-out test data.
y_pred = svm_model.predict(X_test)

# Report accuracy, precision, recall and F1-score (weighted by class support;
# zero_division=0 scores classes that are never predicted as 0).
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Accuracy: 0.9986666666666667
Precision: 0.9973351111111112
Recall: 0.9986666666666667
F1-score: 0.9980004447409383
