# 90:10(10-test)

In [None]:
import os
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Read the data from a file
def read_data(file_path):
    """Return the entire contents of the UTF-8 text file at ``file_path``."""
    with open(file_path, encoding='utf-8') as handle:
        return handle.read()

# Keyword-based label assignment
def assign_label(content, keywords):
    """Return the keyword that occurs most often in ``content``.

    Occurrences are counted with ``str.count`` (case-sensitive,
    non-overlapping substring matches). On a tie the keyword listed first
    in ``keywords`` wins.

    Args:
        content: Text to inspect.
        keywords: Candidate label strings.

    Returns:
        The most frequent keyword, or ``'unknown'`` when no keyword occurs
        in ``content`` or ``keywords`` is empty.
    """
    keyword_counts = {keyword: content.count(keyword) for keyword in keywords}
    # With no keywords there is nothing to look up; relying on max()'s
    # default here would lead to a KeyError on the count lookup below.
    if not keyword_counts:
        return 'unknown'
    assigned_label = max(keyword_counts, key=keyword_counts.get)
    return assigned_label if keyword_counts[assigned_label] > 0 else 'unknown'

# Path setup
folder_path = r'C:/Users/User/Python_data_3000'
# os.listdir() returns names in arbitrary order; sorting keeps the split
# below reproducible for a fixed random_state.
file_paths = [os.path.join(folder_path, f) for f in sorted(os.listdir(folder_path)) if f.endswith('.txt')]

keywords = ['error', 'debugging', 'algorithm', 'optimization']
corpus = []  # raw text of each file
labels = []  # keyword-derived label of each file

# Read every file and assign its label
# NOTE(review): labels are derived from keywords in the same text the
# classifier sees, so the scores measure how well the SVM reproduces that rule.
for file_path in file_paths:
    content = read_data(file_path)
    corpus.append(content)
    labels.append(assign_label(content, keywords))

# Split the raw documents into training and test sets (10% held out).
# Splitting before vectorizing keeps the test documents out of the TF-IDF
# vocabulary and IDF weights.
docs_train, docs_test, y_train, y_test = train_test_split(
    corpus, labels, test_size=0.1, random_state=42, stratify=labels
)

# TF-IDF feature extraction, fitted on the training documents only
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(docs_train)
X_test = vectorizer.transform(docs_test)
# Whole-corpus matrix (train-fitted vocabulary), kept available under this
# name for any code that reads it later.
X_vectorized = vectorizer.transform(corpus)

# Train the SVM model
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

# Predict on the test data
y_pred = svm_model.predict(X_test)

# Report accuracy, precision, recall and F1-score (weighted across classes)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1-score: 1.0


# 80:20(20-test)

In [None]:
import os
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Read the data from a file
def read_data(file_path):
    """Return the entire contents of the UTF-8 text file at ``file_path``."""
    with open(file_path, encoding='utf-8') as handle:
        return handle.read()

# Keyword-based label assignment
def assign_label(content, keywords):
    """Return the keyword that occurs most often in ``content``.

    Occurrences are counted with ``str.count`` (case-sensitive,
    non-overlapping substring matches). On a tie the keyword listed first
    in ``keywords`` wins.

    Args:
        content: Text to inspect.
        keywords: Candidate label strings.

    Returns:
        The most frequent keyword, or ``'unknown'`` when no keyword occurs
        in ``content`` or ``keywords`` is empty.
    """
    keyword_counts = {keyword: content.count(keyword) for keyword in keywords}
    # With no keywords there is nothing to look up; relying on max()'s
    # default here would lead to a KeyError on the count lookup below.
    if not keyword_counts:
        return 'unknown'
    assigned_label = max(keyword_counts, key=keyword_counts.get)
    return assigned_label if keyword_counts[assigned_label] > 0 else 'unknown'

# Path setup
folder_path = r'C:/Users/User/Python_data_3000'
# os.listdir() returns names in arbitrary order; sorting keeps the split
# below reproducible for a fixed random_state.
file_paths = [os.path.join(folder_path, f) for f in sorted(os.listdir(folder_path)) if f.endswith('.txt')]

keywords = ['error', 'debugging', 'algorithm', 'optimization']
corpus = []  # raw text of each file
labels = []  # keyword-derived label of each file

# Read every file and assign its label
# NOTE(review): labels are derived from keywords in the same text the
# classifier sees, so the scores measure how well the SVM reproduces that rule.
for file_path in file_paths:
    content = read_data(file_path)
    corpus.append(content)
    labels.append(assign_label(content, keywords))

# Split the raw documents into training and test sets (20% held out).
# Splitting before vectorizing keeps the test documents out of the TF-IDF
# vocabulary and IDF weights.
docs_train, docs_test, y_train, y_test = train_test_split(
    corpus, labels, test_size=0.2, random_state=42, stratify=labels
)

# TF-IDF feature extraction, fitted on the training documents only
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(docs_train)
X_test = vectorizer.transform(docs_test)
# Whole-corpus matrix (train-fitted vocabulary), kept available under this
# name for any code that reads it later.
X_vectorized = vectorizer.transform(corpus)

# Train the SVM model
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

# Predict on the test data
y_pred = svm_model.predict(X_test)

# Report accuracy, precision, recall and F1-score (weighted across classes)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Accuracy: 0.9983333333333333
Precision: 0.9966694444444445
Recall: 0.9983333333333333
F1-score: 0.9975006950236309


# 70:30(30-test)

In [None]:
import os
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Read the data from a file
def read_data(file_path):
    """Return the entire contents of the UTF-8 text file at ``file_path``."""
    with open(file_path, encoding='utf-8') as handle:
        return handle.read()

# Keyword-based label assignment
def assign_label(content, keywords):
    """Return the keyword that occurs most often in ``content``.

    Occurrences are counted with ``str.count`` (case-sensitive,
    non-overlapping substring matches). On a tie the keyword listed first
    in ``keywords`` wins.

    Args:
        content: Text to inspect.
        keywords: Candidate label strings.

    Returns:
        The most frequent keyword, or ``'unknown'`` when no keyword occurs
        in ``content`` or ``keywords`` is empty.
    """
    keyword_counts = {keyword: content.count(keyword) for keyword in keywords}
    # With no keywords there is nothing to look up; relying on max()'s
    # default here would lead to a KeyError on the count lookup below.
    if not keyword_counts:
        return 'unknown'
    assigned_label = max(keyword_counts, key=keyword_counts.get)
    return assigned_label if keyword_counts[assigned_label] > 0 else 'unknown'

# Path setup
folder_path = r'C:/Users/User/Python_data_3000'
# os.listdir() returns names in arbitrary order; sorting keeps the split
# below reproducible for a fixed random_state.
file_paths = [os.path.join(folder_path, f) for f in sorted(os.listdir(folder_path)) if f.endswith('.txt')]

keywords = ['error', 'debugging', 'algorithm', 'optimization']
corpus = []  # raw text of each file
labels = []  # keyword-derived label of each file

# Read every file and assign its label
# NOTE(review): labels are derived from keywords in the same text the
# classifier sees, so the scores measure how well the SVM reproduces that rule.
for file_path in file_paths:
    content = read_data(file_path)
    corpus.append(content)
    labels.append(assign_label(content, keywords))

# Split the raw documents into training and test sets (30% held out).
# Splitting before vectorizing keeps the test documents out of the TF-IDF
# vocabulary and IDF weights. The split is stratified by label so the class
# proportions are the same in both parts.
docs_train, docs_test, y_train, y_test = train_test_split(
    corpus, labels, test_size=0.3, random_state=42, stratify=labels
)

# TF-IDF feature extraction, fitted on the training documents only
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(docs_train)
X_test = vectorizer.transform(docs_test)
# Whole-corpus matrix (train-fitted vocabulary), kept available under this
# name for any code that reads it later.
X_vectorized = vectorizer.transform(corpus)

# Train the SVM model
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

# Predict on the test data
y_pred = svm_model.predict(X_test)

# Report accuracy, precision, recall and F1-score (weighted across classes)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1-score: 1.0


# 60:40(40-test)

In [None]:
import os
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Read the data from a file
def read_data(file_path):
    """Return the entire contents of the UTF-8 text file at ``file_path``."""
    with open(file_path, encoding='utf-8') as handle:
        return handle.read()

# Keyword-based label assignment
def assign_label(content, keywords):
    """Return the keyword that occurs most often in ``content``.

    Occurrences are counted with ``str.count`` (case-sensitive,
    non-overlapping substring matches). On a tie the keyword listed first
    in ``keywords`` wins.

    Args:
        content: Text to inspect.
        keywords: Candidate label strings.

    Returns:
        The most frequent keyword, or ``'unknown'`` when no keyword occurs
        in ``content`` or ``keywords`` is empty.
    """
    keyword_counts = {keyword: content.count(keyword) for keyword in keywords}
    # With no keywords there is nothing to look up; relying on max()'s
    # default here would lead to a KeyError on the count lookup below.
    if not keyword_counts:
        return 'unknown'
    assigned_label = max(keyword_counts, key=keyword_counts.get)
    return assigned_label if keyword_counts[assigned_label] > 0 else 'unknown'

# Path setup
folder_path = r'C:/Users/User/Python_data_3000'
# os.listdir() returns names in arbitrary order; sorting keeps the split
# below reproducible for a fixed random_state.
file_paths = [os.path.join(folder_path, f) for f in sorted(os.listdir(folder_path)) if f.endswith('.txt')]

keywords = ['error', 'debugging', 'algorithm', 'optimization']
corpus = []  # raw text of each file
labels = []  # keyword-derived label of each file

# Read every file and assign its label
# NOTE(review): labels are derived from keywords in the same text the
# classifier sees, so the scores measure how well the SVM reproduces that rule.
for file_path in file_paths:
    content = read_data(file_path)
    corpus.append(content)
    labels.append(assign_label(content, keywords))

# Split the raw documents into training and test sets (40% held out).
# Splitting before vectorizing keeps the test documents out of the TF-IDF
# vocabulary and IDF weights.
docs_train, docs_test, y_train, y_test = train_test_split(
    corpus, labels, test_size=0.4, random_state=42, stratify=labels
)

# TF-IDF feature extraction, fitted on the training documents only
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(docs_train)
X_test = vectorizer.transform(docs_test)
# Whole-corpus matrix (train-fitted vocabulary), kept available under this
# name for any code that reads it later.
X_vectorized = vectorizer.transform(corpus)

# Train the SVM model
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

# Predict on the test data
y_pred = svm_model.predict(X_test)

# Report accuracy, precision, recall and F1-score (weighted across classes)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Accuracy: 0.9991666666666666
Precision: 0.9983340277777777
Recall: 0.9991666666666666
F1-score: 0.9987501736834793


# 50:50(50-test)

In [None]:
import os
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Read the data from a file
def read_data(file_path):
    """Return the entire contents of the UTF-8 text file at ``file_path``."""
    with open(file_path, encoding='utf-8') as handle:
        return handle.read()

# Keyword-based label assignment
def assign_label(content, keywords):
    """Return the keyword that occurs most often in ``content``.

    Occurrences are counted with ``str.count`` (case-sensitive,
    non-overlapping substring matches). On a tie the keyword listed first
    in ``keywords`` wins.

    Args:
        content: Text to inspect.
        keywords: Candidate label strings.

    Returns:
        The most frequent keyword, or ``'unknown'`` when no keyword occurs
        in ``content`` or ``keywords`` is empty.
    """
    keyword_counts = {keyword: content.count(keyword) for keyword in keywords}
    # With no keywords there is nothing to look up; relying on max()'s
    # default here would lead to a KeyError on the count lookup below.
    if not keyword_counts:
        return 'unknown'
    assigned_label = max(keyword_counts, key=keyword_counts.get)
    return assigned_label if keyword_counts[assigned_label] > 0 else 'unknown'

# Path setup
folder_path = r'C:/Users/User/Python_data_3000'
# os.listdir() returns names in arbitrary order; sorting keeps the split
# below reproducible for a fixed random_state.
file_paths = [os.path.join(folder_path, f) for f in sorted(os.listdir(folder_path)) if f.endswith('.txt')]

keywords = ['error', 'debugging', 'algorithm', 'optimization']
corpus = []  # raw text of each file
labels = []  # keyword-derived label of each file

# Read every file and assign its label
# NOTE(review): labels are derived from keywords in the same text the
# classifier sees, so the scores measure how well the SVM reproduces that rule.
for file_path in file_paths:
    content = read_data(file_path)
    corpus.append(content)
    labels.append(assign_label(content, keywords))

# Split the raw documents into training and test sets (50% held out).
# Splitting before vectorizing keeps the test documents out of the TF-IDF
# vocabulary and IDF weights.
docs_train, docs_test, y_train, y_test = train_test_split(
    corpus, labels, test_size=0.5, random_state=42, stratify=labels
)

# TF-IDF feature extraction, fitted on the training documents only
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(docs_train)
X_test = vectorizer.transform(docs_test)
# Whole-corpus matrix (train-fitted vocabulary), kept available under this
# name for any code that reads it later.
X_vectorized = vectorizer.transform(corpus)

# Train the SVM model
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

# Predict on the test data
y_pred = svm_model.predict(X_test)

# Report accuracy, precision, recall and F1-score (weighted across classes)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Accuracy: 0.9986666666666667
Precision: 0.9973351111111112
Recall: 0.9986666666666667
F1-score: 0.9980004447409383


# 나이브 베이지안(Naive Bayesian) 알고리즘

In [None]:
import os
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Keywords that mark a question as problem-solving related
problem_solving_keywords = [
    "error",
    "debugging",
    "algorithm",
    "optimization",
]

# Directory that holds the input .txt files
directory_path = "C:/Users/User/Python_data_3000"

# Load the data and label each line
def load_and_label_questions(directory, keywords):
    """Read every ``.txt`` file in ``directory`` and label its lines.

    Each non-blank line is stripped and lower-cased, then labelled with the
    first entry of ``keywords`` that occurs in it as a substring, or ``'no'``
    when none does. Matching is done against the lower-cased line, so a
    keyword containing upper-case characters never matches.

    Args:
        directory: Folder to scan; only names ending in ``.txt`` are read.
        keywords: Candidate labels, checked in the given order.

    Returns:
        A DataFrame with the columns ``language`` (file name without the
        ``.txt`` extension), ``question`` and ``label``. The columns are
        present even when no rows were found.
    """
    columns = ["language", "question", "label"]
    data = []
    # os.listdir() order is arbitrary; sort so the row order is reproducible.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".txt"):
            continue
        # Strip only the extension, so a name with ".txt" in the middle is
        # not cut short.
        language = os.path.splitext(filename)[0]
        filepath = os.path.join(directory, filename)
        with open(filepath, 'r', encoding='utf-8') as file:
            for line in file:
                line = line.strip().lower()
                if not line:
                    # A blank line is not a question; skip it.
                    continue
                label = next((keyword for keyword in keywords if keyword in line), 'no')
                data.append({"language": language, "question": line, "label": label})
    return pd.DataFrame(data, columns=columns)

# Classify questions with the Naive Bayes algorithm
def classify_questions(data):
    """Train and evaluate a multinomial Naive Bayes classifier on the questions.

    The target is binary: 1 when a row's ``label`` is anything other than
    ``'no'``, otherwise 0. 25% of the rows are held out, and the
    classification report for that held-out part is printed.

    Args:
        data: DataFrame with a ``question`` text column and a ``label`` column.

    Returns:
        Tuple ``(classifier, vectorizer)``: the fitted ``MultinomialNB`` and
        the ``CountVectorizer`` fitted on the training questions.
    """
    y = (data['label'] != 'no').astype(int)  # 1 = problem-solving related, 0 = not

    # Split the raw questions first so the vocabulary is learned from the
    # training part only and the held-out questions stay unseen.
    questions_train, questions_test, y_train, y_test = train_test_split(
        data['question'], y, test_size=0.25, random_state=42
    )

    vectorizer = CountVectorizer(stop_words='english')
    X_train = vectorizer.fit_transform(questions_train)
    X_test = vectorizer.transform(questions_test)

    classifier = MultinomialNB()
    classifier.fit(X_train, y_train)

    y_pred = classifier.predict(X_test)
    print(classification_report(y_test, y_pred))

    return classifier, vectorizer

# Save the data split across several Excel files
def save_to_multiple_excel_files(data, chunk_size=1048576, base_filename='classified_questions_part'):
    """Write ``data`` to consecutive ``.xlsx`` files of limited row count.

    Files are named ``{base_filename}_{k}.xlsx`` with ``k`` starting at 1 and
    are written without the index. Nothing is written when ``data`` is empty.

    Args:
        data: DataFrame to export.
        chunk_size: Maximum number of data rows per file. Values above
            1,048,575 are reduced to that number so that the header row and
            the data rows together fit into one worksheet.
        base_filename: Prefix of the generated file names.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    # A worksheet holds at most 1,048,576 rows and to_excel() writes a header
    # row first, so one row of the sheet is not available for data.
    max_data_rows = 1048576 - 1
    rows_per_file = min(chunk_size, max_data_rows)

    for i, start_index in enumerate(range(0, len(data), rows_per_file)):
        chunk_data = data.iloc[start_index:start_index + rows_per_file]
        filename = f"{base_filename}_{i+1}.xlsx"
        chunk_data.to_excel(filename, index=False)
        print(f"Data chunk {i+1} saved to {filename}")

# Main entry point
def main():
    """Load and label the questions, train and evaluate the classifier, then export the labelled data to Excel."""
    labeled_questions = load_and_label_questions(directory_path, problem_solving_keywords)
    # The fitted model and vectorizer are returned but not used further here.
    _classifier, _vectorizer = classify_questions(labeled_questions)
    save_to_multiple_excel_files(labeled_questions)

# Run when executed as a script
if __name__ == "__main__":
    main()

              precision    recall  f1-score   support

           0       1.00      0.99      1.00   1969279
           1       0.73      0.93      0.82     43073

    accuracy                           0.99   2012352
   macro avg       0.87      0.96      0.91   2012352
weighted avg       0.99      0.99      0.99   2012352

Data chunk 1 saved to classified_questions_part_1.xlsx
Data chunk 2 saved to classified_questions_part_2.xlsx
Data chunk 3 saved to classified_questions_part_3.xlsx
Data chunk 4 saved to classified_questions_part_4.xlsx
Data chunk 5 saved to classified_questions_part_5.xlsx
Data chunk 6 saved to classified_questions_part_6.xlsx
Data chunk 7 saved to classified_questions_part_7.xlsx
Data chunk 8 saved to classified_questions_part_8.xlsx
