In [24]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE

In [25]:
# Load the MBTI dataset; later cells read its 'type' and 'posts' columns.
dataDF = pd.read_csv(filepath_or_buffer='./DATA/MBTI.csv')

In [26]:
# Encode each MBTI type string as an integer code, numbered in order of first
# appearance. enumerate() sizes the mapping from the data itself, so an
# unexpected extra label is never silently left out of the mapping (which is
# what zip(..., range(16)) did, leading to a KeyError on lookup).
# NOTE: this cell overwrites dataDF['type'] in place; re-running it without
# reloading the CSV rebuilds `labels` from the integer codes instead of the
# original strings.
labels = {mbti_type: code for code, mbti_type in enumerate(dataDF['type'].unique().tolist())}
dataDF['type'] = dataDF['type'].map(labels)

# Target number of rows per class after oversampling (these are targets, not
# observed frequencies): one entry of 5000 for every encoded class.
sample_dict = {code: 5000 for code in labels.values()}

# Configure SMOTE with the per-class targets and a fixed seed.
smote = SMOTE(random_state=42, sampling_strategy=sample_dict)

In [27]:
# TF-IDF vocabulary over all posts: at most 5000 terms, each required to
# appear in at least 3 documents.
vectorizer = TfidfVectorizer(min_df=3, max_features=5000)
# X_tfidf holds whatever fit() returns; later cells call .transform() on it.
X_tfidf = vectorizer.fit(
    dataDF['posts'].to_numpy().ravel()
)

In [28]:
# Stratified subsample -> TF-IDF -> SMOTE -> rebuild pseudo-posts from the
# highest-weighted terms of every (real or synthetic) TF-IDF row.

# Draw roughly 1/12 of the data while preserving class proportions. Each class
# is sampled with the same seed, in group-key order. Iterating the groups
# explicitly (instead of groupby.apply) does not depend on apply() passing the
# grouping column through to the lambda.
num_samples = dataDF.shape[0] // 12  # total number of rows to draw
sample_frac = num_samples / len(dataDF)
sampled_df = pd.concat(
    [group.sample(frac=sample_frac, random_state=42, replace=False)
     for _, group in dataDF.groupby('type')]
)
sampled_indices = list(sampled_df.index)

# Select the label and the text column by name, so an extra column in the CSV
# cannot be flattened into the document list.
targetDF = sampled_df['type']
featureDF = X_tfidf.transform(sampled_df['posts'].to_numpy()).toarray()

X_resampled, y_resampled = smote.fit_resample(featureDF, targetDF)

feature_names = vectorizer.get_feature_names_out()
df_tfidf = pd.DataFrame(X_resampled, columns=feature_names)

# For every row keep up to 500 terms with the highest TF-IDF weight, strongest
# first. Zero-weight terms are filtered out first: otherwise rows with fewer
# than 500 non-zero terms would be padded with vocabulary words that do not
# belong to the row at all.
lists = [
    ' '.join(row[row > 0].nlargest(500).index)
    for _, row in df_tfidf.iterrows()
]

X_resampled_df = pd.DataFrame(lists, columns=['posts'])
# pd.Series(...) accepts either a Series or a 1-D array coming back from SMOTE.
y_resampled_df = pd.DataFrame({'type': pd.Series(y_resampled).to_numpy()})
resampled_df = pd.concat([X_resampled_df, y_resampled_df], axis=1)

# Final frame: columns 'posts' and 'type', default 0..n-1 index.
re_sample_df = resampled_df.copy()

In [29]:
# Invert the encoding (integer code -> MBTI string) and decode the label
# column of the resampled frame; an unknown code raises KeyError.
r_labels = dict(zip(labels.values(), labels.keys()))
re_sample_df['type'] = re_sample_df['type'].map(r_labels.__getitem__)

In [38]:
# Persist the resampled frame (the index is written as the first column).
re_sample_df.to_csv(path_or_buf='mbti_sample.csv', encoding='utf-8')

In [11]:
# Display the MBTI-type -> integer-code mapping.
# (`labels.key` is not a dict attribute and raised AttributeError.)
labels

{'INTJ': 0,
 'INTP': 1,
 'ISFJ': 2,
 'ISFP': 3,
 'ISTJ': 4,
 'ISTP': 5,
 'ENFJ': 6,
 'ENFP': 7,
 'ENTJ': 8,
 'ENTP': 9,
 'ESFJ': 10,
 'ESFP': 11,
 'ESTJ': 12,
 'ESTP': 13,
 'INFJ': 14,
 'INFP': 15}

In [None]:
# Rows per 'type' value, as a plain {value: count} dict.
dataDF['type'].value_counts().to_dict()

In [None]:
# Rows per 'type' value, most frequent first.
dataDF['type'].value_counts()

In [None]:
# Feature frame = every column except the label; show its (rows, columns).
featureDF = dataDF.drop('type', axis=1)
featureDF.shape

In [None]:
# NOTE(review): `dataset` is not defined in any cell visible in this notebook —
# presumably a project helper object from an earlier session; confirm or remove.
dataset.DF_duplicated()

In [None]:
# Scratch cell: builds an empty frame with 'posts'/'type' columns, moves its
# (empty) index into an 'index' column, and displays it.
# NOTE: this rebinds re_sample_df, discarding any previously built result.
re_sample_df = pd.DataFrame(columns=['posts','type']).reset_index()
re_sample_df

In [None]:
# NOTE(review): `dataset` is not defined in any cell visible in this notebook —
# presumably a project helper object from an earlier session; confirm or remove.
dataset.DF_info()

In [None]:
# Count missing values in each column.
dataDF.isna().sum()

In [141]:
# NOTE(review): `dataset` is not defined in any cell visible in this notebook —
# presumably a project helper object from an earlier session; confirm or remove.
dataset.target_select('type')

In [15]:
# Rows whose encoded label is 0.
mask = dataDF['type'].eq(0)
type_one_rows = dataDF.loc[mask]

# Pick a random 40% of those rows (frac=0.4) with a fixed seed.
rows_to_drop = type_one_rows.sample(random_state=42, frac=0.4).index

# Remove the picked rows. dataDF is rebound here, so running this cell again
# drops a further 40% of what is left of class 0.
dataDF = dataDF.drop(index=rows_to_drop)

In [16]:
# Rebind `smote` to a sampler with a numeric sampling_strategy.
# NOTE(review): imbalanced-learn documents a numeric sampling_strategy as valid
# only for binary targets; the 'type' column here appears to be multi-class —
# confirm fit_resample does not raise, and use a dict or 'auto' if it does.
smote = SMOTE(random_state=42,sampling_strategy=1)

In [None]:
# Split the frame into the label column and the 'posts' column.
targetDF = dataDF['type']
featureDF = dataDF['posts']
# NOTE(review): featureDF is the untransformed 'posts' column (no TF-IDF step
# in this cell) — presumably SMOTE needs a numeric 2-D matrix; confirm this
# call can work at all.
X_resampled, y_resampled = smote.fit_resample(featureDF, targetDF)
# Stacks the two ORIGINAL Series as rows and transposes; the resampled outputs
# above are not used here.
df_tfidf = pd.DataFrame([featureDF,targetDF]).T

In [144]:
import random

In [145]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [146]:
# TF-IDF vocabulary over all posts: at most 10000 terms, each required to
# appear in at least 3 documents.
vectorizer = TfidfVectorizer(min_df=3, max_features=10000)
# X_tfidf holds whatever fit() returns; later cells call .transform() on it.
X_tfidf = vectorizer.fit(
    dataDF['posts'].to_numpy().ravel()
)

In [None]:
# Augment class 0: over 4 rounds, draw disjoint stratified subsamples of the
# data, run SMOTE on their TF-IDF vectors, turn every class-0 row of the SMOTE
# output back into a pseudo-post made of its top-weighted terms, and finally
# append all pseudo-posts to dataDF.
# NOTE(review): this relies on `smote`, `vectorizer` and `X_tfidf` from earlier
# cells; confirm `smote` is configured with a multi-class-capable strategy.
sampled_indices = []
synthetic_frames = []

# Sample only from the frame as it is BEFORE augmentation. Appending inside the
# loop would feed synthetic rows back into later rounds and introduce duplicate
# index labels that defeat the isin() exclusion below.
base_df = dataDF
num_samples = base_df.shape[0] // 10  # rows to aim for in each round
sample_frac = num_samples / len(base_df)
feature_names = vectorizer.get_feature_names_out()  # loop-invariant

for i in range(4):
    # Stratified sample, without replacement, from rows not used in earlier rounds.
    remaining = base_df[~base_df.index.isin(sampled_indices)]
    sampled_df = pd.concat(
        [group.sample(frac=sample_frac, random_state=42, replace=False)
         for _, group in remaining.groupby('type')]
    )
    sampled_indices.extend(sampled_df.index)

    # Select label and text by name so extra columns cannot leak into the documents.
    targetDF = sampled_df['type']
    featureDF = X_tfidf.transform(sampled_df['posts'].to_numpy()).toarray()
    X_resampled, y_resampled = smote.fit_resample(featureDF, targetDF)

    # Keep the class-0 rows of the resampled matrix.
    # NOTE(review): presumably this also contains the real class-0 rows of the
    # subsample, not only synthetic ones — confirm that duplication is wanted.
    class0_mask = pd.Series(y_resampled).eq(0).to_numpy()
    df_tfidf = pd.DataFrame(X_resampled[class0_mask], columns=feature_names)

    # Up to 500 highest-weighted terms per row, strongest first. Zero-weight
    # terms are excluded so short rows are not padded with unrelated vocabulary.
    lists = [
        ' '.join(row[row > 0].nlargest(500).index)
        for _, row in df_tfidf.iterrows()
    ]
    smoteDF = pd.DataFrame(lists, columns=['posts'])
    smoteDF['type'] = 0
    synthetic_frames.append(smoteDF)

# Single concat after the loop; ignore_index gives the result a clean, unique index.
dataDF = pd.concat([dataDF, *synthetic_frames], ignore_index=True)

In [None]:
# Rows per 'type' value, most frequent first.
dataDF['type'].value_counts()

In [14]:
# Reverse lookup: integer code -> MBTI type string.
r_labels = {code: mbti_type for mbti_type, code in labels.items()}

In [34]:
# Decode the integer label column back to MBTI strings; an unknown code raises KeyError.
dataDF['type'] = dataDF['type'].map(r_labels.__getitem__)


In [36]:
# Check the dtype of the label column after decoding.
dataDF['type'].dtype

dtype('O')

In [124]:
# Persist the augmented frame (the index is written as the first column).
dataDF.to_csv(path_or_buf='./DATA/MBTI_smote.csv')