## Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Read Files

In [None]:
# Load the corpus CSV from its GitHub raw URL into a DataFrame.
final_data = pd.read_csv(
    'https://github.com/ohgzone/file1/raw/main/aihub_coupus.csv',
)

# Preview the first five rows.
final_data.head(n=5)

In [None]:
# Print the column summary of the loaded DataFrame.
# Total 51,630 entries (as recorded in the original run).
final_data.info()

## Normalize text by removing English letters, numbers, and other non-Korean characters

In [None]:
# Display the '문장' (sentence) column.
final_data['문장']

In [None]:
# Boolean Series: True for every row whose sentence contains at least one
# character that is neither a Hangul syllable (가-힣) nor a space.
final_data['문장'].str.contains(pat='[^가-힣 ]')

In [None]:
# Check whether the '문장' column contains Latin letters or special characters
# (periods, commas, exclamation marks, question marks, ...) by showing the
# first ten rows that match.
non_korean_mask = final_data['문장'].str.contains('[^가-힣 ]')
final_data[non_korean_mask].values[:10]

In [None]:
# Delete every character that is not a Hangul syllable (가-힣) or a space
# from the '문장' column: digits, Latin letters, special characters, etc.
final_data['문장'] = final_data['문장'].str.replace(pat=r'[^가-힣 ]', repl=r'', regex=True)

# Sanity check: count the rows that still contain a disallowed character.
# Summing the boolean mask yields a row count (expected: 0); summing the
# matching strings themselves would concatenate them instead of counting.
final_data['문장'].str.contains(r'[^가-힣 ]').sum()

In [None]:
# Preview the first rows of the DataFrame.
final_data.head()

In [None]:
# Preview the last rows of the DataFrame.
final_data.tail()

## Preprocessing: check for nulls and remove duplicates

In [None]:
# Trim leading and trailing whitespace from every sentence.
stripped_sentences = final_data['문장'].str.strip()
final_data['문장'] = stripped_sentences

# Preview the last five rows.
final_data.tail(n=5)

In [None]:
# Per-column null counts, a blank line, then the number of sentences that
# duplicate an earlier row.
null_counts = final_data.isnull().sum()
duplicate_count = final_data['문장'].duplicated().sum()
print(null_counts, '', duplicate_count, sep='\n')

In [None]:
# Remove rows whose sentence repeats an earlier one, keeping the first
# occurrence; the DataFrame is modified in place.
final_data.drop_duplicates(subset='문장', keep='first', inplace=True)

# Row count recorded in the original run: 51,630 --> 51,574 (51,630 - 56).
final_data.info()

## Check label distribution

In [None]:
# Count how many samples each '감정' (emotion) label has, print the counts,
# then draw them as a bar chart.
label_counts = final_data['감정'].value_counts()
print(label_counts)

label_counts.plot(kind='bar')

## Encode label as numbers

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the '감정' column and overwrite that column with the
# resulting integer codes.
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(final_data['감정'])
final_data['감정'] = encoded_labels

# The position of each class in this array is its integer code.
print(encoder.classes_)

In [None]:
# Preview the last rows of the DataFrame.
final_data.tail()

## Split input (X) & output (Y)

In [None]:
# Inputs (X) come from the '문장' column, targets (Y) from the '감정' column.
features, labels = final_data['문장'].values, final_data['감정'].values

(features.shape, labels.shape)

In [None]:
# Show the first three samples.
print(features[:3])

# len() of each sample, computed once and reused for both statistics.
sample_lengths = [len(sample) for sample in features]
print(f'Max length of event word arrays : {max(sample_lengths):d}')
print(f'Avg length of event word arrays : {sum(sample_lengths)/len(features):.4f}')

In [None]:
# Histogram of sample lengths, split into 50 bins.
sample_lengths = list(map(len, features))
plt.hist(sample_lengths, bins=50)

plt.xlabel('length of samples')
plt.ylabel('number of samples')

plt.show()

## Split train set & test set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing. The split is stratified on the
# labels and seeded so that it is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=41, stratify=labels
)

(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

In [None]:
# Peek at the first two training samples and their labels.
x_train[:2], y_train[:2]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer with default settings.
tfidf = TfidfVectorizer()

# Convert text into TF-IDF features. The vectorizer is fitted on the
# training split only; the test split is transformed with that same fitted
# vectorizer.
x_train_v = tfidf.fit_transform(x_train)
x_test_v = tfidf.transform(x_test)

# Print the transformed training matrix (the TF-IDF values of the words in
# each sample).
print(x_train_v)

# Dimensions of the transformed training matrix.
x_train_v.shape

## Modeling

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Define the model. The seed is fixed (same value as the train/test split)
# so that the fitted forest and the reported score are reproducible.
rfc = RandomForestClassifier(random_state=41)

# Train the model on the TF-IDF features of the training split.
rfc.fit(x_train_v, y_train)

# Mean accuracy on the held-out test split.
rfc.score(x_test_v, y_test)

### Prediction

In [1]:
# Slice that keeps only the first test sample.
first_sample = x_test_v[:1]

print(f'TF-IDF of first validation dataset : {x_test_v[0]}')
print(f'Inverse TF-IDF Transformation of first validation dataset : {tfidf.inverse_transform(first_sample)}')

# Predict the encoded label, then show it next to the original label name.
predict = rfc.predict(first_sample)
(predict, encoder.inverse_transform(predict))

NameError: name 'x_test_v' is not defined