# 오버워치2 리뷰 감성분석

## 데이터 준비
* 데이터프레임 : user_reviews (유저리뷰)
* 열이름
    * text : 리뷰본문
    * label : 0(negative) | 1(positive)

In [1]:
import pandas as pd

In [2]:
user_reviews = pd.read_csv("최종_유저데이터4차.csv", encoding='utf-8-sig')

In [3]:
# id와 리뷰 본문만 추출
text = user_reviews[['SteamID64','text']]

## 라벨 컬럼 추가
* VADER 약라벨 생성
    * VADER? - 수작업으로 라벨링된 데이터가 없는 상황에서 긍/부정을 빠르게 분류
    * 학습에 필요한 모델이 아님
        * 전체 데이터에 적용하여 감성 점수 부여.
        * 이 점수를 나중에 지도 학습 모델의 훈련 데이터로 활용 가능

In [4]:
import pandas as pd
import re
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

In [5]:
# VADER 준비 (최초 1회만 필요)
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [6]:
# 함수: 감성 점수 → 라벨
def vader_label(text):
    comp = sia.polarity_scores(str(text))['compound']
    if comp >= 0.05:
        return 'positive'
    elif comp <= -0.05:
        return 'negative'
    else:
        return 'neutral'

In [7]:
# 약라벨 컬럼 생성
text['vader_label'] = text['text'].apply(vader_label)
text['compound'] = text['text'].apply(lambda x: sia.polarity_scores(str(x))['compound'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text['vader_label'] = text['text'].apply(vader_label)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text['compound'] = text['text'].apply(lambda x: sia.polarity_scores(str(x))['compound'])


In [8]:
text.head()

Unnamed: 0,SteamID64,text,vader_label,compound
0,76561198089026294,really bad,negative,-0.5849
1,76561198840751916,wen game,neutral,0.0
2,76561199758331866,ww,neutral,0.0
3,76561198309660819,"Really annoying sometimes, really fun sometimes",positive,0.1531
4,76561199125555677,I love this balanced game and matchmaking is a...,positive,0.8402


In [9]:
text = text.dropna(subset=["text"]).reset_index(drop=True)

In [10]:
# 절대값이 작은 샘플은 제거해서 신뢰도를 높임
text_filt = text[text["compound"].abs() >= 0.2].reset_index(drop=True)

In [11]:
# neutral 라벨 제거 - 긍정/부정만 남은 이진 분류 전용 데이터프레임을 만듦
bin_df = text_filt[text_filt["vader_label"] != "neutral"].reset_index(drop=True)

In [12]:
# nagative:0, positive:1로 고치기 -  딥러닝 모델 학습용 레이블로 사용 가능
bin_df["label_id"] = bin_df["vader_label"].map({"negative":0, "positive":1})

In [13]:
# 데이터 긍/부정 비율 확인
print(bin_df["vader_label"].value_counts())

vader_label
positive    5090
negative    3422
Name: count, dtype: int64


## train/test로 나누기

* train_df : 학습용 80%
* valid_df : 검증용 10%
* test_df : 최종 평가용 10%

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# train떼어내기
train_df, temp_df = train_test_split(
    bin_df, test_size=0.2, random_state=42, stratify=bin_df["label_id"])

In [16]:
# temp_df를 valid/test로 나누기
valid_df, test_df = train_test_split(
    temp_df, test_size=0.5, random_state=42, stratify=temp_df["label_id"])

### 전처리

In [17]:
EMOJI_RE = re.compile(
    "[" 
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags
    u"\U00002700-\U000027BF"
    u"\U0001F900-\U0001F9FF"
    u"\U00002600-\U000026FF"
    u"\U00002B00-\U00002BFF"
    u"\U000024C2-\U0001F251"
    "]+", flags=re.UNICODE
)

URL_RE      = re.compile(r"(https?://\S+|www\.\S+)")
EMAIL_RE    = re.compile(r"\b[\w\.-]+@[\w\.-]+\.\w+\b")
HTML_TAG_RE = re.compile(r"<[^>]+>")
USER_MENT_RE= re.compile(r"@\w+")        # @user
HASHTAG_RE  = re.compile(r"#(\w+)")      # #word -> word
WS_RE       = re.compile(r"\s+")

In [19]:
def clean_text(s: str) -> str:
    s = str(s)
    s = URL_RE.sub(" ", s)
    s = EMAIL_RE.sub(" ", s)
    s = HTML_TAG_RE.sub(" ", s)
    s = USER_MENT_RE.sub(" ", s)
    s = HASHTAG_RE.sub(r"\1", s)         # 해시태그 기호만 제거, 단어는 보존
    s = EMOJI_RE.sub(" ", s)
    # 스마트 따옴표 등 통일(선택)
    s = s.replace("’","'").replace("“","\"").replace("”","\"")
    # 소문자화 (DistilBERT-uncased와 일관성)
    s = s.lower()
    # 공백 정리
    s = WS_RE.sub(" ", s).strip()
    return s

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, classification_report

# 파이프라인 정의 + 학습
pipe = Pipeline([
    ("tfidf", TfidfVectorizer(
        ngram_range=(1,2),
        min_df=2, max_df=0.95,
        sublinear_tf=True,
        stop_words="english"   # 영어면 유지, 아니면 제거
    )),
    ("clf", LinearSVC(class_weight="balanced", random_state=42))
])

pipe.fit(train_df["text_clean"], train_df["label_id"])

# 평가(원하면 valid/test 둘 다)
val_pred  = pipe.predict(valid_df["text_clean"])
test_pred = pipe.predict(test_df["text_clean"])

print("VALID acc:", accuracy_score(valid_df["label_id"], val_pred), "f1:", f1_score(valid_df["label_id"], val_pred))
print("TEST  acc:", accuracy_score(test_df["label_id"],  test_pred), "f1:", f1_score(test_df["label_id"],  test_pred))
print(classification_report(test_df["label_id"], test_pred, digits=4))


VALID acc: 0.8648648648648649 f1: 0.8851148851148851
TEST  acc: 0.8615023474178404 f1: 0.8805668016194332
              precision    recall  f1-score   support

           0     0.8016    0.8717    0.8352       343
           1     0.9081    0.8546    0.8806       509

    accuracy                         0.8615       852
   macro avg     0.8549    0.8632    0.8579       852
weighted avg     0.8653    0.8615    0.8623       852



In [24]:
# 각 split에 전처리 열 추가 (원문은 보존)
for _df in (train_df, valid_df, test_df):
    _df["text_clean"] = _df["text"].apply(clean_text)
    # 길이 0된 건 드랍
    _df.dropna(subset=["text_clean"], inplace=True)
    _df = _df[_df["text_clean"].str.len() > 0].reset_index(drop=True)

In [25]:
train_df[["text", "text_clean", "label_id"]].head(5)

Unnamed: 0,text,text_clean,label_id
8112,Reinstall the game after years of not playing ...,reinstall the game after years of not playing ...,0
4597,CNM SB Blizzard Log in to a game and bind the ...,cnm sb blizzard log in to a game and bind the ...,0
7060,There is no queue at all,there is no queue at all,0
3030,Product Received For Free It's Overwatch righ...,product received for free it's overwatch right...,1
5401,The first episode was much better,the first episode was much better,1


In [26]:
test_pred = pipe.predict(test_df["text_clean"])
print(accuracy_score(test_df["label_id"], test_pred))
print(classification_report(test_df["label_id"], test_pred))

0.8615023474178404
              precision    recall  f1-score   support

           0       0.80      0.87      0.84       343
           1       0.91      0.85      0.88       509

    accuracy                           0.86       852
   macro avg       0.85      0.86      0.86       852
weighted avg       0.87      0.86      0.86       852



* 정확도 : 약 0.86
* 1(긍정)예측이 0(부정)예측보다 조금 더 정확함

In [None]:
# 저장
train_df.to_pickle("20. train_df.pkl", protocol=5)
valid_df.to_pickle("20. valid_df.pkl", protocol=5)
test_df.to_pickle("20. test_df.pkl", protocol=5)

# 로드
# train_df = pd.read_pickle("20. train_df.pkl")
# valid_df = pd.read_pickle("20. valid_df.pkl")
# test_df  = pd.read_pickle("20. test_df.pkl")