- **Nama:** Althaf Yudhistira
- **Email:** althafbanfsaj@gmail.com
- **ID Dicoding:** althafby

In [1]:
import pandas as pd
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE


In [2]:
df = pd.read_csv('D:/project_1_dicoding/data/clean_data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6424 entries, 0 to 6423
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   steam_id            6424 non-null   int64 
 1   review_id           6424 non-null   int64 
 2   review_text         6424 non-null   object
 3   voted_up            6424 non-null   bool  
 4   playtime_recorded   6424 non-null   int64 
 5   playtime_at_review  6424 non-null   int64 
dtypes: bool(1), int64(4), object(1)
memory usage: 257.3+ KB


In [3]:
df=df.drop(columns=['steam_id','review_id','playtime_at_review'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6424 entries, 0 to 6423
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   review_text        6424 non-null   object
 1   voted_up           6424 non-null   bool  
 2   playtime_recorded  6424 non-null   int64 
dtypes: bool(1), int64(1), object(1)
memory usage: 106.8+ KB


In [4]:
df['voted_up'].value_counts()

voted_up
True     5877
False     547
Name: count, dtype: int64

In [5]:
def has_non_latin(text):
    if not isinstance(text, str):
        return False
    return any(ord(char) > 0x7F for char in text)
df['has_non_latin'] = df['review_text'].apply(has_non_latin)

count = df['has_non_latin'].sum()
matching_texts = df[df['has_non_latin']]['review_text'].tolist()

print(f"Texts with non-Basic Latin characters: {count}")
print("Matching texts:", matching_texts)


Texts with non-Basic Latin characters: 17
Matching texts: ['its aight _ツ_ ive been addicted to this series since third gen', 'id finally found an anjanath id been hunting for a bit and right as i was about to begin attacking it a deviljho waltzed into the forest and picked up the anjanath in its mouth mercilessly swinging it around and smacking it into things with the carelessness of a clumsy child before rolling it at me like a bowling ball killing me instantly 1010 ω', 'its a fun game i wish it would have been more fun 재미는 어느 정도 있는 게임 더 재미 있었으면 좋았을 그런 게임', 'well great story and my friend is here also and hes asking for some pets and click like to pet her once フ _ _ l ミxノ ヽ ﾉ ヽ_ヽ___ 二つ', 'if you love to beat the absolute bricks off monsters the size of the smallest mom car this is the game to do that in in addition the drip is immaculate and you get a cat that is sometimes better than your real human teammates game has it all müsst ihr wissen', 'i cant stop playing world its just too 

In [6]:

def clean_text(text):
    if not isinstance(text, str): 
        return text
    

    text = text.strip()  
    text = re.sub(r'[^\x00-\xFF]', '', text)  
    text = re.sub(r"http\S+|www\.\S+", "", text, flags=re.IGNORECASE)  
    text = re.sub(r"[^a-zA-Z0-9\s.,!?]", "", text)  
    text = re.sub(r"\b(?:steam|github|http|www|monsterhunter)\w*\b", "", text, flags=re.IGNORECASE)  

    return text



# Apply cleaning to all columns
df["clean_text"] = df["review_text"].apply(clean_text)

In [7]:
def has_non_latin(text):
    if not isinstance(text, str):
        return False
    return any(ord(char) > 0x7F for char in text)

# Apply the function to the DataFrame
df['has_non_latin'] = df['clean_text'].apply(has_non_latin)

# Count and list matches
count = df['has_non_latin'].sum()
matching_texts = df[df['has_non_latin']]['review_text'].tolist()

print(f"Texts with non-Basic Latin characters: {count}")
print("Matching texts:", matching_texts)
df.drop(columns=['has_non_latin'], inplace=True)

Texts with non-Basic Latin characters: 0
Matching texts: []


In [8]:
df.head()

Unnamed: 0,review_text,voted_up,playtime_recorded,clean_text
0,accidentally left this game afk for a month,True,62712,accidentally left this game afk for a month
1,theres some content in it,True,18323,theres some content in it
2,great game lots of fun iceborne is a must was ...,True,48217,great game lots of fun iceborne is a must was ...
3,you beat shara ishvalda and thought you finish...,True,13316,you beat shara ishvalda and thought you finish...
4,nice game i enjoy dying all the time,True,2652,nice game i enjoy dying all the time


In [9]:
import re


def slang_handle(text):

    slang_map = {
        r"\b(u)\b": "you",
        r"\b(im)\b": "i am",
        r"\b(ur)\b": "your",
        r"\b(peak)\b": "outstanding",
        r"\b(mh)\b": "monster hunter",
        r"\b(mhw)\b": "monster hunter world",
        r"\b(1010)\b": "perfect",
        r"\b(grind)\b": "repetitive",
        r"\b(loop)\b": "repetitive",
        r"\b(gear)\b": "equipment",
        r"\b(bos)\b": "boss",
        r"\b(dlc)\b": "downloadable content",
        r"\b(coop)\b": "co-op",
        r"\b(bonk)\b": "hit",
        r"\b(bonkers)\b": "fun"
    }
    
    for pattern, replacement in slang_map.items():
        text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
    text = re.sub(r"\b\d+\b", "", text) 

    return text

df["clean_text"] = df["clean_text"].apply(slang_handle)

In [10]:
df.head()

Unnamed: 0,review_text,voted_up,playtime_recorded,clean_text
0,accidentally left this game afk for a month,True,62712,accidentally left this game afk for a month
1,theres some content in it,True,18323,theres some content in it
2,great game lots of fun iceborne is a must was ...,True,48217,great game lots of fun iceborne is a must was ...
3,you beat shara ishvalda and thought you finish...,True,13316,you beat shara ishvalda and thought you finish...
4,nice game i enjoy dying all the time,True,2652,nice game i enjoy dying all the time


## Auto Labeling Dengan VADER

In [11]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()
lexicon = sid.lexicon
lexicon.update({ 
    'grind': -2.5,
    'grinding': -2.3,
    'clunky': -2.7,
    'bug': -2.1,
    'slow': -1.8,
    'fucking': -3.0,
    'garbage': -3.5,
    'tedious': -2.4,
    'trash' : -3.2,
    'boring': -3.5,
    'overwhelming': -1.7,
    'immersive': 2.9,
    'incredible': 3.1,
    'phenomenal': 3.4,
    'addicting': 2.2,
    'addictive': 2.3,
    'outstanding': 3,

    


})

In [12]:
df.head()

Unnamed: 0,review_text,voted_up,playtime_recorded,clean_text
0,accidentally left this game afk for a month,True,62712,accidentally left this game afk for a month
1,theres some content in it,True,18323,theres some content in it
2,great game lots of fun iceborne is a must was ...,True,48217,great game lots of fun iceborne is a must was ...
3,you beat shara ishvalda and thought you finish...,True,13316,you beat shara ishvalda and thought you finish...
4,nice game i enjoy dying all the time,True,2652,nice game i enjoy dying all the time


In [13]:
df['compound'] = df['clean_text'].apply(
    lambda x: sid.polarity_scores(x)['compound']
)

def auto_label(row):
    if row['voted_up'] == 1: 
        if row['compound'] >= -0.15:  
            return 'positive'  
        else:
            return 'neutral'
    else:  
        if row['compound'] <= 0.15:  
            return 'negative'
        else:
            return 'neutral'

df['auto_label'] = df.apply(auto_label, axis=1)

-0.05 0.3

In [14]:
conflict_stats = df.groupby('voted_up')['auto_label'].value_counts(normalize=True)
print(conflict_stats)

voted_up  auto_label
False     negative      0.692870
          neutral       0.307130
True      positive      0.871703
          neutral       0.128297
Name: proportion, dtype: float64


In [15]:
print(df['auto_label'].value_counts(normalize=True))

auto_label
positive    0.797478
neutral     0.143524
negative    0.058998
Name: proportion, dtype: float64


In [16]:
print(df['auto_label'].value_counts())

auto_label
positive    5123
neutral      922
negative     379
Name: count, dtype: int64


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6424 entries, 0 to 6423
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   review_text        6424 non-null   object 
 1   voted_up           6424 non-null   bool   
 2   playtime_recorded  6424 non-null   int64  
 3   clean_text         6424 non-null   object 
 4   compound           6424 non-null   float64
 5   auto_label         6424 non-null   object 
dtypes: bool(1), float64(1), int64(1), object(3)
memory usage: 257.3+ KB


In [18]:
df=df.drop(columns=['compound','review_text','voted_up'])

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6424 entries, 0 to 6423
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   playtime_recorded  6424 non-null   int64 
 1   clean_text         6424 non-null   object
 2   auto_label         6424 non-null   object
dtypes: int64(1), object(2)
memory usage: 150.7+ KB


In [20]:
X= df['clean_text']
y=df['auto_label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,          
    random_state=42
)

## Feature Selection TF/IDF

In [21]:
vectorizer = TfidfVectorizer(
    max_features=5000,   
    min_df=2,            
    max_df=0.8,          
   stop_words='english',
   sublinear_tf=True
  
)

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test) 

In [22]:
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train_tfidf, y_train)

## Word Embbeding GloVe

In [23]:

def load_glove_embeddings(glove_path):
    embeddings = {}
    with open(glove_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

glove_path = 'glove.6B/glove.6B.100d.txt' 
glove_embeddings = load_glove_embeddings(glove_path)
embedding_dim = 100  

In [24]:
def text_to_glove_vector(text, embeddings, dim):
    words = text.split()
    vectors = []
    for word in words:
        if word in embeddings:
            vectors.append(embeddings[word])
    if len(vectors) == 0:
        return np.zeros(dim) 
    return np.mean(vectors, axis=0)


X_train_glove = np.array([text_to_glove_vector(text, glove_embeddings, embedding_dim) for text in X_train])
X_test_glove = np.array([text_to_glove_vector(text, glove_embeddings, embedding_dim) for text in X_test])


## Logistic Regression dengan TF/IDF mengguanakan class weight balanced

In [25]:

logistic_regression = LogisticRegression(C=1, max_iter=1000,class_weight='balanced',
                   solver='liblinear')
     
    # Melatih model Logistic Regression pada data pelatihan
logistic_regression.fit(X_train_tfidf, y_train)     
    # Prediksi sentimen pada data pelatihan dan data uji
y_pred_train_lr = logistic_regression.predict(X_train_tfidf)
y_pred_test_lr = logistic_regression.predict(X_test_tfidf)
 
     
    # Evaluasi akurasi model Logistic Regression pada data pelatihan
accuracy_train_lr = accuracy_score(y_pred_train_lr, y_train)
    # Evaluasi akurasi model Logistic Regression pada data uji
accuracy_test_lr = accuracy_score(y_pred_test_lr, y_test)
     
    # Menampilkan akurasi
print('Logistic Regression - accuracy_train:', f"{accuracy_test_lr:.2f}")
print('Logistic Regression - accuracy_test:', f"{accuracy_test_lr:.2f}")
print(classification_report(y_test, y_pred_test_lr))



Logistic Regression - accuracy_train: 0.85
Logistic Regression - accuracy_test: 0.85
              precision    recall  f1-score   support

    negative       0.65      0.47      0.55        76
     neutral       0.57      0.43      0.49       184
    positive       0.90      0.95      0.92      1025

    accuracy                           0.85      1285
   macro avg       0.71      0.62      0.66      1285
weighted avg       0.83      0.85      0.84      1285



## hyperparameter tuning

In [26]:

param_grid = {
    'C': [0.01, 0.1, 1],          
    'penalty': ['l1', 'l2'],      
    'class_weight': ['balanced', None]
}

grid = GridSearchCV(
    LogisticRegression(solver='liblinear', max_iter=1000),
    param_grid,
    cv=5,
    scoring='f1_macro'  
)
grid.fit(X_train_tfidf, y_train)

## Random Forest dengan GloVe mengguanakan class weight balanced

In [27]:
RF = RandomForestClassifier(
    n_estimators=200,
    max_depth=20,
    class_weight='balanced',
    random_state=42
)
RF.fit(X_train_glove, y_train)

y_pred = RF.predict(X_test_glove)
y_pred_train_lr = RF.predict(X_train_glove)
y_pred_test_lr = RF.predict(X_test_glove)
    # Evaluasi akurasi model Logistic Regression pada data pelatihan
accuracy_train_lr = accuracy_score(y_pred_train_lr, y_train)

    # Evaluasi akurasi model Logistic Regression pada data uji
accuracy_test_lr = accuracy_score(y_pred_test_lr, y_test)
print('RF - accuracy_train:', f"{accuracy_train_lr:.2f}")
print('RF - accuracy_test:', f"{accuracy_test_lr:.2f}")
print(classification_report(y_test, y_pred))



RF - accuracy_train: 1.00
RF - accuracy_test: 0.81
              precision    recall  f1-score   support

    negative       0.50      0.03      0.05        76
     neutral       1.00      0.08      0.14       184
    positive       0.81      1.00      0.89      1025

    accuracy                           0.81      1285
   macro avg       0.77      0.37      0.36      1285
weighted avg       0.82      0.81      0.74      1285



## Random Forest dengan GloVe mengguanakan class weight SMOTE

In [28]:
RF = RandomForestClassifier(
    n_estimators=200,
    max_depth=20,
    random_state=42
)
RF.fit(X_res, y_res)

y_pred = RF.predict(X_test_tfidf)
y_pred_train_lr = RF.predict(X_res)
y_pred_test_lr = RF.predict(X_test_tfidf)
    # Evaluasi akurasi model Logistic Regression pada data pelatihan
accuracy_train_lr = accuracy_score(y_pred_train_lr, y_res)

    # Evaluasi akurasi model Logistic Regression pada data uji
accuracy_test_lr = accuracy_score(y_pred_test_lr, y_test)
print('RF - accuracy_train:', f"{accuracy_train_lr:.2f}")
print('RF - accuracy_test:', f"{accuracy_test_lr:.2f}")
print(classification_report(y_test, y_pred))



RF - accuracy_train: 0.89
RF - accuracy_test: 0.73
              precision    recall  f1-score   support

    negative       0.43      0.42      0.42        76
     neutral       0.31      0.55      0.40       184
    positive       0.91      0.79      0.84      1025

    accuracy                           0.73      1285
   macro avg       0.55      0.59      0.56      1285
weighted avg       0.79      0.73      0.75      1285



## Inference

In [29]:

import joblib
joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')

joblib.dump(logistic_regression, 'sentiment_model.joblib')

['sentiment_model.joblib']