In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Load Data

In [None]:
# Read the review CSV from the Kaggle input directory into a DataFrame
reviews_csv = '/kaggle/input/pubg-app-reviews-by-version-langen/PUBG_V2.8.0_Cleaned.csv'
df = pd.read_csv(reviews_csv)

# Preview the first rows
df.head()

## EDA & Preprocessing

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Keep only the columns used in the rest of the notebook
needed_columns = ['userName', 'content', 'score']
df = df[needed_columns]

df.head()

In [None]:
# tokenization
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

# Drop rows without review text, then remove exact duplicate rows
df = df.dropna(subset=['content']).drop_duplicates()

# English stop words. The list is kept under its original name; the set copy is
# what the filter uses, because membership is tested once per token and a set
# lookup avoids scanning the whole list each time.
stop_words = stopwords.words('english')
stop_word_set = set(stop_words)


def _clean_text(text):
    """Return ``text`` as a space-joined string of lowercased, filtered tokens.

    The text is tokenized with ``word_tokenize``; a token is kept only when it
    is purely alphabetic and its lowercase form is not an English stop word.
    """
    kept = []
    for word in word_tokenize(text):
        lowered = word.lower()
        if word.isalpha() and lowered not in stop_word_set:
            kept.append(lowered)
    # Text normalization: store the cleaned tokens back as one string
    return ' '.join(kept)


df['content'] = df['content'].apply(_clean_text)

df.head()

In [None]:
# stemming
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

# Stem every whitespace-separated token and rebuild each review as one string
df['content'] = (
    df['content']
    .str.split()
    .map(lambda tokens: ' '.join(map(stemmer.stem, tokens)))
)

df.head(5)

In [None]:
!pip install afinn

In [None]:
# labelling
from afinn import Afinn

afinn = Afinn()


def _label_sentiment(text):
    """Map the AFINN score of ``text`` to 'Positive', 'Negative' or 'Neutral'.

    The score is computed once per review: a score above zero is 'Positive',
    below zero is 'Negative', and exactly zero is 'Neutral'.
    """
    score = afinn.score(text)
    if score > 0:
        return 'Positive'
    if score < 0:
        return 'Negative'
    return 'Neutral'


# NOTE(review): 'content' has already been Porter-stemmed by the stemming cell.
# AFINN presumably matches whole word forms, so stemmed tokens may miss lexicon
# entries and be counted as neutral — consider scoring before stemming; confirm.
df['sentiment'] = df['content'].apply(_label_sentiment)

# Preview only the first rows, as the other cells do, instead of dumping the frame
df.head()

In [None]:
# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# Build TF-IDF features from the preprocessed review text.
# NOTE(review): the vectorizer is fitted on every review here, i.e. before any
# train/test split; features used for evaluation should come from a vectorizer
# fitted on the training portion only.
vectorizer = TfidfVectorizer()
review_texts = df['content']
X = vectorizer.fit_transform(review_texts)

### Sentiment Analysis

In [None]:
from wordcloud import WordCloud
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
from collections import Counter

#### WordCloud

In [None]:
# Select the reviews labelled Neutral and merge their text into a single string
df_netral = df.loc[df['sentiment'] == 'Neutral']
all_words_netral = ' '.join(df_netral['content'])
wordcloud_netral = WordCloud(
    width=500,
    height=300,
    max_font_size=110,
    random_state=21,
).generate(all_words_netral)

# Render the cloud on the current axes; axis ticks are left visible
ax = plt.gca()
ax.imshow(wordcloud_netral, interpolation="bilinear")
ax.set_title('Word Cloud of Neutral Sentiment')
plt.show()

In [None]:
# Select the reviews labelled Positive and merge their text into a single string
df_positif = df.loc[df['sentiment'] == 'Positive']
all_words_positif = ' '.join(df_positif['content'])
wordcloud_positif = WordCloud(
    width=500,
    height=300,
    max_font_size=110,
    random_state=21,
).generate(all_words_positif)

# Render the cloud on the current axes; axis ticks are left visible
ax = plt.gca()
ax.imshow(wordcloud_positif, interpolation="bilinear")
ax.set_title('Word Cloud of Positive Sentiment')
plt.show()

In [None]:
# Select the reviews labelled Negative and merge their text into a single string
df_negatif = df.loc[df['sentiment'] == 'Negative']
all_words_negatif = ' '.join(df_negatif['content'])
wordcloud_negatif = WordCloud(
    width=500,
    height=300,
    max_font_size=110,
    random_state=21,
).generate(all_words_negatif)

# Render the cloud on the current axes; axis ticks are left visible
ax = plt.gca()
ax.imshow(wordcloud_negatif, interpolation="bilinear")
ax.set_title('Word Cloud of Negative Sentiment')
plt.show()

#### Target Distribution

In [None]:
# Number of reviews (non-null 'content') per sentiment label, largest class first
temp = (
    df.groupby('sentiment')['content']
    .count()
    .reset_index()
    .sort_values(by='content', ascending=False)
)
temp.style.background_gradient(cmap='inferno_r')

In [None]:
# Bar chart of how many reviews fall into each sentiment class.
# A title and an explicit plt.show() are added so the figure stands on its own
# and the cell does not echo the Axes repr, matching the other plotting cells.
plt.figure(figsize=(12, 6))
sns.countplot(x='sentiment', data=df)
plt.title('Target Distribution')
plt.show()

In [None]:
# Funnel-area chart of the per-sentiment counts held in `temp`
funnel_title = {"position": "top center", "text": "Funnel-Chart of Target Distribution"}
funnel_trace = go.Funnelarea(
    text=temp['sentiment'],
    values=temp['content'],
    title=funnel_title,
)
fig = go.Figure(funnel_trace)
fig.show()

In [None]:
!pip install palettable
from palettable.colorbrewer.qualitative import Pastel1_7

In [None]:
# Count how often each token occurs across the Neutral reviews
unique_netral_words = (
    df_netral['content']
    .str.split()
    .explode()
    .value_counts()
    .rename_axis('words')
    .reset_index(name='count')
)

# NOTE: despite its name, this keeps the 12 most frequent words
top_20_words = unique_netral_words.head(12)

ax = plt.figure(figsize=(12, 6)).gca()
ax.pie(top_20_words['count'], labels=top_20_words['words'], colors=Pastel1_7.hex_colors)
# A white disc over the centre turns the pie into a donut
my_circle = plt.Circle((0, 0), 0.7, color='white')
ax.add_artist(my_circle)
ax.set_title('Donut Plot of Neutral Sentiment')
plt.show()

In [None]:
# Count how often each token occurs across the Positive reviews
unique_positif_words = (
    df_positif['content']
    .str.split()
    .explode()
    .value_counts()
    .rename_axis('words')
    .reset_index(name='count')
)

# NOTE: despite its name, this keeps the 12 most frequent words
top_20_words = unique_positif_words.head(12)

ax = plt.figure(figsize=(12, 6)).gca()
ax.pie(top_20_words['count'], labels=top_20_words['words'], colors=Pastel1_7.hex_colors)
# A white disc over the centre turns the pie into a donut
my_circle = plt.Circle((0, 0), 0.7, color='white')
ax.add_artist(my_circle)
ax.set_title('Donut Plot of Positive Sentiment')
plt.show()

In [None]:
# Count how often each token occurs across the Negative reviews
unique_negatif_words = (
    df_negatif['content']
    .str.split()
    .explode()
    .value_counts()
    .rename_axis('words')
    .reset_index(name='count')
)

# NOTE: despite its name, this keeps the 12 most frequent words
top_20_words = unique_negatif_words.head(12)

ax = plt.figure(figsize=(12, 6)).gca()
ax.pie(top_20_words['count'], labels=top_20_words['words'], colors=Pastel1_7.hex_colors)
# A white disc over the centre turns the pie into a donut
my_circle = plt.Circle((0, 0), 0.7, color='white')
ax.add_artist(my_circle)
ax.set_title('Donut Plot of Negative Sentiment')
plt.show()

#### SPLIT

In [None]:
# splitting
from sklearn.model_selection import train_test_split

# Split the review text (not the pre-computed TF-IDF matrix) so that the
# vocabulary and IDF weights are learned from the training portion only.
# Fitting TF-IDF on the full corpus before splitting leaks test-set statistics
# into the training features.
text_train, text_test, y_train, y_test = train_test_split(
    df['content'], df['sentiment'], test_size=0.2, random_state=42
)

# Re-fit the existing vectorizer on the training text, then apply the same
# fitted transformation to the held-out text.
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

X_train.shape, X_test.shape

#### Resampling target

Resampling of the target classes is used only for modelling; it is applied to the training split only.

In [None]:
from imblearn.over_sampling import SMOTE

# Resample the training split with SMOTE; X_train / y_train are replaced by
# their resampled versions.
# NOTE(review): this happens before the cross-validated search further down, so
# the CV folds will contain synthetic samples — consider resampling inside each
# fold instead; confirm whether that matters for this analysis.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Show the class counts of the resampled training labels
ax = plt.figure(figsize=(12, 6)).gca()
sns.countplot(x=y_train, ax=ax)
ax.set_title('Target Distribution for modelling')
plt.show()

## Model

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Hyperparameter candidates sampled by the SVM search
svm_param_grid = dict(
    C=[0.1, 1, 10],
    kernel=['linear', 'rbf', 'poly'],
    gamma=['scale', 'auto'],
)

### Support Vector Machine

In [None]:
# Randomized search over the SVM grid: 10 sampled settings, 5-fold CV, scored by accuracy
svm_model = RandomizedSearchCV(
    estimator=SVC(random_state=42),
    param_distributions=svm_param_grid,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    random_state=42,
)
svm_model.fit(X_train, y_train)

### Model Eval

In [None]:
# Report the hyperparameter combination selected by the search
best_svm_params = svm_model.best_params_
print("\nBest Parameters for Support Vector Machine:", best_svm_params)

In [None]:
from sklearn.metrics import classification_report

# Predict on the held-out split with the best estimator found by the search
best_svm = svm_model.best_estimator_
y_pred_svm = best_svm.predict(X_test)

# Per-class precision / recall / F1 on the test labels
svm_report = classification_report(y_test, y_pred_svm)
print("\n\nClassification Report for Support Vector Machine (Tuned):")
print(svm_report)

Thanks for reading my notebook :D