In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import warnings


from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D

warnings.filterwarnings("ignore")

In [None]:
df = pd.read_csv("Musical_instruments_reviews.csv")
print(df.info())

In [None]:
#Verificando valores ausentes
df.isnull().sum()

print((df.isnull().sum() / len(df)) * 100)


Análise dos Valores Ausentes
    - reviewerName: 26.3% dos dados estão ausentes. Isso é uma quantidade significativa.
    - reviewText: 6.8% dos dados estão ausentes. Como essa coluna contém o texto das avaliações, é aconselhável remover essas entradas, pois são essenciais para sua análise.

In [None]:
#Tratando valores ausentes
df.dropna(subset=['reviewText'], inplace=True)
df.reset_index(drop=True, inplace=True)
df['reviewerName'].fillna('Unknown', inplace=True)

print(df.isnull().sum())


In [None]:
#Distribuição das Avaliações (overall)
plt.figure(figsize=(8, 3))
sns.countplot(data=df, x='overall')
plt.title('Distribuição das Avaliações (overall)')
plt.xlabel('Avaliação')
plt.ylabel('Frequência')
plt.show()

plt.figure(figsize=(5, 4))
sns.boxplot(data=df, y='overall')
plt.title('Boxplot de Avaliações (overall)')
plt.ylabel('Avaliação')
plt.show()


# Plotar a distribuição das avaliações ao longo do tempo
df['reviewDate'] = pd.to_datetime(df['unixReviewTime'], unit='s')
plt.figure(figsize=(12, 4))
df.set_index('reviewDate').resample('M')['overall'].mean().plot()
plt.title('Média das Avaliações ao Longo do Tempo')
plt.xlabel('Data')
plt.ylabel('Média da Avaliação')
plt.show()


In [None]:
df["Text"]  = df["summary"] + ". " + df["reviewText"] 

df.drop(["summary", "reviewText", "asin", "reviewerName", "reviewerID", "helpful", "unixReviewTime", "reviewTime"], axis=1, inplace=True)
df.head()

In [None]:
#Limpeza textual
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stopwords = stopwords.words('english')


In [8]:
import string
def clean_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = ' '.join([word for word in text.split() if word not in stopwords])
    return text

In [None]:
df['cleaned_review'] = df['Text'].apply(clean_text)
df.head()

In [None]:
def classify_rating(rating):
    if rating >= 4.0:
        return 'positive'
    elif rating == 3.0:
        return 'neutral'
    else:
        return 'negative'

df['sentiment'] = df['overall'].apply(classify_rating)

df.head()

In [11]:
X = df["cleaned_review"]
y = df["sentiment"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


vectorizer = TfidfVectorizer()

X_train_vectorized = vectorizer.fit_transform(X_train).toarray()
X_test_vectorized = vectorizer.transform(X_test).toarray()

In [12]:
#Lista de Modelos
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "SVM": SVC()
} 
results = {}


In [13]:
for model_name, model in models.items():
    model.fit(X_train_vectorized, y_train)
    y_pred = model.predict(X_test_vectorized)
    accuracy = accuracy_score(y_test, y_pred)
    results[model_name] = {
        "accuracy": accuracy,
        "classification_report": classification_report(y_test, y_pred, output_dict=True)
    }

In [None]:
for model_name, result in results.items():
    print(f"{model_name}:")
    print(f"Accuracy: {result['accuracy']:.4f}")
    print("Classification Report:")
    print(result['classification_report'])
    print("\n")

# Análise do Modelo
Melhor Acurácia:
A Regressão Logística teve a melhor acurácia (0.8879) entre os modelos testados.
Desempenho em Classes:
    - Classe Positiva: A maioria dos modelos apresenta bom desempenho nessa classe, especialmente a Regressão Logística e o Gradient Boosting, com F1-scores acima de 0.94.
    - Classe Neutra: Todos os modelos têm um desempenho fraco nesta classe, especialmente o SVM, que teve um F1-score de 0.0145.
    - Classe Negativa: O desempenho é consistentemente baixo para todos os modelos, especialmente o SVM, que não conseguiu prever essa classe.

In [115]:
# Removendo classe neutral
def classify_rating(rating):
    if rating >= 4.0:
        return 1
    else:
        return 0

df['sentiment'] = df['overall'].apply(classify_rating)

In [116]:
X = df["cleaned_review"]
y = df["sentiment"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


vectorizer = TfidfVectorizer()

X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [117]:
results = {}
for model_name, model in models.items():
    model.fit(X_train_vectorized, y_train)
    y_pred = model.predict(X_test_vectorized)
    accuracy = accuracy_score(y_test, y_pred)
    results[model_name] = {
        "accuracy": accuracy,
        "classification_report": classification_report(y_test, y_pred, output_dict=True)
    }

In [None]:
for model_name, result in results.items():
    print(f"{model_name}:")
    print(f"Accuracy: {result['accuracy']:.4f}")
    print("Classification Report:")
    print(result['classification_report'])
    print("\n")

In [None]:
!pip install tensorflow
!pip install transformers


In [119]:
# Tokenização
max_words = 5000
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Padding das sequências
max_length = 100
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_length)

In [None]:
# Dividir os dados em características (X) e rótulos (y)
X = df['cleaned_review']
y = df['sentiment']

# Dividir em conjuntos de treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f'Treinamento: {len(X_train)}, Teste: {len(X_test)}')


In [None]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, SpatialDropout1D

# Definindo o modelo LSTM
model_lstm = Sequential()
model_lstm.add(Embedding(max_words, 100, input_length=max_length))
model_lstm.add(SpatialDropout1D(0.2))
model_lstm.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model_lstm.add(Dense(1, activation='sigmoid'))  # Para 2 classes: positiva e negativa

model_lstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Treinando o modelo
model_lstm.fit(X_train_pad, y_train, epochs=5, batch_size=64, validation_data=(X_test_pad, y_test))


In [None]:
from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D

# Definindo o modelo CNN
model_cnn = Sequential()
model_cnn.add(Embedding(max_words, 100, input_length=max_length))
model_cnn.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
model_cnn.add(MaxPooling1D(pool_size=2))
model_cnn.add(GlobalMaxPooling1D())
model_cnn.add(Dense(1, activation='sigmoid'))

model_cnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Treinando o modelo
model_cnn.fit(X_train_pad, y_train, epochs=5, batch_size=64, validation_data=(X_test_pad, y_test))


In [None]:
!pip install tf-keras


In [None]:
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf

# Carregar o tokenizer BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenizar e padronizar os dados
train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=max_length)
test_encodings = tokenizer(list(X_test), truncation=True, padding=True, max_length=max_length)

# Criar datasets
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    y_train
)).shuffle(1000).batch(16)

test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    y_test
)).batch(16)

# Definindo o modelo BERT
model_bert = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')

# Compilar o modelo
model_bert.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5), 
                   loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
                   metrics=['accuracy'])

# Treinando o modelo
model_bert.fit(train_dataset, epochs=3, validation_data=test_dataset)


In [None]:
from sklearn.metrics import classification_report

# Avaliar LSTM
lstm_predictions = (model_lstm.predict(X_test_pad) > 0.5).astype("int32")
print(classification_report(y_test, lstm_predictions))

# Avaliar CNN
cnn_predictions = (model_cnn.predict(X_test_pad) > 0.5).astype("int32")
print(classification_report(y_test, cnn_predictions))

# Avaliar BERT
bert_predictions = model_bert.predict(test_dataset)
bert_predictions = tf.argmax(bert_predictions.logits, axis=1).numpy()
print(classification_report(y_test, bert_predictions))
