<a href="https://colab.research.google.com/github/miracle65536/Sentiment-Classification/blob/main/sentiment_analysis(SVM_Three_Class_Classification).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re
import pandas as pd
import numpy as np
from sklearn.calibration import LabelEncoder
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
import torch
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from mlxtend.plotting import plot_decision_regions
from sklearn.decomposition import PCA

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"device:{device}")

device:cuda


In [None]:
# Input locations: the labelled comment spreadsheet and the local BERT checkpoint.
BERT_PATH = '/content/drive/MyDrive/bert-base-chinese'
df = pd.read_excel('/content/drive/MyDrive/comments_3.xlsx')

# Tokenizer is loaded from the same checkpoint directory as the encoder.
tokenizer = BertTokenizer.from_pretrained(BERT_PATH)

# Optional vocabulary extension, currently disabled:
# new_tokens = ['c罗', 'b费', 'b席', 'big6']
# num_added_toks = tokenizer.add_tokens(new_tokens)

def clean_punc(df):
    """Strip punctuation and symbols from ``comm_cont`` and drop empty comments.

    Args:
        df: DataFrame with a ``comm_cont`` column holding the comment text.

    Returns:
        A new DataFrame containing only the rows whose comment is non-null and
        still non-empty after cleaning. The input frame is left unmodified.
    """
    # ``\W`` and ``_`` cover ordinary punctuation and all whitespace. The other
    # entries are numeric symbols that count as word characters and therefore
    # have to be listed explicitly: circled / dotted / parenthesised digits,
    # the one-half fraction, subscript digits and superscript digits.
    # The superscripts are enumerated one by one: a "²-⁹" range would span
    # U+00B2..U+2079 and silently delete accented Latin, Greek, Cyrillic and
    # many other letters.
    pattern = re.compile(
        r'[\W_\u2460-\u2469⒈-⒏\u00BD₀-₉⑴-⑸\u00B2\u00B3\u00B9\u2070\u2074-\u2079]'
    )

    def remove_punctuation(text):
        return pattern.sub('', text)

    # Drop missing comments BEFORE casting to str; otherwise NaN turns into the
    # literal text "nan" and survives every later filter.
    df = df[df['comm_cont'].notnull()].copy()
    df['comm_cont'] = df['comm_cont'].astype(str).apply(remove_punctuation)
    # Whitespace has already been removed by ``\W``, so the empty string is the
    # only blank value left to filter out.
    df = df[df['comm_cont'] != '']
    return df


# Clean the comment text, then report how many rows survive.
df = clean_punc(df)
print(len(df))

# Keep only rows that carry both a comment and a sentiment label.
df = df.dropna(how='any', subset=['comm_cont', 'comm_sent'])
print(len(df))

# Class balance of the three sentiment labels (positive / negative / neutral).
label_counts = df['comm_sent'].value_counts()
positive_count, negative_count, neutral_count = (
    label_counts[name] for name in ('正向', '负向', '中性')
)
# ratio = positive_count / negative_count

print("pos:", positive_count)
print("neg:", negative_count)
print("neu:", neutral_count)
# print("pos:neg", ratio)

# Encode the string labels as integer class ids.
label_encoder = LabelEncoder()
df['comm_sent_encoded'] = label_encoder.fit_transform(df['comm_sent'])

# Lookup table: original label -> encoded integer.
label_mapping = dict(
    zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))
)

for label, encoded_label in label_mapping.items():
    print(f"Label: {label}  Encoded Label: {encoded_label}")

class MyDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding ``(text, label)`` pairs from a DataFrame.

    Args:
        df: DataFrame with a ``comm_cont`` column (comment text) and a
            ``comm_sent_encoded`` column (encoded label).
    """

    def __init__(self, df):
        self.texts = df['comm_cont']
        self.labels = df['comm_sent_encoded']

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the sample at position ``idx`` as ``(text, label)``."""
        # Positional access: samplers hand out positions 0..len-1, whereas a
        # plain ``series[idx]`` is a label lookup that raises KeyError (or
        # returns the wrong row) whenever the frame's index is not a fresh
        # 0..n-1 range, e.g. right after filtering or splitting.
        text = self.texts.iloc[idx]
        label = self.labels.iloc[idx]

        return text, label


# Hold out 10% of the rows for validation; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified even though the classes are imbalanced.
train_df, val_df = train_test_split(df, random_state=42, test_size=0.1)

# Renumber both frames from 0 so the datasets can be indexed by position.
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
print("training set", len(train_df))
print("validation set", len(val_df))

train_dataset = MyDataset(train_df)
val_dataset = MyDataset(val_df)


# Preview the first training sample as a sanity check.
i = 0
text, label = train_dataset[i]
print(f"样例 {i+1}:")
print("Text:", text)
print("Label:", label)

def collate_fn(data):
    """Tokenize a batch of ``(text, label)`` samples.

    Args:
        data: Sequence of samples; element 0 of each sample is the text and
            element 1 is its label.

    Returns:
        Tuple ``(input_ids, attention_mask, token_type_ids, labels)``. The
        three tensors are moved to ``device``; ``labels`` stays a plain list.
    """
    sents = []
    labels = []
    for sample in data:
        sents.append(sample[0])
        labels.append(sample[1])

    # Every sequence is truncated / padded to exactly 256 tokens.
    batch = tokenizer(
        sents,
        max_length=256,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )
    input_ids, attention_mask, token_type_ids = (
        batch[key].to(device)
        for key in ('input_ids', 'attention_mask', 'token_type_ids')
    )

    return input_ids, attention_mask, token_type_ids, labels

def get_loader(dataset, batch_size=1, shuffle=True, drop_last=True):
    """Wrap ``dataset`` in a DataLoader that batches through ``collate_fn``.

    The defaults reproduce the previously hard-coded configuration (one sample
    per batch, shuffled, incomplete last batch dropped), so existing
    ``get_loader(dataset)`` calls behave exactly as before.

    Args:
        dataset: Dataset whose items ``collate_fn`` can consume.
        batch_size: Number of samples per batch.
        shuffle: Whether to visit the samples in random order.
        drop_last: Whether to discard a final batch smaller than
            ``batch_size``. Pass ``False`` when every sample must be seen,
            e.g. for feature extraction with ``batch_size > 1``.

    Returns:
        The configured ``DataLoader``.
    """
    return DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        collate_fn=collate_fn,
        shuffle=shuffle,
        drop_last=drop_last,
    )

train_loader = get_loader(train_dataset)
valid_loader = get_loader(val_dataset)

# Fetch a single batch to inspect the tensor shapes and the label payload.
i, (input_ids, attention_mask, token_type_ids, labels) = next(enumerate(train_loader))
print(input_ids.shape, attention_mask.shape, token_type_ids.shape, labels)

# Number of batches in one pass over the training loader.
print(len(train_loader))

2013
2010
pos: 629
neg: 1299
neu: 82
Label: 中性  Encoded Label: 0
Label: 正向  Encoded Label: 1
Label: 负向  Encoded Label: 2
training set 1809
validation set 201
样例 1:
Text: 库里还是神勇
Label: 1
torch.Size([1, 256]) torch.Size([1, 256]) torch.Size([1, 256]) [2]
1809


In [None]:
class BertCSLModel(nn.Module):
    """Thin wrapper around a pretrained BERT encoder that returns its pooled output."""

    def __init__(self):
        super().__init__()
        # Load the checkpoint from BERT_PATH and place it on the active device.
        self.bert = BertModel.from_pretrained(BERT_PATH).to(device)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Encode one tokenized batch and return the encoder's ``pooler_output``."""
        encoded = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        return encoded.pooler_output


# Build the encoder wrapper and move it to the active device.
model = BertCSLModel().to(device)

# Smoke test on the batch fetched earlier: print the shape of the pooled output.
print(
    model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
    ).shape
)


torch.Size([1, 768])


In [None]:
def _extract_features(loader):
    """Run every batch of ``loader`` through ``model`` and collect the outputs.

    Args:
        loader: Iterable yielding
            ``(input_ids, attention_mask, token_type_ids, labels)`` batches.

    Returns:
        ``(features, labels)`` as NumPy arrays; row ``i`` of ``features`` is
        the model output for the sample whose label is ``labels[i]``.
    """
    features, targets = [], []
    # Inference only: disable autograd instead of building a graph for every
    # forward pass and detaching the result afterwards.
    with torch.no_grad():
        for input_ids, attention_mask, token_type_ids, labels in loader:
            output = model(input_ids, attention_mask, token_type_ids)
            features.extend(output.cpu().numpy())
            targets.extend(labels)
    return np.array(features), np.array(targets)


# Put the model in eval mode explicitly so the extracted features never depend
# on dropout, instead of relying on how the checkpoint happened to be loaded.
model.eval()

train_features, train_labels = _extract_features(train_loader)
val_features, val_labels = _extract_features(valid_loader)

print("feature_shape:{}".format(train_features.shape))
print(val_features.shape)
print("labels_shape:{}".format(train_labels.shape))


feature_shape:(1809, 768)
(201, 768)
labels_shape:(1809,)


In [None]:
# Full feature matrix / label vector (train + validation), kept for inspection
# and for experiments that need all of the data at once.
combined_features = np.vstack((train_features, val_features))
combined_labels = np.hstack((train_labels, val_labels))

print(combined_features.shape)
print(combined_labels.shape)

# RBF-SVM hyper-parameter grid: regularisation strength C and kernel width gamma.
param_grid = {'C': [0.1, 1, 10, 100, 1000], 'gamma': [1, 0.1, 0.01, 0.001, 0.0001, 0.00001]}

svm_model = SVC(kernel='rbf')
grid_search = GridSearchCV(estimator=svm_model, param_grid=param_grid, refit=True, verbose=3)

# Tune on the training split only. Fitting on train + validation and then
# scoring on that same data reports training performance and leaves no unseen
# samples to estimate generalisation.
grid_search.fit(train_features, train_labels)

best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
print(best_model)
print(best_params)

# Training-split report (optimistic by construction; shown for reference).
# zero_division=0 keeps the usual 0.0 score for never-predicted classes without
# emitting a warning for each of them.
predictions = best_model.predict(train_features)
train_report = classification_report(train_labels, predictions, zero_division=0)
print(train_report)

# Held-out report: the validation split was not seen during tuning or refit.
val_predictions = best_model.predict(val_features)
val_report = classification_report(val_labels, val_predictions, zero_division=0)
print(val_report)

# combined_features = np.vstack((train_features, val_features))
# combined_labels = np.hstack((train_labels, val_labels))

# pca = PCA(n_components=2)
# combined_features_reduced = pca.fit_transform(combined_features)

# param_grid = {'C': [0.1, 1, 10, 100, 1000], 'gamma': [1, 0.1, 0.01, 0.001, 0.0001, 0.00001]}
# svm_model = SVC(kernel='rbf')
# grid_search = GridSearchCV(estimator=svm_model, param_grid=param_grid, refit=True, verbose=3)
# grid_search.fit(combined_features_reduced, combined_labels)

# best_model = grid_search.best_estimator_
# best_params = grid_search.best_params_
# print("Best model:", best_model)
# print("Best parameters:", best_params)

# plot_decision_regions(combined_features_reduced, combined_labels, clf=best_model, legend=2)

# plt.title('Decision Boundary')

# plt.show()

(2010, 768)
(2010,)
Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV 1/5] END ....................C=0.1, gamma=1;, score=0.647 total time=   1.5s
[CV 2/5] END ....................C=0.1, gamma=1;, score=0.647 total time=   1.6s
[CV 3/5] END ....................C=0.1, gamma=1;, score=0.647 total time=   1.5s
[CV 4/5] END ....................C=0.1, gamma=1;, score=0.647 total time=   1.5s
[CV 5/5] END ....................C=0.1, gamma=1;, score=0.644 total time=   1.6s
[CV 1/5] END ..................C=0.1, gamma=0.1;, score=0.647 total time=   1.8s
[CV 2/5] END ..................C=0.1, gamma=0.1;, score=0.647 total time=   2.9s
[CV 3/5] END ..................C=0.1, gamma=0.1;, score=0.647 total time=   1.5s
[CV 4/5] END ..................C=0.1, gamma=0.1;, score=0.647 total time=   1.5s
[CV 5/5] END ..................C=0.1, gamma=0.1;, score=0.644 total time=   1.5s
[CV 1/5] END .................C=0.1, gamma=0.01;, score=0.647 total time=   1.0s
[CV 2/5] END ..............

# New Section