<a href="https://colab.research.google.com/github/lyloc-logtech/ai-code/blob/main/nhandiencamxuccaunoi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt

In [6]:
train_dataset=pd.read_csv("/content/train_sentence.csv")
test_dataset=pd.read_csv("/content/test_sentence.csv")

In [7]:
# Tách dữ liệu và nhãn từ tập huấn luyện và kiểm tra
test_data = test_dataset['Data'].to_list()
test_label = test_dataset['Label']
train_data = train_dataset['Data'].to_list()
train_label = train_dataset['Label']

In [8]:
# In số lượng mẫu theo từng nhãn trong tập train
print(train_dataset.groupby('Label').size())

Label
0    15000
1    15000
dtype: int64


In [9]:
# Đọc danh sách stopwords tiếng Việt từ file
with open("/content/drive/MyDrive/vietnamese-stopwords.txt") as f:
    stop_words = f.readlines()
stop_words = [x.strip() for x in stop_words]

In [10]:
# Loại bỏ stopword khỏi dữ liệu huấn luyện
for idx, text in enumerate(train_data):
    train_data[idx] = train_data[idx].strip().replace("\n", "")
    for sw in stop_words:
        if " " + sw + " " in text:
            train_data[idx] = train_data[idx].replace(sw, "")


In [11]:
# Khởi tạo vectorizer TF-IDF
vectorizer = TfidfVectorizer(max_df=0.8, min_df=0.01)

In [12]:
vectorizer.fit(train_data)

In [13]:
vectorizer_train = vectorizer.transform(train_data)
vectorizer_test = vectorizer.transform(test_data)

In [14]:
LR_model = LogisticRegression()

In [15]:
from sklearn.preprocessing import MinMaxScaler,scale,RobustScaler

In [17]:
mm_scaler=MinMaxScaler()
rb_scaler=RobustScaler()

In [18]:
mm_scaler.fit(vectorizer_train.toarray())
rb_scaler.fit(vectorizer_train.toarray())

In [33]:
def get_score(clf,X_train,X_test,y_train,y_test):
    clf.fit(X_train,y_train)
    print(clf.score(X_test,y_test))


In [19]:
train_mmscaled = mm_scaler.transform(vectorizer_train.toarray())
train_rbscaled = rb_scaler.transform(vectorizer_train.toarray())

In [20]:
x_train, x_val, y_train, y_val = train_test_split(vectorizer_train, train_label, test_size=0.2)

x_train_rbscaled, x_val_rbscaled, y_train_rbscaled, y_val_rbscaled = train_test_split(
    train_rbscaled, train_label, test_size=0.2
)

x_train_mmscaled, x_val_mmscaled, y_train_mmscaled, y_val_mmscaled = train_test_split(
    train_mmscaled, train_label, test_size=0.2
)

In [21]:

get_score(LR_model, x_train, x_val, y_train, y_val)
get_score(LR_model, x_train_mmscaled, x_val_mmscaled, y_train_mmscaled, y_val_mmscaled)
get_score(LR_model, x_train_rbscaled, x_val_rbscaled, y_train_rbscaled, y_val_rbscaled)

1.0
1.0
1.0


In [22]:
l1_param_grid = {'penalty': ['l1'], 'solver': ['liblinear', 'saga'], 'C': np.logspace(-3, 3, 7)}
l2_param_grid = {'penalty': ['l2'], 'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'], 'C': np.logspace(-3, 3, 7)}

In [23]:
l1_grid_model = GridSearchCV(LogisticRegression(), l1_param_grid)
l2_grid_model = GridSearchCV(LogisticRegression(), l2_param_grid)


In [24]:

l1_grid_model.fit(x_train, y_train)
l2_grid_model.fit(x_train, y_train)



In [32]:
print(l1_grid_model.best_estimator_)
print(l1_grid_model.best_score_)
print(l2_grid_model.best_estimator_)
print(l2_grid_model.best_score_)

LogisticRegression(C=np.float64(0.01), penalty='l1', solver='liblinear')
1.0
LogisticRegression(C=np.float64(0.001), solver='newton-cg')
1.0


In [26]:
LR_model.set_params(penalty="l1", solver="saga", C=0.1)

In [27]:
LR_model.fit(vectorizer_train, train_label)

In [28]:
from sklearn.metrics import classification_report

In [29]:
y_pred=LR_model.predict(vectorizer_test)
print(classification_report(test_label,y_pred))

              precision    recall  f1-score   support

           0       0.83      1.00      0.91      1000
           1       1.00      0.80      0.89      1000

    accuracy                           0.90      2000
   macro avg       0.92      0.90      0.90      2000
weighted avg       0.92      0.90      0.90      2000



In [30]:
test =[ "tôi rất thất vọng với chất lượng món ăn "]
x=vectorizer.transform(test)
print(LR_model.predict(x))
print(LR_model.predict_proba(x))

[0]
[[0.92194372 0.07805628]]


In [34]:
test =[ "phục vụ rất tốt "]
x=vectorizer.transform(test)
print(LR_model.predict(x))
print(LR_model.predict_proba(x))

[1]
[[0.0100362 0.9899638]]
