In [1]:
import re
import pandas as pd
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score, recall_score, precision_score

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
df = pd.read_excel('sample.xlsx')
df.head()

Unnamed: 0,comment,recommend
0,واقعا عالیه. من که ازش خیلی راضیم,\N
1,سلام، قبل اینکه نظرم رو بگم میخواستم به یک موض...,recommended
2,گیره های فلزی خیلی سخت تا میشوند و لذا حوله را...,not_recommended
3,همه چیز در رابطه با ظاهر این گوشی بسیار خوب اس...,no_idea
4,اگر ظرفیتش براتون کافیه حتما بخرید._x000D_\nیه...,no_idea


In [4]:
df.head()

Unnamed: 0,comment,recommend
0,واقعا عالیه. من که ازش خیلی راضیم,\N
1,سلام، قبل اینکه نظرم رو بگم میخواستم به یک موض...,recommended
2,گیره های فلزی خیلی سخت تا میشوند و لذا حوله را...,not_recommended
3,همه چیز در رابطه با ظاهر این گوشی بسیار خوب اس...,no_idea
4,اگر ظرفیتش براتون کافیه حتما بخرید._x000D_\nیه...,no_idea


In [5]:
df.columns = ['text', 'label']

In [6]:
df.shape

(9999, 2)

In [7]:
df.isnull().sum()

text     8
label    0
dtype: int64

In [8]:
df.dropna(subset=['text', 'label'], inplace=True)

In [9]:
df.shape

(9991, 2)

In [10]:
df['label'].value_counts()

recommended        3860
\N                 3505
not_recommended    1584
no_idea            1042
Name: label, dtype: int64

In [11]:
df = df[~df.label.isin(["\\N"])]

In [12]:
df.reset_index(drop=True, inplace=True)

In [13]:
def vectorize_tfidf(df_X):
    tf_idf_vectorizer = TfidfVectorizer(lowercase=False, 
                                        ngram_range=(1, 1),
                                        max_df=.65,
                                        min_df=8,
                                        binary=False,
                                        norm='l2',
                                        use_idf=True,
                                        smooth_idf=True,
                                        sublinear_tf=False)
    tf_idf_X = tf_idf_vectorizer.fit_transform(df_X)
    tf_idf_df = pd.DataFrame(data=tf_idf_X.toarray(),columns=[tf_idf_vectorizer.get_feature_names_out()])
    return tf_idf_df, tf_idf_vectorizer

In [14]:
def define_and_run_model(clf, X_train, X_test, y_train):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    return y_pred

In [15]:
def show_result(y_test,y_pred):
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred, digits=3))
    print(accuracy_score(y_test, y_pred))

In [16]:
possible_labels = df.label.unique()

In [17]:
label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index

In [18]:
df['label'] = df.label.replace(label_dict)

In [19]:
X_train, X_test, y_train, y_test = train_test_split(df.index.values,
                                                    df.label.values,
                                                    test_size=.2, 
                                                    random_state=170, 
                                                    stratify=df.label.values)

In [20]:
df['data_type'] = ['not_set']*df.shape[0]

In [21]:
df.loc[X_train, 'data_type'] = 'train'
df.loc[X_test, 'data_type'] = 'test'

In [22]:
df.groupby(['label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,text
label,data_type,Unnamed: 2_level_1
0,test,772
0,train,3088
1,test,317
1,train,1267
2,test,209
2,train,833


In [23]:
tf_idf_df, tf_idf_model = vectorize_tfidf(list(df['text']))

In [24]:
y_train = df.loc[X_train, 'label']
y_test = df.loc[X_test, 'label']

X_train = tf_idf_df.loc[X_train, :]
X_test = tf_idf_df.loc[X_test, :]

In [25]:
X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)

In [26]:
y_train.value_counts()

0    3088
1    1267
2     833
Name: label, dtype: int64

In [27]:
y_test.value_counts()

0    772
1    317
2    209
Name: label, dtype: int64

In [28]:
%%time
smodel = SVC(kernel='rbf')
pred_values_svm = define_and_run_model(smodel, X_train, X_test, y_train)
show_result(pred_values_svm, y_test)

[[735 108 160]
 [ 35 207  43]
 [  2   2   6]]
              precision    recall  f1-score   support

           0      0.952     0.733     0.828      1003
           1      0.653     0.726     0.688       285
           2      0.029     0.600     0.055        10

    accuracy                          0.730      1298
   macro avg      0.545     0.686     0.524      1298
weighted avg      0.879     0.730     0.791      1298

0.7303543913713405
Wall time: 1min 4s


In [29]:
%%time
lr_model = LogisticRegression()
pred_values_lr = define_and_run_model(lr_model, X_train, X_test, y_train)
show_result(pred_values_lr, y_test)

[[708  83 137]
 [ 43 213  46]
 [ 21  21  26]]
              precision    recall  f1-score   support

           0      0.917     0.763     0.833       928
           1      0.672     0.705     0.688       302
           2      0.124     0.382     0.188        68

    accuracy                          0.730      1298
   macro avg      0.571     0.617     0.570      1298
weighted avg      0.819     0.730     0.765      1298

0.7295839753466872
Wall time: 2.05 s


In [30]:
%%time
rf_model = RandomForestClassifier(n_estimators=100)
pred_values_rf = define_and_run_model(rf_model, X_train, X_test, y_train)
show_result(pred_values_rf, y_test)

[[731 130 156]
 [ 39 186  45]
 [  2   1   8]]
              precision    recall  f1-score   support

           0      0.947     0.719     0.817      1017
           1      0.587     0.689     0.634       270
           2      0.038     0.727     0.073        11

    accuracy                          0.713      1298
   macro avg      0.524     0.712     0.508      1298
weighted avg      0.864     0.713     0.773      1298

0.7126348228043143
Wall time: 10.6 s
