In [28]:
import pickle
import time

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

In [39]:
# Load the cleaned training set, then discard every row with a missing value.
reviews_raw = pd.read_csv("datasets/CleanedTrainingDataSetLOCAL.csv")
df = reviews_raw.dropna()

In [40]:
# Inputs come from the preprocessed text column, targets from the class column.
X, y = df['preprocessed_text'].values, df['class'].values

In [41]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [58]:
# Build TF-IDF feature matrices. The vectorizer is fitted on the training
# texts only; the test texts are transformed with that already-fitted state.
vectorizer = TfidfVectorizer()
train_vectors = vectorizer.fit_transform(raw_documents=X_train)
test_vectors = vectorizer.transform(raw_documents=X_test)

In [111]:
# Train and evaluate an SVM with an RBF kernel.
# NOTE: the `*_linear` variable names are historical -- this cell fits kernel='rbf'.
# probability=True enables predict_proba(), which a later cell calls on this
# classifier. random_state pins the data shuffling scikit-learn performs for
# those probability estimates, so repeated runs give the same probabilities.
classifier_linear = SVC(kernel='rbf', C=11.0, gamma='scale', probability=True, random_state=42)
t0 = time.time()
classifier_linear.fit(train_vectors, y_train)
t1 = time.time()
prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.time()
time_linear_train = t1 - t0
time_linear_predict = t2 - t1
# Report wall-clock timings and the per-class metrics on the held-out split.
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
print(classification_report(y_test, prediction_linear))
# Last expression of the cell: mean accuracy on the test split.
classifier_linear.score(test_vectors, y_test)

Training time: 49.684609s; Prediction time: 0.943238s
              precision    recall  f1-score   support

         NEG       0.86      0.90      0.88      1240
         POS       0.88      0.83      0.85      1044

    accuracy                           0.87      2284
   macro avg       0.87      0.87      0.87      2284
weighted avg       0.87      0.87      0.87      2284



0.8686514886164624

In [121]:
# Score a single review: print the probability (in percent) of class 'POS'.
# Requires `classifier_linear` to have been created with probability=True.
review = "احب"
review_vector = vectorizer.transform([review])  # transform expects an iterable of documents
# predict_proba columns follow classifier.classes_; look the 'POS' column up
# instead of hard-coding index 1 so the result does not depend on label order.
pos_index = list(classifier_linear.classes_).index('POS')
print(classifier_linear.predict_proba(review_vector)[0][pos_index] * 100)

93.86822671912853


In [47]:
# Perform classification with SVM, kernel=linear
classifier_linear = SVC(kernel='linear')
t0 = time.time()
classifier_linear.fit(train_vectors, y_train)
t1 = time.time()

In [48]:
prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.time()
time_linear_train = t1-t0
time_linear_predict = t2-t1
# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))

Training time: 8.053040s; Prediction time: 0.637164s


In [49]:
report = classification_report(y_test, prediction_linear, output_dict=True)
print('positive: ', report['POS'])
print('negative: ', report['NEG'])

positive:  {'precision': 0.882229232386961, 'recall': 0.803639846743295, 'f1-score': 0.8411027568922306, 'support': 1044}
negative:  {'precision': 0.846211552888222, 'recall': 0.9096774193548387, 'f1-score': 0.8767975126311697, 'support': 1240}


In [50]:
classifier_linear.score(test_vectors, y_test)

0.8612084063047285

In [None]:
# Second TF-IDF vectorizer, with the vocabulary size capped via max_features.
tfidf_options = {'max_features': 2500}
tf = TfidfVectorizer(**tfidf_options)

In [None]:
# Re-vectorize the text as a dense TF-IDF matrix, then rebuild the train/test
# split from it. Without the re-split, X_train/X_test would still hold the raw
# text produced by the earlier split, and the models fitted below would be
# handed strings instead of numeric features.
# The same test_size and random_state as the earlier split are used so both
# experiments are evaluated on the same held-out rows.
# NOTE(review): the vectorizer is fitted on the full corpus before splitting,
# so test-set vocabulary/IDF statistics leak into training -- consider fitting
# on the training texts only.
X = tf.fit_transform(df['preprocessed_text']).toarray()
y = df['class'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Multinomial Naive Bayes classifier with scikit-learn's default settings.
# Needs `from sklearn.naive_bayes import MultinomialNB` in the import cell;
# with that import commented out this line raises NameError.
modelnb = MultinomialNB()

In [None]:
# Fit the Naive Bayes model on the training split.
modelnb.fit(X=X_train, y=y_train)

In [None]:
# Mean accuracy of the Naive Bayes model on the held-out split (cell output).
nb_test_accuracy = modelnb.score(X_test, y_test)
nb_test_accuracy

In [None]:
# Per-class precision/recall/F1 for the Naive Bayes predictions on the test split.
y_pred = modelnb.predict(X_test)
nb_report = classification_report(y_test, y_pred)
print(nb_report)

In [None]:
# RBF-kernel SVM with explicitly spelled-out hyperparameters.
svc_params = dict(kernel='rbf', C=1.0, gamma='scale')
model = SVC(**svc_params)

In [None]:
# Fit the SVM on the training split.
model.fit(X=X_train, y=y_train)

In [None]:
# Mean accuracy of the SVM on the held-out split (cell output).
svc_test_accuracy = model.score(X_test, y_test)
svc_test_accuracy

In [None]:
# Per-class precision/recall/F1 for the SVM predictions on the test split.
y_pred = model.predict(X_test)
svc_report = classification_report(y_test, y_pred)
print(svc_report)

In [123]:
# Persist the trained classifier. The context manager closes (and flushes) the
# file handle even if pickling raises, instead of leaving it to the garbage collector.
# NOTE(review): the object saved is whatever `classifier_linear` is bound to
# when this cell runs -- confirm it is the probability-enabled model.
with open('models/classifier.sav', 'wb') as model_file:
    pickle.dump(classifier_linear, model_file)

In [124]:
# Persist the fitted TF-IDF vectorizer that produced the classifier's training
# features, so new text can be mapped into the same feature space at load time.
# The context manager closes the file handle even if pickling raises.
with open('models/vectorizer.sav', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)