In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame
import itertools
import csv
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.isri import ISRIStemmer
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble.forest import RandomForestClassifier
from sklearn.model_selection import KFold

In [2]:
files = ['HTL.csv']

In [3]:
def prepareDatasetes(files):
    sentences = []
    
    for file_name in files:
        file = open(file_name,"rt", encoding="utf-8")
        reviews = csv.reader(file)
        for text,polarity in reviews:
            sentences.append([text,polarity])
        
        df = DataFrame(sentences, columns = ['text','polarity'])
        file.close()
        return df

In [4]:
reviews = prepareDatasetes(files)

In [5]:
import mysql.connector
from mysql.connector import errorcode, connect

In [6]:
def connectMysql():
    try:
        cnn = connect(
        user ='root',
            password='',
            host='localhost',
            database='test')
        return cnn
    except mysql.connector.Error as e:
        if e.errno == errorcode.ER_ACCESS_DENIED_ERROR:
            print ("Access denied")
        elif e.errno == errorcode.ER_BAD_DB_ERROR:
            print ("Database doesn't exist")
        else:
            print [e]

In [7]:
def createTable(sql):
    try:
        cnn = connectMysql()
        cursor = cnn.cursor()
        cursor.execute(sql)
    except:
        cnn.rollback()
    cnn.close()  

In [8]:
sql = "DROP TABLE IF EXISTS REVIEWS; CREATE TABLE REVIEWS (TEXT TEXT NOT NULL, POLARITY INT(10))"
createTable(sql)

In [9]:
def insertData(text, polarity):
    try:
        cnn = connectMysql()
        cursor = cnn.cursor()
        cursor.execute("INSERT INTO `REVIEWS` VALUES ('%s', '%s')" % (text, polarity))
        cursor.commit()
    except:
        cnn.rollback()
    cnn.close()

In [10]:
#reviews

In [11]:
for index, (text, polarity) in reviews.iterrows():
    insertData(text, polarity)

In [12]:
def getData(sql):
    cnn = connectMysql()
    cursor = cnn.cursor()
    cursor.execute(sql)
    reviews = DataFrame(cursor.fetchall())
    reviews.rename(columns={0:'text', 1:'label'}, inplace=True)
    cursor.close()
    cnn.close()
    return reviews

In [13]:
sql = "SELECT * FROM `REVIEWS`"
reviews = getData(sql)
len(reviews)

13358

In [14]:
reviews

Unnamed: 0,text,label
0,المكان الذي يمكنك فيه مراجعة الذات والتفكر هو ...,1
1,أسوأ فندق أقمت فيه على الإطلاق \nيستغرق تسجيل...,-1
2,فندق جميل مع سوء الإدارة والخدمات. \nمن الخار...,-1
3,فندق جميل، منظر رائع من بركة السباحة (على السط...,1
4,مكان لطيف وهادئ بعيدًا عن الزحام \nعثرت على ه...,1
5,"راقي و كلاسيكي"" \nفندق كبير مصمم بفن الديكور ...",1
6,تحذير للرجال الذي يحبون تناول بعض البيرة ليلاً...,1
7,فندق رائع \nبعد 4 أيام قضيناها في ليسبوا، وصل...,1
8,ساحر \nإذا كنت تتوق إلى تجربة فريدة، عليك بتج...,1
9,هناك بعض الإحباطات ولكن الغرف كانت رائعة والمو...,1


In [15]:
def normalize(text):
    text = re.sub("[إأٱآا]", "ا", text)
    text = re.sub("ى", "ي", text)
    text = re.sub("ؤ", "ء", text)
    text = re.sub("ئ", "ء", text)
    text = re.sub("[a-zA-Z0-9_]",'',text)
    noise = re.compile(""" ّ    | # Tashdid
                             َ    | # Fatha
                             ً    | # Tanwin Fath
                             ُ    | # Damma
                             ٌ    | # Tanwin Damm
                             ِ    | # Kasra
                             ٍ    | # Tanwin Kasr
                             ْ    | # Sukun
                             ـ     # Tatwil/Kashida
                         """, re.VERBOSE)
    text = re.sub(noise, '', text)
    return(text)

In [16]:
normalize("9 hh بسم الله الرحمن الرحيم")

'  بسم الله الرحمن الرحيم'

In [17]:
def stopWordRomve(text):
    ar_stop_list = open("list.txt","r",encoding="utf8")
    stop_words = ar_stop_list.read().split('\n')
    needed_words = []
    words = word_tokenize(text)
    for w in words:
        if w not in (stop_words):
            needed_words.append(w)
    filtered_sentence = " ".join(needed_words)
    ar_stop_list.close()
    return filtered_sentence

In [18]:
stopWordRomve("بسم الله الرحمن الرحيم")

'بسم الله الرحمن الرحيم'

In [19]:
def stemming(text):
    st = ISRIStemmer()
    stemmed_words = []
    words = word_tokenize(text)
    for w in words:
        stemmed_words.append(st.stem(w))
    stemmed_sentence = " ".join(stemmed_words)
    return stemmed_sentence

In [20]:
print (stemming("بسم الله الرحمن الرحيم"))

بسم الل رحم رحم


In [21]:
def prepareDataSets(reviews):
    sentences = []
    for index, r in reviews.iterrows():
        text = stopWordRomve(r['text'])
        text = normalize(r['text'])
        text = stemming(r['text'])
        if r['label'] == -1:
            sentences.append([text,'neg'])
        elif r['label'] == 1:
            sentences.append([text,'pos'])
        
    def_sentences = DataFrame(sentences, columns=['text','label'])
    return def_sentences

In [22]:
preprocessed_reviews = prepareDataSets(reviews)
preprocessed_reviews

Unnamed: 0,text,label
0,كان الذي يمك فيه رجع ذات فكر هو كوكروبيت، غنا ...,pos
1,اسأ ندق اقم فيه على طلق غرق سجل وصل حلي 30 دقي...,neg
2,ندق جمل مع سوء درة خدم . من خرج بدا مبشرا، لكن...,neg
3,ندق جميل، نظر رئع من برك سبح ( على سطح ) ! ! !...,pos
4,كان لطف هدئ بعد عن زحم عثر على هذا كان عن طرق ...,pos
5,رقي و كلس '' ندق كبر صمم بفن ديكور جمل ارت ديك...,pos
6,حذر رجل الذي يحب نول بعض بير ليل ! شيء وحد اود...,pos
7,ندق رئع بعد 4 ايم قضن في بو، وصل اخر الى نزل ....,pos
8,سحر اذا كنت تتق الى جرب ريدة، علك جرب ندق است ...,pos
9,هناك بعض حبط ولكن غرف كانت رئع وقع لا يمكن نفس...,pos


In [23]:
def featureExtraction(data):
    vectorizer = TfidfVectorizer(min_df=10, max_df=0.75, ngram_range=(1,3))
    tfidf_data = vectorizer.fit_transform(data)
    return tfidf_data

In [24]:
def learning(clf, x, y):
      
    x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(x, y, test_size=.2, random_state=42)
   
    classifer = clf()
    classifer.fit(x_train, y_train)
    predict = sklearn.model_selection.cross_val_predict(classifer, x_test, y_test, cv=10)
    
    scores = sklearn.model_selection.cross_val_score(classifer, x_test, y_test, cv=10)
    print (scores)
    print ("Accurecy of %s: %0.2f (+/- %0.2f)" % (classifer, scores.mean(), scores.std() *2))
    print (classification_report(y_test, predict))
    

In [25]:
def main(clf):
    reviews = getData("SELECT * FROM `REVIEWS`")
    preprocessed_reviews = prepareDataSets(reviews)
    data, target = preprocessed_reviews['text'], preprocessed_reviews['label']
    tfidf_data = featureExtraction(data)
    learning(clf,tfidf_data, target)
    

In [26]:
#clfs = [MultinomialNB, BernoulliNB, LogisticRegression, SGDClassifier, SVC, LinearSVC, DecisionTreeClassifier, RandomForestClassifier]

In [27]:
clfs = [MultinomialNB]

In [28]:
import warnings
warnings.filterwarnings('ignore', message='the matrix subclass is not the recommended way')

with warnings.catch_warnings():
    warnings.filterwarnings("ignore")

from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)


In [29]:
for clf in clfs:
    main(clf)

[0.80597015 0.80597015 0.80597015 0.80898876 0.80898876 0.80898876
 0.8164794  0.80898876 0.80898876 0.81203008]
Accurecy of MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True): 0.81 (+/- 0.01)
              precision    recall  f1-score   support

         neg       1.00      0.01      0.01       513
         pos       0.81      1.00      0.89      2159

   micro avg       0.81      0.81      0.81      2672
   macro avg       0.90      0.50      0.45      2672
weighted avg       0.85      0.81      0.72      2672



In [30]:
clfs = [BernoulliNB]
for clf in clfs:
    main(clf)

[0.86567164 0.85820896 0.87313433 0.83146067 0.82397004 0.8576779
 0.86516854 0.85018727 0.83146067 0.84586466]
Accurecy of BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True): 0.85 (+/- 0.03)
              precision    recall  f1-score   support

         neg       0.63      0.52      0.57       513
         pos       0.89      0.93      0.91      2159

   micro avg       0.85      0.85      0.85      2672
   macro avg       0.76      0.73      0.74      2672
weighted avg       0.84      0.85      0.84      2672



In [31]:
clfs = [LogisticRegression]
for clf in clfs:
    main(clf)

[0.87313433 0.86940299 0.87686567 0.87265918 0.88014981 0.88014981
 0.8988764  0.90262172 0.8576779  0.86090226]
Accurecy of LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False): 0.88 (+/- 0.03)
              precision    recall  f1-score   support

         neg       0.99      0.36      0.53       513
         pos       0.87      1.00      0.93      2159

   micro avg       0.88      0.88      0.88      2672
   macro avg       0.93      0.68      0.73      2672
weighted avg       0.89      0.88      0.85      2672



In [32]:
clfs = [SGDClassifier]
for clf in clfs:
    main(clf)

[0.95522388 0.92537313 0.95149254 0.93258427 0.93258427 0.9588015
 0.94756554 0.94007491 0.92509363 0.94736842]
Accurecy of SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=None,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False): 0.94 (+/- 0.02)
              precision    recall  f1-score   support

         neg       0.91      0.76      0.83       513
         pos       0.94      0.98      0.96      2159

   micro avg       0.94      0.94      0.94      2672
   macro avg       0.93      0.87      0.90      2672
weighted avg       0.94      0.94      0.94      2672



In [33]:
clfs = [SVC]
for clf in clfs:
    main(clf)

[0.80597015 0.80597015 0.80597015 0.80898876 0.80898876 0.80898876
 0.80898876 0.80898876 0.80898876 0.80827068]
Accurecy of SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False): 0.81 (+/- 0.00)
              precision    recall  f1-score   support

         neg       0.00      0.00      0.00       513
         pos       0.81      1.00      0.89      2159

   micro avg       0.81      0.81      0.81      2672
   macro avg       0.40      0.50      0.45      2672
weighted avg       0.65      0.81      0.72      2672



  'precision', 'predicted', average, warn_for)


In [34]:
clfs = [LinearSVC]
for clf in clfs:
    main(clf)

[0.94029851 0.92910448 0.95522388 0.94007491 0.92883895 0.95505618
 0.94382022 0.93632959 0.93258427 0.94360902]
Accurecy of LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0): 0.94 (+/- 0.02)
              precision    recall  f1-score   support

         neg       0.93      0.74      0.83       513
         pos       0.94      0.99      0.96      2159

   micro avg       0.94      0.94      0.94      2672
   macro avg       0.94      0.87      0.90      2672
weighted avg       0.94      0.94      0.94      2672



In [35]:
clfs = [DecisionTreeClassifier]
for clf in clfs:
    main(clf)

[0.83208955 0.87313433 0.84701493 0.86142322 0.87265918 0.88764045
 0.84644195 0.8576779  0.85393258 0.85338346]
Accurecy of DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'): 0.86 (+/- 0.03)
              precision    recall  f1-score   support

         neg       0.65      0.63      0.64       513
         pos       0.91      0.92      0.92      2159

   micro avg       0.86      0.86      0.86      2672
   macro avg       0.78      0.78      0.78      2672
weighted avg       0.86      0.86      0.86      2672



In [36]:
clfs = [RandomForestClassifier]
for clf in clfs:
    main(clf)

[0.89179104 0.87313433 0.87313433 0.88764045 0.89513109 0.88764045
 0.89513109 0.87640449 0.88389513 0.88721805]
Accurecy of RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False): 0.89 (+/- 0.02)
              precision    recall  f1-score   support

         neg       0.89      0.49      0.63       513
         pos       0.89      0.99      0.94      2159

   micro avg       0.89      0.89      0.89      2672
   macro avg       0.89      0.74      0.78      2672
weighted avg       0.89      0.89      0.88      2672

