In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame
import itertools
import csv
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.isri import ISRIStemmer
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble.forest import RandomForestClassifier
from sklearn.model_selection import KFold

In [2]:
files = ['HTL2.csv']

In [3]:
def prepareDatasetes(files):
    sentences = []
    
    for file_name in files:
        file = open(file_name,"rt", encoding="utf-8")
        reviews = csv.reader(file)
        for text,polarity in reviews:
            sentences.append([text,polarity])
        
        df = DataFrame(sentences, columns = ['text','polarity'])
        file.close()
        return df

In [4]:
reviews = prepareDatasetes(files)

In [5]:
import mysql.connector
from mysql.connector import errorcode, connect

In [6]:
def connectMysql():
    try:
        cnn = connect(
        user ='root',
            password='',
            host='localhost',
            database='test1')
        return cnn
    except mysql.connector.Error as e:
        if e.errno == errorcode.ER_ACCESS_DENIED_ERROR:
            print ("Access denied")
        elif e.errno == errorcode.ER_BAD_DB_ERROR:
            print ("Database doesn't exist")
        else:
            print [e]

In [7]:
def createTable(sql):
    try:
        cnn = connectMysql()
        cursor = cnn.cursor()
        cursor.execute(sql)
    except:
        cnn.rollback()
    cnn.close()  

In [8]:
sql = "DROP TABLE IF EXISTS REVIEWS; CREATE TABLE REVIEWS (TEXT TEXT NOT NULL, POLARITY INT(10))"
createTable(sql)

In [9]:
def insertData(text, polarity):
    try:
        cnn = connectMysql()
        cursor = cnn.cursor()
        cursor.execute("INSERT INTO `REVIEWS` VALUES ('%s', '%s')" % (text, polarity))
        cursor.commit()
    except:
        cnn.rollback()
    cnn.close()

In [10]:
#reviews

In [11]:
for index, (text, polarity) in reviews.iterrows():
    insertData(text, polarity)

In [12]:
def getData(sql):
    cnn = connectMysql()
    cursor = cnn.cursor()
    cursor.execute(sql)
    reviews = DataFrame(cursor.fetchall())
    reviews.rename(columns={0:'text', 1:'label'}, inplace=True)
    cursor.close()
    cnn.close()
    return reviews

In [13]:
sql = "SELECT * FROM `REVIEWS`"
reviews = getData(sql)
len(reviews)

15500

In [14]:
reviews

Unnamed: 0,text,label
0,المكان الذي يمكنك فيه مراجعة الذات والتفكر هو ...,1
1,موقع رائع وحديقة رائعة ويستحق نجمةّ إضافية \n...,0
2,أسوأ فندق أقمت فيه على الإطلاق \nيستغرق تسجيل...,-1
3,بدون روح كأنه فندق ثلاثة نجوم \nبدون إدارة اح...,0
4,فندق جميل مع سوء الإدارة والخدمات. \nمن الخار...,-1
5,فندق جميل، منظر رائع من بركة السباحة (على السط...,1
6,مكان لطيف وهادئ بعيدًا عن الزحام \nعثرت على ه...,1
7,"راقي و كلاسيكي"" \nفندق كبير مصمم بفن الديكور ...",1
8,تحذير للرجال الذي يحبون تناول بعض البيرة ليلاً...,1
9,فندق رائع \nبعد 4 أيام قضيناها في ليسبوا، وصل...,1


In [15]:
def normalize(text):
    text = re.sub("[إأٱآا]", "ا", text)
    text = re.sub("ى", "ي", text)
    text = re.sub("ؤ", "ء", text)
    text = re.sub("ئ", "ء", text)
    text = re.sub("[a-zA-Z0-9_]",'',text)
    noise = re.compile(""" ّ    | # Tashdid
                             َ    | # Fatha
                             ً    | # Tanwin Fath
                             ُ    | # Damma
                             ٌ    | # Tanwin Damm
                             ِ    | # Kasra
                             ٍ    | # Tanwin Kasr
                             ْ    | # Sukun
                             ـ     # Tatwil/Kashida
                         """, re.VERBOSE)
    text = re.sub(noise, '', text)
    return(text)

In [16]:
normalize("9 hh بسم الله الرحمن الرحيم")

'  بسم الله الرحمن الرحيم'

In [17]:
def stopWordRomve(text):
    ar_stop_list = open("list.txt","r",encoding="utf8")
    stop_words = ar_stop_list.read().split('\n')
    needed_words = []
    words = word_tokenize(text)
    for w in words:
        if w not in (stop_words):
            needed_words.append(w)
    filtered_sentence = " ".join(needed_words)
    ar_stop_list.close()
    return filtered_sentence

In [18]:
stopWordRomve("بسم الله الرحمن الرحيم")

'بسم الله الرحمن الرحيم'

In [19]:
def stemming(text):
    st = ISRIStemmer()
    stemmed_words = []
    words = word_tokenize(text)
    for w in words:
        stemmed_words.append(st.stem(w))
    stemmed_sentence = " ".join(stemmed_words)
    return stemmed_sentence

In [20]:
print (stemming("بسم الله الرحمن الرحيم"))

بسم الل رحم رحم


In [21]:
def prepareDataSets(reviews):
    sentences = []
    for index, r in reviews.iterrows():
        text = stopWordRomve(r['text'])
        text = normalize(r['text'])
        text = stemming(r['text'])
        if r['label'] == -1:
            sentences.append([text,'neg'])
        elif r['label'] == 1:
            sentences.append([text,'pos'])
        elif r['label'] == 0:
            sentences.append([text,'nat'])
        
    def_sentences = DataFrame(sentences, columns=['text','label'])
    return def_sentences

In [22]:
preprocessed_reviews = prepareDataSets(reviews)
preprocessed_reviews

Unnamed: 0,text,label
0,كان الذي يمك فيه رجع ذات فكر هو كوكروبيت، غنا ...,pos
1,وقع رئع حدق رئع سحق نجم ضفي على رغم من ان غرف ...,nat
2,اسأ ندق اقم فيه على طلق غرق سجل وصل حلي 30 دقي...,neg
3,بدن روح كأن ندق ثلث نجم بدن درة احترافية، فإن ...,nat
4,ندق جمل مع سوء درة خدم . من خرج بدا مبشرا، لكن...,neg
5,ندق جميل، نظر رئع من برك سبح ( على سطح ) ! ! !...,pos
6,كان لطف هدئ بعد عن زحم عثر على هذا كان عن طرق ...,pos
7,رقي و كلس '' ندق كبر صمم بفن ديكور جمل ارت ديك...,pos
8,حذر رجل الذي يحب نول بعض بير ليل ! شيء وحد اود...,pos
9,ندق رئع بعد 4 ايم قضن في بو، وصل اخر الى نزل ....,pos


In [23]:
def featureExtraction(data):
    vectorizer = TfidfVectorizer(min_df=10, max_df=0.75, ngram_range=(1,3))
    tfidf_data = vectorizer.fit_transform(data)
    return tfidf_data

In [24]:
def learning(clf, x, y):
      
    x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(x, y, test_size=.2, random_state=42)
   
    classifer = clf()
    classifer.fit(x_train, y_train)
    predict = sklearn.model_selection.cross_val_predict(classifer, x_test, y_test, cv=10)
    
    scores = sklearn.model_selection.cross_val_score(classifer, x_test, y_test, cv=10)
    print (scores)
    print ("Accurecy of %s: %0.2f (+/- %0.2f)" % (classifer, scores.mean(), scores.std() *2))
    print (classification_report(y_test, predict))
    

In [25]:
def main(clf):
    reviews = getData("SELECT * FROM `REVIEWS`")
    preprocessed_reviews = prepareDataSets(reviews)
    data, target = preprocessed_reviews['text'], preprocessed_reviews['label']
    tfidf_data = featureExtraction(data)
    learning(clf,tfidf_data, target)
    

In [26]:
#clfs = [MultinomialNB, BernoulliNB, LogisticRegression, SGDClassifier, SVC, LinearSVC, DecisionTreeClassifier, RandomForestClassifier]

In [27]:
clfs = [MultinomialNB]

In [28]:
import warnings
warnings.filterwarnings('ignore', message='the matrix subclass is not the recommended way')

with warnings.catch_warnings():
    warnings.filterwarnings("ignore")

from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)


In [29]:
for clf in clfs:
    main(clf)

[0.70096463 0.70096463 0.70096463 0.70096463 0.70096463 0.7
 0.70226537 0.70550162 0.70226537 0.70454545]
Accurecy of MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True): 0.70 (+/- 0.00)
              precision    recall  f1-score   support

         nat       0.00      0.00      0.00       439
         neg       1.00      0.00      0.00       486
         pos       0.70      1.00      0.82      2175

   micro avg       0.70      0.70      0.70      3100
   macro avg       0.57      0.33      0.28      3100
weighted avg       0.65      0.70      0.58      3100



  'precision', 'predicted', average, warn_for)


In [30]:
clfs = [BernoulliNB]
for clf in clfs:
    main(clf)

[0.7073955  0.78456592 0.72025723 0.7266881  0.68810289 0.73870968
 0.7184466  0.72491909 0.75728155 0.74350649]
Accurecy of BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True): 0.73 (+/- 0.05)
              precision    recall  f1-score   support

         nat       0.31      0.18      0.23       439
         neg       0.69      0.46      0.55       486
         pos       0.78      0.90      0.84      2175

   micro avg       0.73      0.73      0.73      3100
   macro avg       0.59      0.51      0.54      3100
weighted avg       0.70      0.73      0.71      3100



In [31]:
clfs = [LogisticRegression]
for clf in clfs:
    main(clf)

[0.77813505 0.76848875 0.75884244 0.76527331 0.78778135 0.79354839
 0.76375405 0.77022654 0.77346278 0.78246753]
Accurecy of LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False): 0.77 (+/- 0.02)
              precision    recall  f1-score   support

         nat       0.70      0.04      0.07       439
         neg       0.87      0.44      0.58       486
         pos       0.77      1.00      0.87      2175

   micro avg       0.77      0.77      0.77      3100
   macro avg       0.78      0.49      0.51      3100
weighted avg       0.77      0.77      0.71      3100



In [32]:
clfs = [SGDClassifier]
for clf in clfs:
    main(clf)

[0.83279743 0.78456592 0.81993569 0.81672026 0.81350482 0.80322581
 0.80582524 0.8381877  0.82847896 0.8474026 ]
Accurecy of SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=None,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False): 0.82 (+/- 0.04)
              precision    recall  f1-score   support

         nat       0.46      0.25      0.32       439
         neg       0.72      0.69      0.71       486
         pos       0.87      0.96      0.91      2175

   micro avg       0.82      0.82      0.82      3100
   macro avg       0.68      0.63      0.65      3100
weighted avg       0.79      0.82      0.80      3100



In [33]:
clfs = [SVC]
for clf in clfs:
    main(clf)

[0.70096463 0.70096463 0.70096463 0.70096463 0.70096463 0.7
 0.70226537 0.70226537 0.70226537 0.70454545]
Accurecy of SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False): 0.70 (+/- 0.00)
              precision    recall  f1-score   support

         nat       0.00      0.00      0.00       439
         neg       0.00      0.00      0.00       486
         pos       0.70      1.00      0.82      2175

   micro avg       0.70      0.70      0.70      3100
   macro avg       0.23      0.33      0.27      3100
weighted avg       0.49      0.70      0.58      3100



  'precision', 'predicted', average, warn_for)


In [34]:
clfs = [LinearSVC]
for clf in clfs:
    main(clf)

[0.83601286 0.79742765 0.82636656 0.82315113 0.81350482 0.82258065
 0.81229773 0.83171521 0.82200647 0.83441558]
Accurecy of LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0): 0.82 (+/- 0.02)
              precision    recall  f1-score   support

         nat       0.52      0.19      0.28       439
         neg       0.76      0.70      0.72       486
         pos       0.85      0.98      0.91      2175

   micro avg       0.82      0.82      0.82      3100
   macro avg       0.71      0.62      0.64      3100
weighted avg       0.79      0.82      0.79      3100



In [35]:
clfs = [DecisionTreeClassifier]
for clf in clfs:
    main(clf)

[0.71382637 0.7170418  0.66881029 0.7266881  0.67202572 0.73548387
 0.70550162 0.71197411 0.67961165 0.7012987 ]
Accurecy of DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'): 0.70 (+/- 0.04)
              precision    recall  f1-score   support

         nat       0.24      0.22      0.23       439
         neg       0.53      0.54      0.54       486
         pos       0.83      0.84      0.83      2175

   micro avg       0.71      0.71      0.71      3100
   macro avg       0.53      0.53      0.53      3100
weighted avg       0.70      0.71      0.70      3100



In [36]:
clfs = [RandomForestClassifier]
for clf in clfs:
    main(clf)

[0.76848875 0.74919614 0.77813505 0.75884244 0.74919614 0.77741935
 0.76375405 0.74757282 0.75404531 0.76623377]
Accurecy of RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False): 0.76 (+/- 0.02)
              precision    recall  f1-score   support

         nat       0.36      0.07      0.12       439
         neg       0.67      0.38      0.49       486
         pos       0.78      0.98      0.86      2175

   micro avg       0.75      0.75      0.75      3100
   macro avg       0.60      0.48      0.49      3100
weighted avg       0.70      0.75      0.70      3100

