https://www.kaggle.com/mksaad/sentiment-analysis-in-arabic-tweets-using-sklearn

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import sklearn 
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
import random

import os


In [None]:
from database.create_sqlite_db import DimnaDatabase

In [None]:
def load(db_path):
    """Read rated comments from the database and split them into train/test sets.

    Only records whose rating is exactly 0, 2.5 or 5 are kept; they are
    labelled "negative", "moderate" and "positive" respectively. Records with
    any other rating are dropped. The per-class counts and percentages are
    printed before splitting.

    Args:
        db_path: Path passed to ``DimnaDatabase``.

    Returns:
        A tuple ``(x_train, y_train, x_test, y_test)`` of comments and string
        labels. Note the order differs from ``train_test_split``'s own
        return order. 15% of the kept records go to the test split, using a
        fixed ``random_state`` of 42.

    Raises:
        ValueError: If no record carries one of the three accepted ratings.
    """
    with DimnaDatabase(db_path) as db:
        records = db.ratings()

    label_by_rating = {0: "negative", 2.5: "moderate", 5: "positive"}
    counts = {label: 0 for label in label_by_rating.values()}
    comments = []
    labels = []

    # NOTE(review): the first two records are skipped, exactly as the original
    # code did; presumably they are header/placeholder rows - confirm against
    # DimnaDatabase.ratings().
    for _, comment, rating in records[2:]:
        label = label_by_rating.get(rating)
        if label is None:
            # Ratings other than 0, 2.5 and 5 are not part of the experiment.
            continue
        comments.append(comment)
        labels.append(label)
        counts[label] += 1

    num_total = len(comments)
    if num_total == 0:
        # Without this guard the percentage computation below would fail with
        # an uninformative ZeroDivisionError.
        raise ValueError(
            f"No records with rating 0, 2.5 or 5 were found in {db_path!r}"
        )

    num_negative = counts["negative"]
    num_moderate = counts["moderate"]
    num_positive = counts["positive"]

    print(f"Number of negative data: {num_negative} [{100*num_negative/num_total:0.2f}%]")
    print(f"Number of moderate data: {num_moderate} [{100*num_moderate/num_total:0.2f}%]")
    print(f"Number of positiv  data: {num_positive} [{100*num_positive/num_total:0.2f}%]")
    print(f"Total number of ratings: {num_total}")
    print("\n")
    x_train, x_test, y_train, y_test = train_test_split(
        comments, labels, test_size=0.15, random_state=42)

    return x_train, y_train, x_test, y_test

# define functions

In [None]:
def do_sa(n, my_classifier, name, my_data):
    """Train and evaluate one TF-IDF + classifier pipeline.

    Builds a word-level TF-IDF vectorizer covering n-grams from 1 up to ``n``,
    fits it together with ``my_classifier`` on the training split, then prints
    a classification report, the confusion matrix and a sample of the learned
    features for the test split.

    Args:
        n: Upper bound of the n-gram range; the vectorizer uses ``(1, n)``.
        my_classifier: Estimator placed as the final pipeline step.
        name: Label for this run; returned unchanged.
        my_data: Tuple ``(x_train, y_train, x_test, y_test)``.

    Returns:
        ``(name, n, accuracy, precision, recall)`` where precision and recall
        are weighted averages computed with ``zero_division=1``.
    """
    x_train, y_train, x_test, y_test = my_data
    print('parameters')
    print('n grams:', n)
    print('classifier:', my_classifier.__class__.__name__)
    print('------------------------------------')

    pipeline = Pipeline([
        ('vect', TfidfVectorizer(min_df=0.0001, max_df=0.95,
                                 analyzer='word', lowercase=False,
                                 ngram_range=(1, n))),
        ('clf', my_classifier),
    ])

    pipeline.fit(x_train, y_train)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on releases that predate it.
    vectorizer = pipeline.named_steps['vect']
    if hasattr(vectorizer, 'get_feature_names_out'):
        # tolist() yields plain Python strings that random.sample accepts.
        feature_names = vectorizer.get_feature_names_out().tolist()
    else:
        feature_names = vectorizer.get_feature_names()

    y_predicted = pipeline.predict(x_test)

    # Pin the class order explicitly. Without `labels`, scikit-learn sorts the
    # string classes alphabetically (moderate, negative, positive), so passing
    # only target_names would attach the wrong name to the first two rows.
    class_order = ['negative', 'moderate', 'positive']

    # Print the classification report
    print(metrics.classification_report(y_test, y_predicted,
                                        labels=class_order,
                                        target_names=class_order))

    # Print the confusion matrix (rows/columns follow class_order)
    cm = metrics.confusion_matrix(y_test, y_predicted, labels=class_order)
    print(cm)
    print('# of features:', len(feature_names))
    # Cap the sample size so a small vocabulary does not raise ValueError.
    sample_size = min(40, len(feature_names))
    print('sample of features:', random.sample(feature_names, sample_size))
    accuracy = accuracy_score(y_test, y_predicted)
    precision = precision_score(y_test, y_predicted, average='weighted', zero_division=1)
    recall = recall_score(y_test, y_predicted, average='weighted', zero_division=1)
    return name, n, accuracy, precision, recall


# Setup experiments 

In [None]:
# Experiment grid: each classifier below is trained and scored once per
# n-gram upper bound, always on the same train/test split.
db_path = "database/dimna.db"
ngrams = (1, 2, 3)
classifiers = [
    LinearSVC(),
    # SVC() is left out of this run.
    MultinomialNB(),
    BernoulliNB(),
    SGDClassifier(),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    KNeighborsClassifier(3),
]

# Load and split the data once so every run sees identical splits.
dataset = load(db_path)

# One (name, n, accuracy, precision, recall) tuple per run, in run order.
results = []
for max_n in ngrams:
    for clf in classifiers:
        results.append(do_sa(max_n, clf, type(clf).__name__, dataset))
        

# Results summary

In [None]:
# Print one table row per experiment stored in `results`; each entry is a
# (name, n-gram bound, accuracy, precision, recall) tuple.
header_template = '{0:30}{1:9}{2:10}{3:11}{4:10}'
row_template = '{0:25}{1:10}{2:10.3f}{3:10.3f}{4:10.3f}'

print(header_template.format('algorithm', 'ngram', 'accuracy', 'precision', 'recall'))
print('---------------------------------------------------------------------')
for row in results:
    # Positional unpacking feeds the five fields to the template in order.
    print(row_template.format(*row))
        