In [4]:
from sklearn.metrics import classification_report, recall_score, make_scorer, f1_score
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.utils import shuffle
from sklearn.svm import SVC
from data import *

# Arabic
def evaluate_baseline(_set, task_sign, C=3, max_features=3000):
    """Fit a bag-of-words + TF-IDF + RBF-kernel SVM baseline and print reports.

    Loads ``<_set>.train`` and ``<_set>.test`` through ``read_file``, fits the
    pipeline on the training samples, and prints a ``classification_report``
    for the training data and then for the test data. Nothing is returned.

    Args:
        _set: Dataset prefix; ".train" and ".test" are appended to it to
            build the names handed to ``read_file`` (e.g. 'ar').
        task_sign: Passed through unchanged as the second argument of
            ``read_file``.
        C: Value forwarded to ``SVC(C=...)``. Defaults to 3.
        max_features: Value forwarded to
            ``CountVectorizer(max_features=...)``. Defaults to 3000.
    """
    print("Building baseline for:", _set)

    # NOTE(review): read_file is not defined in this notebook; presumably it
    # is provided by `from data import *` -- confirm. Each returned sample is
    # indexed with the "text" and "label" keys below.
    train_samples = read_file(_set + ".train", task_sign)
    X_train = [sample["text"] for sample in train_samples]
    y_train = [sample["label"] for sample in train_samples]

    pipeline = Pipeline([
        ('bow', CountVectorizer(max_features=max_features)),
        ('tfidf', TfidfTransformer()),
        ('clf', SVC(C=C, gamma='scale', kernel='rbf')),
    ])

    print('\tTraining on', len(X_train), 'samples')
    pipeline.fit(X_train, y_train)

    # Predictions on the very data the model was fitted on: a sanity check,
    # not an estimate of generalization.
    train_predictions = pipeline.predict(X_train)
    print('-' * 40, '\nTraining data\n',
          classification_report(y_train, train_predictions, digits=3))

    # Testing: the already-fitted pipeline is applied to the test split.
    print("Evaluating SVM classifier")
    test_samples = read_file(_set + ".test", task_sign)
    X_test = [sample["text"] for sample in test_samples]
    y_test = [sample["label"] for sample in test_samples]

    test_predictions = pipeline.predict(X_test)
    print('Test data\n',
          classification_report(y_test, test_predictions, digits=3))

def main():
    """Run the baseline evaluation for the 'ar' dataset with task sign 'A'."""
    dataset_prefix, task = 'ar', 'A'
    evaluate_baseline(dataset_prefix, task)


# Only trigger the evaluation when this code is executed as the main module.
if __name__ == "__main__":
    main()

Building baseline for: ar
	Training on 5999 samples
---------------------------------------- 
Training data
               precision    recall  f1-score   support

           0      0.995     0.999     0.997      4792
           1      0.997     0.980     0.989      1207

    accuracy                          0.995      5999
   macro avg      0.996     0.990     0.993      5999
weighted avg      0.996     0.995     0.995      5999

Evaluating SVM classifier
Test data
               precision    recall  f1-score   support

           0      0.898     0.974     0.935      1619
           1      0.828     0.530     0.646       381

    accuracy                          0.889      2000
   macro avg      0.863     0.752     0.790      2000
weighted avg      0.885     0.889     0.880      2000



In [6]:
# Danish
def evaluate_baseline(_set, task_sign, C=5, max_features=3000):
    """Fit a bag-of-words + TF-IDF + RBF-kernel SVM baseline and print reports.

    Loads ``<_set>.train`` and ``<_set>.test`` through ``read_file``, fits the
    pipeline on the training samples, and prints a ``classification_report``
    for the training data and then for the test data. Nothing is returned.

    Args:
        _set: Dataset prefix; ".train" and ".test" are appended to it to
            build the names handed to ``read_file`` (e.g. 'da').
        task_sign: Passed through unchanged as the second argument of
            ``read_file``.
        C: Value forwarded to ``SVC(C=...)``. Defaults to 5.
        max_features: Value forwarded to
            ``CountVectorizer(max_features=...)``. Defaults to 3000.
    """
    print("Building baseline for:", _set)

    # NOTE(review): read_file is not defined in this notebook; presumably it
    # is provided by a star import from the project's data module -- confirm.
    # Each returned sample is indexed with the "text" and "label" keys below.
    train_samples = read_file(_set + ".train", task_sign)
    X_train = [sample["text"] for sample in train_samples]
    y_train = [sample["label"] for sample in train_samples]

    pipeline = Pipeline([
        ('bow', CountVectorizer(max_features=max_features)),
        ('tfidf', TfidfTransformer()),
        ('clf', SVC(C=C, gamma='scale', kernel='rbf')),
    ])

    print('\tTraining on', len(X_train), 'samples')
    pipeline.fit(X_train, y_train)

    # Predictions on the very data the model was fitted on: a sanity check,
    # not an estimate of generalization.
    train_predictions = pipeline.predict(X_train)
    print('-' * 40, '\nTraining data\n',
          classification_report(y_train, train_predictions, digits=3))

    # Testing: the already-fitted pipeline is applied to the test split.
    print("Evaluating SVM classifier")
    test_samples = read_file(_set + ".test", task_sign)
    X_test = [sample["text"] for sample in test_samples]
    y_test = [sample["label"] for sample in test_samples]

    test_predictions = pipeline.predict(X_test)
    print('Test data\n',
          classification_report(y_test, test_predictions, digits=3))

def main():
    """Run the baseline evaluation for the 'da' dataset with task sign 'A'."""
    dataset_prefix, task = 'da', 'A'
    evaluate_baseline(dataset_prefix, task)


# Only trigger the evaluation when this code is executed as the main module.
if __name__ == "__main__":
    main()

Building baseline for: da
	Training on 2199 samples
---------------------------------------- 
Training data
               precision    recall  f1-score   support

           0      0.995     1.000     0.998      1916
           1      1.000     0.968     0.984       283

    accuracy                          0.996      2199
   macro avg      0.998     0.984     0.991      2199
weighted avg      0.996     0.996     0.996      2199

Evaluating SVM classifier
Test data
               precision    recall  f1-score   support

           0      0.899     0.991     0.943       659
           1      0.824     0.277     0.415       101

    accuracy                          0.896       760
   macro avg      0.861     0.634     0.679       760
weighted avg      0.889     0.896     0.873       760



In [7]:
# Turkish
def evaluate_baseline(_set, task_sign, C=5, max_features=3000):
    """Fit a bag-of-words + TF-IDF + RBF-kernel SVM baseline and print reports.

    Loads ``<_set>.train`` and ``<_set>.test`` through ``read_file``, fits the
    pipeline on the training samples, and prints a ``classification_report``
    for the training data and then for the test data. Nothing is returned.

    Args:
        _set: Dataset prefix; ".train" and ".test" are appended to it to
            build the names handed to ``read_file`` (e.g. 'tr').
        task_sign: Passed through unchanged as the second argument of
            ``read_file``.
        C: Value forwarded to ``SVC(C=...)``. Defaults to 5.
        max_features: Value forwarded to
            ``CountVectorizer(max_features=...)``. Defaults to 3000.
    """
    print("Building baseline for:", _set)

    # NOTE(review): read_file is not defined in this notebook; presumably it
    # is provided by a star import from the project's data module -- confirm.
    # Each returned sample is indexed with the "text" and "label" keys below.
    train_samples = read_file(_set + ".train", task_sign)
    X_train = [sample["text"] for sample in train_samples]
    y_train = [sample["label"] for sample in train_samples]

    pipeline = Pipeline([
        ('bow', CountVectorizer(max_features=max_features)),
        ('tfidf', TfidfTransformer()),
        ('clf', SVC(C=C, gamma='scale', kernel='rbf')),
    ])

    print('\tTraining on', len(X_train), 'samples')
    pipeline.fit(X_train, y_train)

    # Predictions on the very data the model was fitted on: a sanity check,
    # not an estimate of generalization.
    train_predictions = pipeline.predict(X_train)
    print('-' * 40, '\nTraining data\n',
          classification_report(y_train, train_predictions, digits=3))

    # Testing: the already-fitted pipeline is applied to the test split.
    print("Evaluating SVM classifier")
    test_samples = read_file(_set + ".test", task_sign)
    X_test = [sample["text"] for sample in test_samples]
    y_test = [sample["label"] for sample in test_samples]

    test_predictions = pipeline.predict(X_test)
    print('Test data\n',
          classification_report(y_test, test_predictions, digits=3))

def main():
    """Run the baseline evaluation for the 'tr' dataset with task sign 'A'."""
    dataset_prefix, task = 'tr', 'A'
    evaluate_baseline(dataset_prefix, task)


# Only trigger the evaluation when this code is executed as the main module.
if __name__ == "__main__":
    main()

Building baseline for: tr
	Training on 25999 samples
---------------------------------------- 
Training data
               precision    recall  f1-score   support

           0      0.998     1.000     0.999     20973
           1      0.999     0.994     0.997      5026

    accuracy                          0.999     25999
   macro avg      0.999     0.997     0.998     25999
weighted avg      0.999     0.999     0.999     25999

Evaluating SVM classifier
Test data
               precision    recall  f1-score   support

           0      0.857     0.971     0.911      4651
           1      0.725     0.320     0.444      1105

    accuracy                          0.846      5756
   macro avg      0.791     0.646     0.678      5756
weighted avg      0.832     0.846     0.821      5756

