In [21]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from openpyxl import Workbook
from openpyxl import load_workbook
import re
from itertools import chain
from sklearn.utils import shuffle
import nltk  
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords
import string
from nltk.stem.snowball import SnowballStemmer
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.pipeline import Pipeline, FeatureUnion

In [7]:
def load_romanian_data():
    wb = load_workbook('../articole.xlsx')
    sheet = wb['Foaie1']
    data_pos=[]
    data_neg=[]
    data_doubt=[]
    value = ''
    for i in range(1, len(sheet['B'])):
        value = str(sheet['B' + str(i)].value).replace('•', '') # remove special characters
        if bool(re.match('^(?=.*[a-zA-Z])', str(sheet['B' + str(i)].value))): # check if string not empty and has letters
            if sheet['A' + str(i)].value == 1:
                data_pos.append(value)
            elif sheet['A' + str(i)].value == 0:
                data_doubt.append(value)
            elif not sheet['A' + str(i)].value:
                data_neg.append(value)
    print("data pos len: " + str(len(data_pos)))
    print( "data neg len: " + str(len(data_neg)))
    
#     low_limit = 3000
#     high_limit = 10000
#     test_pos = data_pos[:low_limit]
#     train_pos = data_pos[low_limit:high_limit]
#     test_neg = data_neg[:low_limit]
    limit = 2400
    test_pos = data_pos[:limit]
    test_neg = data_neg[:limit]
#     train_pos = data_pos[limit:]
#     train_neg = data_neg[limit:len(data_pos)] # we make sure len of neg data = len of pos data(len neg > len pos initially)

#     train_pos = [str(w).lower() for w in train_pos]
#     train_pos = ([" ".join(j for j in w.split() if len(j) >= 2) for w in train_pos])

#     train_neg = [str(w).lower() for w in train_neg]
#     train_neg = ([" ".join(j for j in w.split() if len(j) >= 2) for w in train_neg])

    with open("data/lemma/train_pos_lemma.txt") as pos_lemma:
        train_pos_lemma = pos_lemma.readlines()
    train_pos_lemma = ([" ".join(j for j in w.split() if len(j) >= 2) for w in train_pos_lemma])
    with open("data/lemma/train_neg_lemma.txt") as neg_lemma:
        train_neg_lemma = neg_lemma.readlines()
    train_neg_lemma = ([" ".join(j for j in w.split() if len(j) >= 2) for w in train_neg_lemma])
    
    test_pos = [str(w).lower() for w in test_pos]
    test_neg = [str(w).lower() for w in test_neg]
    
    return train_pos_lemma, train_neg_lemma, test_pos, test_neg

In [9]:
def prepare_train_and_test(train_pos, train_neg, test_pos, test_neg):
    X_train = list(chain(train_pos, train_neg))
    y_train = np.concatenate((np.ones(len(train_pos), int), np.zeros(len(train_neg), int)))

    X_test = list(chain(test_pos, test_neg))
    y_test = np.concatenate((np.ones(len(test_pos), int), np.zeros(len(test_neg), int)))

    X_train_shuffled, y_train_shuffled =  shuffle(X_train, y_train)
    X_test_shuffled, y_test_shuffled = shuffle(X_test, y_test)

    return X_train_shuffled, y_train_shuffled, X_test_shuffled, y_test_shuffled

In [12]:
def remove_stopwords(sentences_list, updated_stopwords):
    filtered_sentence = []
    for sentence in sentences_list:
        filtered_sentence.append([w for w in sentence if not w in updated_stopwords])
    return repair_sentence(filtered_sentence)


def remove_punctuation(from_train_data):
    table = str.maketrans('', '', string.punctuation)
    no_punctuation = [w.translate(table) for w in from_train_data]
    numbers = re.compile('[0-9]')
    plain_text = [numbers.sub("", word) for word in no_punctuation]
    return plain_text
    

def remove_spaces(from_train_data):
    clean_spaces_data = []
    for sentence in from_train_data:
        clean_spaces_data.append(re.sub(' +', ' ', sentence).rstrip().lstrip())
    return clean_spaces_data

def repair_sentence(sentence_list):
    return [' '.join(map(str, element)) for element in sentence_list]


def update_stopwords(stopwords):
    do_no_remove_these_sw = ['not', 'no', 'can','has','have','had','must','shan','do', 'should','was','were','won',
                             'are','cannot','does','ain', 'could', 'did', 'is', 'might', 'need', 'would']
    return [word for word in stopwords if word not in do_no_remove_these_sw]


def stem_words(from_text):
    stemmer = SnowballStemmer("romanian")
    stemmer2 = SnowballStemmer("romanian", ignore_stopwords=True)
    return [" ".join([stemmer.stem(word) for word in sentence.split(" ")]) for sentence in from_text]


def lemmatize_words(sentence):
    from pywsd.utils import lemmatize_sentence
    return lemmatize_sentence(sentence)


def sentence_tokenization(sentence):
    return word_tokenize(sentence)


def sentence_punct_tokenization(sentence):
    return WordPunctTokenizer().tokenize(sentence)


def sentence_split_tokenization(sentence):
    return ([i for i in re.split(' ', sentence) if i])


def remove_apostrophe_words(train):
    train = [w.replace("it's", 'it is')
                     .replace("that's", "that is")
                     .replace("it 's", 'it is')
                     .replace("that 's", "that is")
                     .replace("'ve", " have")
                     .replace("' ve", " have")
                     .replace("won't", "will not")
                     .replace("wo n't", "will not")
                     .replace("don't", "do not")
                     .replace("do n't", "do not")
                     .replace("can't", "can not")
                     .replace("ca n't", "can not")
                     .replace("sha n't", "shall not")
                     .replace("shan't", "shall not")
                     .replace("n't", " not")
                     .replace("'re", " are")
                     .replace("'d", " would")
                     .replace("'ll", " will") for w in train]
    return train

def remove_empty_sentences(X, y):
    new_X = []
    new_y = []
    for i in range(len(X)):
        if len(X[i].split()) > 1:
            new_X.append(X[i])
            new_y.append(y[i])
    return new_X, new_y

In [13]:
def feature(X_train):
    stopwords = set(nltk.corpus.stopwords.words('romanian'))
    
    tokenized_sentence = []
    for sentence in X_train:
        tokenized_sentence.append(sentence_punct_tokenization(sentence))

    # NO STOP WORDS
    train_without_stopwords = remove_stopwords(tokenized_sentence, stopwords)

    # NO PUNCTUATION
    train_without_punctuation = remove_punctuation(train_without_stopwords)
    train_clean_spaces = remove_spaces(train_without_punctuation)
    return train_clean_spaces # we choose to extract stop words

# Main

In [14]:
train_pos, train_neg, test_pos, test_neg = load_romanian_data()

data pos len: 11077
data neg len: 23509


In [15]:
X_train, y_train, X_test, y_test = prepare_train_and_test(train_pos, train_neg, test_pos, test_neg)

In [22]:
PRESETS_DICTIONARY = {'PAN18_English': {'dataset_name': 'PAN 2018 English',
                                            'word_ngram_range': (1, 3),
                                            'perform_dimentionality_reduction': True,
                                            },
                          'PAN18_Spanish': {'dataset_name': 'PAN 2018 Spanish',
                                            'word_ngram_range': (1, 2),
                                            'perform_dimentionality_reduction': False,
                                            },
                          'PAN18_Arabic': {'dataset_name': 'PAN 2018 Arabic',
                                            'word_ngram_range': (1, 2),
                                            'perform_dimentionality_reduction': False,
                                            },
                          }
PRESET = PRESETS_DICTIONARY["PAN18_English"]


word_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=PRESET['word_ngram_range'],
                                  min_df=2, use_idf=True, sublinear_tf=True)
# Build a vectorizer that splits strings into sequences of 3 to 5 characters
char_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(3, 5),
                                 min_df=2, use_idf=True, sublinear_tf=True)
# %% Trying out count vectorizer
# vectorizer = CountVectorizer(ngram_range=(1,1), analyzer='word', min_df=1)

# Build a transformer (vectorizer) pipeline using the previous analyzers
# *FeatureUnion* concatenates results of multiple transformer objects
ngrams_vectorizer = Pipeline([('feats', FeatureUnion([('word_ngram', word_vectorizer),
                                                     ('char_ngram', char_vectorizer),
                                                     ])),
                             # ('clff', LinearSVC(random_state=42))
                             ])

# Fit (learn vocabulary and IDF) and transform (transform documents to the TF-IDF matrix) the training set
X_train_ngrams_tfidf = ngrams_vectorizer.fit_transform(X_train)

In [24]:
feature_names_ngrams = [word_vectorizer.vocabulary_, char_vectorizer.vocabulary_]
feature_names_ngrams

[{'cel': 12977,
  'mai': 41546,
  'probabil': 53732,
  'caracalean': 11359,
  'intenţiona': 37311,
  'să': 62612,
  'face': 30794,
  'stângă': 61936,
  'însă': 75293,
  'din': 25609,
  'cauză': 12462,
  'că': 18129,
  'sens': 58553,
  'giratoriu': 35210,
  'fi': 31714,
  'în': 72784,
  'formă': 34303,
  'de': 20419,
  'cerc': 13178,
  'manevră': 42508,
  'dorit': 27236,
  'acesta': 1267,
  'practic': 52691,
  'imposibil': 36820,
  'cel mai': 13006,
  'mai probabil': 42125,
  'intenţiona să': 37312,
  'să face': 63161,
  'însă din': 75328,
  'din cauză': 25803,
  'cauză că': 12474,
  'sens giratoriu': 58557,
  'fi în': 33571,
  'în formă': 73473,
  'formă de': 34309,
  'de cerc': 21134,
  'de acesta': 20580,
  'acesta fi': 1365,
  'fi practic': 32946,
  'practic imposibil': 52708,
  'cel mai probabil': 13055,
  'însă din cauză': 75329,
  'din cauză că': 25811,
  'fi în formă': 33577,
  'în formă de': 73474,
  'de acesta fi': 20581,
  'fi practic imposibil': 32947,
  'deţinător': 25454,


In [26]:
X_test_ngrams_tfidf = ngrams_vectorizer.transform(X_test)

In [27]:
svd = TruncatedSVD(n_components=300, random_state=43)
# Fit the LSI model and perform dimensionality reduction
X_train_ngrams_tfidf_reduced = svd.fit_transform(X_train_ngrams_tfidf)
X_test_ngrams_tfidf_reduced = svd.transform(X_test_ngrams_tfidf)

In [28]:
# 4. Linear Support Vector Machine
svm_clf = LinearSVC(loss = 'hinge', penalty = 'l2', tol = 1e-4, max_iter = 1000)
svm_clf.fit(X_train_ngrams_tfidf_reduced, y_train)
predicted_test = svm_clf.predict(X_test_ngrams_tfidf_reduced)
predicted_train = svm_clf.predict(X_train_ngrams_tfidf_reduced)
print('========== 4. Support Vector Machine with Linear Kernel ==========')
print('The F-1 score for test query is ' + str(metrics.f1_score(y_test, predicted_test, average = 'macro')))
print('Training accuracy of SVM model is ' + str(np.mean(predicted_train == y_train)))
print('Test accuracy of SVM model is ' + str(np.mean(predicted_test == y_test)))
print('')

The F-1 score for test query is 0.6941760948348572
Training accuracy of SVM model is 0.6944032827994985
Test accuracy of SVM model is 0.6970833333333334





In [30]:
# 3. Random Forest.
rf_clf = RandomForestClassifier(n_estimators = 200, max_depth = 100)
rf_clf.fit(X_train_ngrams_tfidf_reduced, y_train)
predicted_test = rf_clf.predict(X_test_ngrams_tfidf_reduced)
predicted_train = rf_clf.predict(X_train_ngrams_tfidf_reduced)
print('========== 3. Random Forest ==========')
print('The F-1 score for test query is ' + str(metrics.f1_score(y_test, predicted_test, average = 'macro')))
print('Training accuracy of random forest model is ' + str(np.mean(predicted_train == y_train)))
print('Test accuracy of random forest model is ' + str(np.mean(predicted_test == y_test)))
print('')

The F-1 score for test query is 0.6894185112212023
Training accuracy of random forest model is 0.999601048672062
Test accuracy of random forest model is 0.6914583333333333

