In [1]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from openpyxl import Workbook
from openpyxl import load_workbook
import re
from itertools import chain
from sklearn.utils import shuffle
import nltk  
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords
import string
from nltk.stem.snowball import SnowballStemmer
from sklearn.decomposition import PCA

In [2]:
def load_romanian_data():
    wb = load_workbook('../articole.xlsx')
    sheet = wb['Foaie1']
    data_pos=[]
    data_neg=[]
    data_doubt=[]
    value = ''
    for i in range(1, len(sheet['B'])):
        value = str(sheet['B' + str(i)].value).replace('•', '') # remove special characters
        if bool(re.match('^(?=.*[a-zA-Z])', str(sheet['B' + str(i)].value))): # check if string not empty and has letters
            if sheet['A' + str(i)].value == 1:
                data_pos.append(value)
            elif sheet['A' + str(i)].value == 0:
                data_doubt.append(value)
            elif not sheet['A' + str(i)].value:
                data_neg.append(value)
    print("data pos len: " + str(len(data_pos)))
    print( "data neg len: " + str(len(data_neg)))
    
#     low_limit = 3000
#     high_limit = 10000
#     test_pos = data_pos[:low_limit]
#     train_pos = data_pos[low_limit:high_limit]
#     test_neg = data_neg[:low_limit]
    limit = 2400
    test_pos = data_pos[:limit]
    train_pos = data_pos[limit:]
    test_neg = data_neg[:limit]
    train_neg = data_neg[limit:len(data_pos)] # we make sure len of neg data = len of pos data(len neg > len pos initially)
    train_pos = [str(w).lower() for w in train_pos]
    train_pos = ([" ".join(j for j in w.split() if len(j) >= 2) for w in train_pos])

    train_neg = [str(w).lower() for w in train_neg]
    train_neg = ([" ".join(j for j in w.split() if len(j) >= 2) for w in train_neg])

    test_pos = [str(w).lower() for w in test_pos]

    test_neg = [str(w).lower() for w in test_neg]
    return train_pos, train_neg, test_pos, test_neg

In [3]:
def prepare_train_and_test(train_pos, train_neg, test_pos, test_neg):
    X_train = list(chain(train_pos, train_neg))
    y_train = np.concatenate((np.ones(len(train_pos), int), np.zeros(len(train_neg), int)))

    X_test = list(chain(test_pos, test_neg))
    y_test = np.concatenate((np.ones(len(test_pos), int), np.zeros(len(test_neg), int)))

    X_train_shuffled, y_train_shuffled =  shuffle(X_train, y_train)
    X_test_shuffled, y_test_shuffled = shuffle(X_test, y_test)

    return X_train_shuffled, y_train_shuffled, X_test_shuffled, y_test_shuffled

In [4]:
def remove_stopwords(sentences_list, updated_stopwords):
    filtered_sentence = []
    for sentence in sentences_list:
        filtered_sentence.append([w for w in sentence if not w in updated_stopwords])
    return repair_sentence(filtered_sentence)


def remove_punctuation(from_train_data):
    table = str.maketrans('', '', string.punctuation)
    no_punctuation = [w.translate(table) for w in from_train_data]
    numbers = re.compile('[0-9]')
    plain_text = [numbers.sub("", word) for word in no_punctuation]
    return plain_text
    

def remove_spaces(from_train_data):
    clean_spaces_data = []
    for sentence in from_train_data:
        clean_spaces_data.append(re.sub(' +', ' ', sentence).rstrip().lstrip())
    return clean_spaces_data

def repair_sentence(sentence_list):
    return [' '.join(map(str, element)) for element in sentence_list]


def update_stopwords(stopwords):
    do_no_remove_these_sw = ['not', 'no', 'can','has','have','had','must','shan','do', 'should','was','were','won',
                             'are','cannot','does','ain', 'could', 'did', 'is', 'might', 'need', 'would']
    return [word for word in stopwords if word not in do_no_remove_these_sw]


def stem_words(from_text):
    stemmer = SnowballStemmer("romanian")
    stemmer2 = SnowballStemmer("romanian", ignore_stopwords=True)
    return [" ".join([stemmer.stem(word) for word in sentence.split(" ")]) for sentence in from_text]


def lemmatize_words(sentence):
    from pywsd.utils import lemmatize_sentence
    return lemmatize_sentence(sentence)


def sentence_tokenization(sentence):
    return word_tokenize(sentence)


def sentence_punct_tokenization(sentence):
    return WordPunctTokenizer().tokenize(sentence)


def sentence_split_tokenization(sentence):
    return ([i for i in re.split(' ', sentence) if i])


def remove_apostrophe_words(train):
    train = [w.replace("it's", 'it is')
                     .replace("that's", "that is")
                     .replace("it 's", 'it is')
                     .replace("that 's", "that is")
                     .replace("'ve", " have")
                     .replace("' ve", " have")
                     .replace("won't", "will not")
                     .replace("wo n't", "will not")
                     .replace("don't", "do not")
                     .replace("do n't", "do not")
                     .replace("can't", "can not")
                     .replace("ca n't", "can not")
                     .replace("sha n't", "shall not")
                     .replace("shan't", "shall not")
                     .replace("n't", " not")
                     .replace("'re", " are")
                     .replace("'d", " would")
                     .replace("'ll", " will") for w in train]
    return train

In [5]:
def remove_empty_sentences(X, y):
    new_X = []
    new_y = []
    for i in range(len(X)):
        if len(X[i].split()) > 1:
            new_X.append(X[i])
            new_y.append(y[i])
    return new_X, new_y

In [6]:
def feature(X_train):
    stopwords = set(nltk.corpus.stopwords.words('romanian'))
    
    tokenized_sentence = []
    for sentence in X_train:
        tokenized_sentence.append(sentence_punct_tokenization(sentence))

    # NO STOP WORDS
    train_without_stopwords = remove_stopwords(tokenized_sentence, stopwords)

    # NO PUNCTUATION
    train_without_punctuation = remove_punctuation(train_without_stopwords)
    train_clean_spaces = remove_spaces(train_without_punctuation)

    # STEM WORDS
#     stemmed_train = stem_words(X_train)

    # LEMM WORDS
#     lemmatized_train = ([lemmatize_words(sentence) for sentence in X_train])

    # APPLY LITTLE FEATURES
    # feat_train = remove_stopwords(tokenized_sentence, updated_stopwords)
    # feat_train = remove_punctuation(feat_train)
    # feat_train = stem_words(feat_train)
    # feat_train = ([lemmatize_words(sentence) for sentence in feat_train])

    # return based on what features Ii want to extract
    # there are 5 possible ways of returning: extract stop words, extract punctuation,
    # word stem, word lemm, and all the features
    return train_clean_spaces # we choose to extract stop words

In [7]:
# Fetch the text data.
newsgroups_train = fetch_20newsgroups(subset = 'train', shuffle = True)
newsgroups_test = fetch_20newsgroups(subset = 'test', shuffle = True)

In [8]:
train_pos, train_neg, test_pos, test_neg = load_romanian_data()

data pos len: 11077
data neg len: 23509


In [9]:
with open("data/lemma/train_pos_lemma.txt") as pos_lemma:
    train_pos_lemma = pos_lemma.readlines()
with open(data/lemma/"train_neg_lemma.txt") as neg_lemma:
    train_neg_lemma = neg_lemma.readlines()

In [10]:
X_train, y_train, X_test, y_test = prepare_train_and_test(train_pos_lemma, train_neg_lemma, test_pos, test_neg)

In [11]:
feat_train = feature(X_train)
# feat_train, y_train = remove_empty_sentences(feat_train, y_train)

In [13]:
# Feature extraction (text vectorization) using term frequency - inverse document frequency.
tfidf_vect = TfidfVectorizer(ngram_range = (1,3))
pca = PCA(n_components=2)

X_train = remove_punctuation(X_train)
X_train = remove_spaces(X_train)

x_train = tfidf_vect.fit_transform(X_train)
x_test = tfidf_vect.transform(X_test)

# x_train_pca = pca.fit_transform(x_train)
# x_test_pca = pca.transform(x_test)


print('Number of training data is ' + str(x_train.shape[0]))
print('Number of test data is ' + str(x_test.shape[0]))
print('Data dimension is ' + str(x_train.shape[1]))
print()

TypeError: PCA does not support sparse input. See TruncatedSVD for a possible alternative.

In [None]:
idx=4
# print(str(y_train[idx]) + "\t" + feat_train[idx])
x_train

In [None]:

# 1. Multinomial Naive Bayes.
NB_clf = MultinomialNB(alpha = 0.03).fit(x_train, y_train)
predicted_test = NB_clf.predict(x_test)
predicted_train = NB_clf.predict(x_train)
print('========== 1. Multinomial Naive Bayes ==========')
print('The F-1 score for test query is ' + str(metrics.f1_score(y_test, predicted_test, average = 'macro')))
print('Training accuracy of naive bayes model is ' + str(np.mean(predicted_train == y_train)))
print('Test accuracy of naive bayes model is ' + str(np.mean(predicted_test == y_test)))
print('')

In [None]:
# 2. K-Nearest-Neighbors.
knn_clf = KNeighborsClassifier(n_neighbors = 2000)
knn_clf.fit(x_train, y_train)
predicted_test = knn_clf.predict(x_test)
predicted_train = knn_clf.predict(x_train)
print('========== 2. K-Nearest-Neighbors ==========')
print('The F-1 score for test query is ' + str(metrics.f1_score(y_test, predicted_test, average = 'macro')))
print('Training accuracy of KNN model is ' + str(np.mean(predicted_train == y_train)))
print('Test accuracy of KNN model is ' + str(np.mean(predicted_test == y_test)))
print('')

In [None]:
feat_train[:3]

In [None]:
# 3. Random Forest.
rf_clf = RandomForestClassifier(n_estimators = 200, max_depth = 100)
rf_clf.fit(x_train, y_train)
predicted_test = rf_clf.predict(x_test)
predicted_train = rf_clf.predict(x_train)
print('========== 3. Random Forest ==========')
print('The F-1 score for test query is ' + str(metrics.f1_score(y_test, predicted_test, average = 'macro')))
print('Training accuracy of random forest model is ' + str(np.mean(predicted_train == y_train)))
print('Test accuracy of random forest model is ' + str(np.mean(predicted_test == y_test)))
print('')

In [None]:
# 4. Linear Support Vector Machine
svm_clf = LinearSVC(loss = 'hinge', penalty = 'l2', tol = 1e-4, max_iter = 1000)
svm_clf.fit(x_train, y_train)
predicted_test = svm_clf.predict(x_test)
predicted_train = svm_clf.predict(x_train)
print('========== 4. Support Vector Machine with Linear Kernel ==========')
print('The F-1 score for test query is ' + str(metrics.f1_score(y_test, predicted_test, average = 'macro')))
print('Training accuracy of SVM model is ' + str(np.mean(predicted_train == y_train)))
print('Test accuracy of SVM model is ' + str(np.mean(predicted_test == y_test)))
print('')