Sveučilište u Zagrebu<br>
Fakultet elektrotehnike i računarstva

## Uvod u znanost o podacima

# Replikacija rezultata

In [1]:
import numpy as np
import math
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectKBest, mutual_info_classif
import matplotlib.pyplot as plt
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

### Priprema podataka
U članku piše da su sve riječi u body-jima svih članaka pretvorene u lowercase. Također, uklonjene su sve *stopwords*, a interpunkcijski znakovi zamijenjeni su razmacima. Dakle, prvo sam se vratila u prethodnu vježbu i pohranila modificirani dataframe koji sam tamo bila napravila u "clanci_stripped.csv" datoteku koju ću dalje koristiti.

In [2]:
clanci_stripped = pd.read_csv("clanci_stripped.csv", index_col = 0)
clanci_stripped.head()

Unnamed: 0,TOPICS,PLACES,PEOPLE,ORGS,EXCHANGES,TITLE,DATELINE,TOPICS_ENUM,FILENAME,DATE,BODY,TEXT TYPE,LEWISSPLIT,CGISPLIT
0,['cocoa'],"['el-salvador', 'usa', 'uruguay']",,,,BAHIA COCOA REVIEW,"SALVADOR, Feb 26 -",2,reut2-000.sgm,26-FEB-1987 15:01:01.79,showers continued throughout the week in the b...,0,2,1
1,,['usa'],,,,STANDARD OIL &lt;SRD> TO FORM FINANCIAL UNIT,"CLEVELAND, Feb 26 -",1,reut2-000.sgm,26-FEB-1987 15:02:20.00,standard oil co and bp north america inc said ...,0,2,1
2,,['usa'],,,,TEXAS COMMERCE BANCSHARES &lt;TCB> FILES PLAN,"HOUSTON, Feb 26 -",1,reut2-000.sgm,26-FEB-1987 15:03:27.51,texas commerce bancshares incs texas commerce ...,0,2,1
3,,"['usa', 'brazil']",,,,TALKING POINT/BANKAMERICA &lt;BAC> EQUITY OFFER,"LOS ANGELES, Feb 26 -",1,reut2-000.sgm,26-FEB-1987 15:07:13.72,bankamerica corp is not under pressure to act ...,0,2,1
4,"['grain', 'wheat', 'corn', 'barley', 'oat', 's...",['usa'],,,,NATIONAL AVERAGE PRICES FOR FARMER-OWNED RESERVE,"WASHINGTON, Feb 26 -",2,reut2-000.sgm,26-FEB-1987 15:10:44.60,the u s agriculture department reported the f...,0,2,1


Bitni su nam samo stupci TOPICS i BODY tako da ostale možemo izbaciti

In [3]:
clanci_stripped = clanci_stripped.loc[:, ["TOPICS", "BODY"]]
clanci_stripped = clanci_stripped.loc[clanci_stripped.TOPICS.notnull(), :]
#print(clanci.head(n=10))
mapa = {'TOPICS': [], 'BODY': []}
for index, row in clanci_stripped.iterrows():
    topics = row.TOPICS.split(",")
    for topic in topics:
        topic = topic.replace('[', "")
        topic = topic.replace("]", "")
        if mapa['TOPICS'] is None:
            mapa['TOPICS'] = [topic]
        else:
            mapa['TOPICS'].append(topic)
        # izbacujemo non single unicode characters
        body = row.BODY
        unfiltered_body = ""
        for word in body.split(" "):
            if len(word) > 3:
                unfiltered_body+=word
                unfiltered_body+=" "
        filtered_body = ""
        for character in unfiltered_body:
            if (character.isalnum()) or (character == ' '):
                filtered_body += character
        filtered_body = filtered_body.replace(' [ ]+', ' ')
        
        if mapa['BODY'] is None:
            mapa['BODY'] = [filtered_body]
        else:
            mapa['BODY'].append(filtered_body)
            
dataframe = pd.DataFrame(mapa)
dataframe.to_csv("clanci_reduced.csv")
dataframe.head(n=10)

Unnamed: 0,TOPICS,BODY
0,'cocoa',showers continued throughout week bahia cocoa ...
1,'grain',agriculture department reported farmer owned r...
2,'wheat',agriculture department reported farmer owned r...
3,'corn',agriculture department reported farmer owned r...
4,'barley',agriculture department reported farmer owned r...
5,'oat',agriculture department reported farmer owned r...
6,'sorghum',agriculture department reported farmer owned r...
7,'veg-oil',argentine grain board figures show crop regist...
8,'linseed',argentine grain board figures show crop regist...
9,'lin-oil',argentine grain board figures show crop regist...


## Kernels

In [None]:
!pip install Cython

In [None]:
import pyximport; pyximport.install(setup_args={"script_args":["--compiler=mingw32"],
                              "include_dirs":np.get_include()},
                  reload_support=True)
from string_kernel import ssk, string_kernel

In [None]:
print(ssk("science is organized knowledge", "wisdom is organized life", n=4, lbda=1, accum=True))

In [11]:
# WK - standard word kernel
# WK is a linear kernel that measures the similarity between documents
# that are indexed by words with tfidf weighting sheme

from sklearn.feature_extraction.text import TfidfVectorizer

# tf - term frequency
# df - document frequency
# n - total number of document

def wk_kernel(X, Y):
    n = len(X)
    kernel_matrix = np.zeros([len(X), len(X)])
    for i in range(0, len(X)):
        for j in range(0, len(X)):
            kernel_matrix[i][j] = wk(X[i], X[j])
    return kernel_matrix

In [12]:
# NGK - n-grams kernel
# NGK is a linear kernel that returns a similarity score between documents
# that are indexed by n-grams
# vrijednost jezgrene funkcije
def ngk(string1, string2):
    def ngrams(string):
        ngrams = set(())
        for n in range(1, len(string)+1):
            ngrams_helper = zip(*[string[i:] for i in range(n)])
            for ngram in ngrams_helper:
                ngrams.add(''.join(ngram))
        #print(ngrams)
        return ngrams
    
    ngrams_1 = ngrams(string1) # racuna ngrams za prvi dokument
    ngrams_2 = ngrams(string2) # racuna ngrams za drugi dokument
    
    # usporeduje broj jednakih ngrams oba dokumenta
    intercept_rez = ngrams_1.intersection(ngrams_2)
    num_common = len(intercept_rez)
    
    rez = num_common/(len(ngrams_1)+len(ngrams_2))
    rez = rez/0.5 #skaliranje
    return rez

def ngk_kernel(X1, X2):
    kernel_matrix = np.zeros([len(X1), len(X2)])
    for i in range(0, len(X1)):
        for j in range(0, len(X2)):
            kernel_matrix[i][j] = ngk(X1[i], X1[j])
    return kernel_matrix

print(ngk_kernel("car","cat"))

[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]


## Experimental Results

Ciljevi eksperimenata su:
- proučavati utjecaj promjene parametara k(duljina) i $\lambda$(težina)
- uočiti prednosti kombiniranja različitih jezgri

Eksperimenti su provedeni samo na dijelu Reuters seta. U članku piše da je subset bio veličine 470 dokumenata, od čega je 380 bilo korišteno za treniranje, a 90 za ispitivanje.

U eksperimentu su odabrane kategorij "earn", "acq", "crude" i "corn".


In [41]:
earn_clanci = dataframe[dataframe.TOPICS.str.contains("earn")]
acq_clanci = dataframe[dataframe.TOPICS.str.contains("acq")]
crude_clanci = dataframe[dataframe.TOPICS.str.contains("crude")]
corn_clanci = dataframe[dataframe.TOPICS.str.contains("corn")]

clanci = [earn_clanci, acq_clanci, crude_clanci, corn_clanci]
earn_clanci.head()

Unnamed: 0,TOPICS,BODY
19,'earn',champion products said board directors approve...
21,'earn',dlrs assets deposits loans note available year...
22,'earn',ohio mattress said first quarter ending februa...
24,'earn',oper loss profit seven oper profit profit revs...
25,'earn',revs nine mths dlrs dlrs revs billion reuter


Navedeno je da je broj članaka za pojedinu kategoriju za učenje (ispitivanje) sljedeći:
1. earn 152 (40)
2. acquisition 114 (25)
3. crude 76 (15)
4. corn 38 (10)

In [42]:
from sklearn.model_selection import train_test_split

[earn_train, earn_test] = train_test_split(earn_clanci, train_size=152/len(earn_clanci), test_size=40/len(earn_clanci))
[acq_train, acq_test] = train_test_split(acq_clanci, train_size=114/len(acq_clanci), test_size=25/len(acq_clanci))
[crude_train, crude_test] = train_test_split(crude_clanci, train_size=76/len(crude_clanci), test_size=15/len(crude_clanci))
[corn_train, corn_test] = train_test_split(corn_clanci, train_size=38/len(corn_clanci), test_size=10/len(corn_clanci))

y_train = []
y_train.extend(['earn' for i in range(0, len(earn_train))])
y_train.extend(['acq' for i in range(0, len(acq_train))])
y_train.extend(['crude' for i in range(0, len(crude_train))])
y_train.extend(['corn' for i in range(0, len(corn_train))])
y_train = np.array(y_train)
#print(y_train)

y_test = []
y_test.extend(['earn' for i in range(0, len(earn_test))])
y_test.extend(['acq' for i in range(0, len(acq_test))])
y_test.extend(['crude' for i in range(0, len(crude_test))])
y_test.extend(['corn' for i in range(0, len(corn_test))])
y_test = np.array(y_test)
#print(y_test)

clanci_train = [earn_train, acq_train, crude_train, corn_train]
clanci_train = pd.concat(clanci_train)
clanci_train.to_csv("clanci_train.csv")
clanci_train

Unnamed: 0,TOPICS,BODY
8663,'earn',caesars world said directors unanimously appro...
4287,'earn',loss five loss loss loss sales shrs year loss ...
2039,'earn',advo system said could report break even secon...
784,'earn',loss dlrs loss loss loss revs nine mths loss d...
128,'earn',oper oper revs shrs mths oper oper revs shrs n...
...,...,...
6330,'corn',french operators have requested licences expor...
7964,'corn',japan appears relying less corn from china arg...
5328,'corn',rain over wide areas raised prospect good food...
374,'corn',french operators last friday requested licence...


### Effectiveness of Varying Sequence Length

Za svaku vrijednost k, eksperiment je proveden 10 puta i onda su dobivene vrijednosti mean i sd. Lambda je postavljen na 0.5.


In [43]:
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.svm import SVC

def ssk_evaluation(category, X_test, y_true, k_range=(5, 5)):
    print("SSK, category =", category)
    print("k\tmean(F1)\tSTD(F1)\t\tmean(precision)\t\tSTD(precision)\t\tmean(recall)\t\tSTD(recall)")
    treniranje = []
    for index, row in clanci_train.iterrows():
        ''.join(r'\u{:04X}'.format(ord(chr)) for chr in test_str)
        lista = row['BODY'].split(" ")
        lista = [el for el in lista if el != '']
        treniranje.append(lista)
    treniranje = np.array(treniranje)
    treniranje = treniranje.reshape(len(treniranje), 1)
    y_treniranje = y_train.reshape(len(y_train), 1)
    
    testiranje = []
    for index, row in X_test.iterrows():
        lista = row['BODY']#.split(" ")
        testiranje.append(lista)
    testiranje = np.array(testiranje)
    testiranje = testiranje.reshape(len(testiranje), 1)
    y_true = y_true.reshape(len(y_true), 1)
    
    #print(treniranje)
    for k in range(k_range[0], k_range[1]+1):
        ssk_model = SVC(kernel=string_kernel(n=k, lbda=0.5))
        ssk_model.fit(treniranje, y_treniranje)
        f1 = []
        precision = []
        recall = []
        for i in range(0, 10):
            y_pred = ssk_model.predict(testiranje)
            f1.append(f1_score(y_true, y_pred))
            precision.append(precision(y_true, y_pred))
            recall.append(precision(y_true, y_pred))
        f1_mean = mean(f1)
        f1_sd = std(f1)
        precision_mean = mean(precision)
        precision_sd = std(precision)
        recall_mean = mean(recall)
        recall_sd = std(recall)
        print("{}\t{}\t{}\t\t{}\t\t{}\t\t{}\t\t{}\n".format(k, f1_mean, f1_std, precision_mean, precision_sd, recall_mean, recall_sd))   

ssk_evaluation("earn", earn_test, np.array(['earn' for i in range(0, len(earn_test))]), k_range=(3, 14))
ssk_evaluation("acq", acq_test, np.array(['acq' for i in range(0, len(acq_test))]), k_range=(3, 14))
ssk_evaluation("crude", crude_test, np.array(['crude' for i in range(0, len(crude_test))]), k_range=(3, 14))
ssk_evaluation("corn", corn_test, np.array(['corn' for i in range(0, len(corn_test))]), k_range=(3, 14))


SSK, category = earn
k	mean(F1)	STD(F1)		mean(precision)		STD(precision)		mean(recall)		STD(recall)
['caesars', 'world', 'said', 'directors', 'unanimously', 'approved', 'recapitalization', 'plan', 'under', 'which', 'stockholders', 'will', 'cash', 'distribution', 'dlrs', 'share', 'time', 'special', 'cash', 'dividend', 'will', 'retain', 'their', 'common', 'stock', 'ownership', 'caesars', 'world', 'caesars', 'world', 'said', 'expects', 'raise', 'approximately', 'billion', 'dlrs', 'needed', 'share', 'dividend', 'expenses', 'recapitalization', 'through', 'around', 'dlrs', 'bank', 'borrowings', 'public', 'sale', 'approximately', 'dlrs', 'debt', 'some', 'outstanding', 'debt', 'will', 'retired', 'drexel', 'burnham', 'lambert', 'caesars', 'financial', 'advisor', 'told', 'company', 'confident', 'arrange', 'entire', 'financing', 'needed', 'recapitalization', 'henry', 'gluck', 'chairman', 'chief', 'executive', 'officer', 'hotel', 'casino', 'resorts', 'company', 'said', 'statement', 'board', 'belie

ValueError: only single character unicode strings can be converted to Py_UCS4, got length 7

### Effectiveness of Varying Weight Decay Factors

k je postavljen na 5.

### Effectiveness of Combining Kernels

In [None]:
## MOJ SSK

import itertools

# SSK - string subsequence kernel
def is_subsequence(subsequence, word):
    iterator = iter(word)
    if all(c in iterator for c in subsequence):
        return True
    else:
        return False

def ssk_kernel(string1, string2, k=2, lambd=1):
    stupci = []
    tablica = {}

    for word in [string1, string2]:
        letters = list(word)
        for combination in itertools.combinations(letters, k): # nalazi sve kombinacije slova u letters duljine k
            s = ''.join(combination)
            if s not in stupci:
                stupci.append(s)
    
    #print(stupci)

    for word in [string1, string2]:
        tablica[word] = [0 for i in range(len(stupci))]
        subsequence_index = 0
        for stupac in stupci:
            if is_subsequence(stupac, word):
                cell_rez = 1
                index_slova_rijeci = 0
                for index_slova_stupca in range(len(stupac) - 1):
                    cell_rez += word.index(stupac[index_slova_stupca+1], word.index(stupac[index_slova_stupca])+1)-word.index(stupac[index_slova_stupca])
                tablica[word][subsequence_index] = pow(lambd, cell_rez)
                #print(word, tablica[word])
                # res += i.index(j[ki+1], i.index(j[ki])+1)-i.index(j[ki])
            subsequence_index += 1

    red_1 = np.array(tablica[string1])
    red_2 = np.array(tablica[string2])
    rez_1 = np.sum(red_1*red_2.T)
    rez_2 = np.sum(red_1*red_1.T)
    rez_3 = np.sum(red_2*red_2.T)
    rez = rez_1/pow(rez_2*rez_3, 0.5)

    return rez

print(ssk_kernel("car", "cat", lambd=2))

Ako treniramo modele s punim člancima, onda radi presporo tako da sam odlučila iz svakog članka izdvojiti n = *** najčešćih riječi i onda po njima uspoređivati članke

In [11]:
from collections import Counter

def return_n_most_common_words(row, n):
    words_in_row = row['BODY'].split(" ")
    count = Counter()
    for word in words_in_row:
        if len(word) > 3:
            count[word] += 1
    lista = np.array([])
    for (element, _) in count.most_common(n):
        lista = np.append(lista, element)
    #row['MOST_COMMON'] = lista
    #print(lista)
    return lista

In [12]:
earn_clanci['MOST_COMMON'] = earn_clanci.apply(lambda row: return_n_most_common_words(row, n=50), axis=1)
acq_clanci['MOST_COMMON'] = acq_clanci.apply(lambda row: return_n_most_common_words(row, n=50), axis=1)
crude_clanci['MOST_COMMON'] = crude_clanci.apply(lambda row: return_n_most_common_words(row, n=50), axis=1)
corn_clanci['MOST_COMMON'] = corn_clanci.apply(lambda row: return_n_most_common_words(row, n=50), axis=1)
earn_clanci

Unnamed: 0,TOPICS,BODY,MOST_COMMON
19,'earn',champion products inc said its board of direct...,"[said, board, stock, shares, shareholders, apr..."
21,'earn',shr cts vs dlrs net vs ...,"[dlrs, assets, deposits, loans, note, availabl..."
22,'earn',ohio mattress co said its first quarter endin...,"[said, quarter, first, acquisitions, dlrs, sea..."
24,'earn',oper shr loss two cts vs profit seven cts ...,"[profit, oper, loss, revs, shrs, mths, seven, ..."
25,'earn',shr one dlr vs cts net mln vs ...,"[revs, dlrs, nine, mths, billion, reuter]"
...,...,...,...
13058,'earn',shr loss nine cts vs loss cts net loss ...,"[loss, dlrs, capitalized, costs, nine, revs, s..."
13059,'earn',shr cts vs cts shr diluted cts vs...,"[diluted, shrs, sales, nine, mths, dlrs, reuter]"
13060,'earn',shr cts vs cts net mln vs ...,"[dlrs, sales, shrs, nine, mths, oper, billion,..."
13103,'earn',nine months ended august group shr ...,"[billion, group, nine, months, ended, august, ..."
