<a href="https://colab.research.google.com/github/mansisinha/taddhita/blob/master/hyperparameter_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from scipy.sparse import csr_matrix, lil_matrix
from scipy import sparse
from abc import ABCMeta, abstractmethod
from scipy.stats import randint as sp_randint
from sklearn.model_selection import RandomizedSearchCV
import random
import sklearn as sk
from sklearn.linear_model import LinearRegression, LassoLarsCV, RidgeCV
from sklearn.linear_model.base import LinearClassifierMixin, SparseCoefMixin, BaseEstimator

In [0]:
class Base():
    __metaclass__ = ABCMeta
    def __init__(self,graph,max_iter=30):
        self.max_iter = max_iter
        self.graph = graph

    @abstractmethod
    def _build_propagation_matrix(self):
        raise NotImplementedError("Propagation matrix construction must be implemented to fit a model.")

    @abstractmethod
    def _build_base_matrix(self):
        raise NotImplementedError("Base matrix construction must be implemented to fit a model.")

    def _init_label_matrix(self):
        n_samples = self.graph.shape[0]
        n_classes = self.y_.max()+1
        return np.zeros((n_samples,n_classes))

    def _arrange_params(self):
        """Do nothing by default"""
        pass

    def fit(self,x,y):
        """Fit a graph-based semi-supervised learning model

        All the input data is provided array X (labeled samples only)
        and corresponding label array y.

        Parameters
        ----------
        x : array_like, shape = [n_labeled_samples]
            Node IDs of labeled samples
        y : array_like, shape = [n_labeled_samples]
            Label IDs of labeled samples

        Returns
        -------
        self : returns an instance of self.
        """
        self.x_ = x
        self.y_ = y

        self._arrange_params()

        self.F_ = self._init_label_matrix()

        self.P_ = self._build_propagation_matrix()
        self.B_ = self._build_base_matrix()

        remaining_iter = self.max_iter
        while remaining_iter > 0:
            self.F_ = self._propagate()
            remaining_iter -= 1

        return self

    def _propagate(self):
        return self.P_.dot(self.F_) + self.B_

    def predict(self,x):
        """Performs prediction based on the fitted model

        Parameters
        ----------
        x : array_like, shape = [n_samples]
            Node IDs

        Returns
        -------
        y : array_like, shape = [n_samples]
            Predictions for input node IDs
        """
        probas = self.predict_proba(x)
        return np.argmax(probas,axis=1)

    def predict_proba(self,x):
        """Predict probability for each possible label

        Parameters
        ----------
        x : array_like, shape = [n_samples]
            Node IDs

        Returns
        -------
        probabilities : array_like, shape = [n_samples, n_classes]
            Probability distributions across class labels
        """
        return (self.F_[x].T / np.sum(self.F_[x], axis=1)).T


class LGC(Base):
    """Local and Global Consistency (LGC) for GBSSL

    Parameters
    ----------
    alpha : float
      clamping factor
    max_iter : float
      maximum number of iterations allowed

    Attributes
    ----------
    x_ : array, shape = [n_samples]
        Input array of node IDs.

    Examples
    --------
    <<<

    References
    ----------
    Zhou, D., Bousquet, O., Lal, T. N., Weston, J., & Schölkopf, B. (2004).
    Learning with local and global consistency.
    Advances in neural information processing systems, 16(16), 321-328.
    """

    def __init__(self,graph,alpha=0.99,max_iter=30):
        super(LGC, self).__init__(graph,max_iter=30)
        self.alpha=alpha

    def _build_propagation_matrix(self):
        """ LGC computes the normalized Laplacian as its propagation matrix"""
        D2 = np.sqrt(sparse.diags((1.0/(self.graph.sum(1))).T.tolist()[0],offsets=0))
        S = D2.dot(self.graph).dot(D2)
        return self.alpha*S

    def _build_base_matrix(self):
        n_samples = self.graph.shape[0]
        n_classes = self.y_.max()+1
        B = np.zeros((n_samples,n_classes))
        B[self.x_,self.y_] = 1
        return (1-self.alpha)*B

class HMN(Base):
    """Harmonic funcsion (HMN) for GBSSL

    Parameters
    ----------
    max_iter : float
      maximum number of iterations allowed

    Attributes
    ----------
    x_ : array, shape = [n_samples]
        Input array of node IDs.

    Examples
    --------
    <<<

    References
    ----------
    Zhu, X., Ghahramani, Z., & Lafferty, J. (2003, August).
    Semi-supervised learning using gaussian fields and harmonic functions.
    In ICML (Vol. 3, pp. 912-919).
    """

    def _build_propagation_matrix(self):
        D = sparse.diags((1.0/(self.graph.sum(1))).T.tolist()[0],offsets=0)
        P = D.dot(self.graph)
        P[self.x_] = 0
        return P

    def _build_base_matrix(self):
        n_samples = self.graph.shape[0]
        n_classes = self.y_.max()+1
        B = np.zeros((n_samples,n_classes))
        B[self.x_,self.y_] = 1
        return B

class PARW(Base):
    """Partially Absorbing Random Walk (PARW) for GBSSL

    Parameters
    ----------
    lamb: float (default=0.001)
      Absorbing parameter
    max_iter : float
      maximum number of iterations allowed

    Attributes
    ----------
    x_ : array, shape = [n_samples]
        Input array of node IDs.

    Examples
    --------
    <<<

    References
    ----------
    Wu, X. M., Li, Z., So, A. M., Wright, J., & Chang, S. F. (2012).
    Learning with partially absorbing random walks.
    In Advances in Neural Information Processing Systems (pp. 3077-3085).
    """
    def __init__(self,graph,lamb=1.0,max_iter=30):
        super(PARW, self).__init__(graph,max_iter=30)
        self.lamb=lamb

    def _build_propagation_matrix(self):
        d = np.array(self.graph.sum(1).T)[0]
        Z = sparse.diags(1.0 / (d+self.lamb))
        P = Z.dot(self.graph)
        return P

    def _build_base_matrix(self):
        n_samples = self.graph.shape[0]
        n_classes = self.y_.max()+1
        B = np.zeros((n_samples,n_classes))
        B[self.x_,self.y_] = 1
        d = np.array(self.graph.sum(1).T)[0]
        Z = sparse.diags(1.0 / (d+self.lamb))
        Lamb = sparse.diags([self.lamb],shape=(n_samples,n_samples))
        return Z.dot(Lamb).dot(B)

class MAD(Base):
    """Modified Adsorption (MAD) for GBSSL

    Parameters
    ----------
    mu : array, shape = [3] > 0 (default = [1.0, 0.5, 1.0])
      Define importance among inj, cont, and abnd
    beta : float
      Used to determine p_inj_, p_cont_ and p_abnd_
    max_iter : float
      maximum number of iterations allowed

    Attributes
    ----------
    x_ : array, shape = [n_samples]
        Input array of node IDs.
    p_inj_ : array, shape = [n_samples]
      Probability to inject
    p_cont_ : array, shape = [n_samples]
      Probability to continue random walk
    p_abnd_ : array, shape = [n_samples]
        defined as 1 - p_inj - p_cont

    Examples
    --------
    <<<

    References
    ----------
    Talukdar, P. P., & Crammer, K. (2009).
    New regularized algorithms for transductive learning.
    In Machine Learning and Knowledge Discovery in Databases (pp. 442-457). Springer Berlin Heidelberg.
    """
    def __init__(self,graph,mu=np.array([1.0,0.5,1.0]),beta=2.0,max_iter=30):
        super(MAD, self).__init__(graph,max_iter=30)
        self.mu = mu
        self.beta = beta

    def _init_label_matrix(self):
        n_samples = self.graph.shape[0]
        n_classes = self.y_.max()+1
        return np.zeros((n_samples,n_classes+1)) # including dummy label

    def _build_normalization_term(self):
        W = self.graph.T.multiply(sparse.csr_matrix(self.p_cont_)).T
        d = np.array(W.sum(1).T)[0]
        dT = np.array(W.sum(0))[0]
        return sparse.diags(1.0/(self.mu[0]*self.p_inj_ + self.mu[1]*(d+dT) + self.mu[2]))

    def _build_propagation_matrix(self):
        Z = self._build_normalization_term()
        W = self.graph.T.multiply(sparse.csr_matrix(self.p_cont_)).T
        WT = W.T
        return Z.dot(self.mu[1]*(W+WT))

    def _build_base_matrix(self):
        n_samples = self.graph.shape[0]
        n_classes = self.y_.max()+1
        B = np.zeros((n_samples,n_classes+1)) # including dummy label
        B[self.x_,self.y_] = 1
        Z = self._build_normalization_term()
        S = sparse.diags(self.p_inj_)
        R = np.zeros((n_samples,n_classes+1))
        R[:,-1] = self.p_abnd_
        return Z.dot(self.mu[0]*S.dot(B)+self.mu[2]*R)

    def _arrange_params(self):
        P = sparse.csr_matrix(self.graph / np.maximum(self.graph.sum(1),1))
        logP = P.copy()
        logP.data = np.log(logP.data)
        H = - np.array(P.multiply(logP).sum(1).T)[0]
        c = np.log(self.beta) / np.log(self.beta+np.exp(H))
        d = np.zeros(self.graph.shape[0])
        d[self.x_] = (1-c[self.x_]) * np.sqrt(H[self.x_])
        z = np.maximum(c+d,1)
        self.p_inj_ = d / z
        self.p_cont_ = c / z
        self.p_abnd_ = 1 - self.p_inj_ - self.p_cont_

    def predict_proba(self,x):
        """Predict probability for each possible label

        Parameters
        ----------
        x : array_like, shape = [n_samples]
            Node IDs

        Returns
        -------
        probabilities : array_like, shape = [n_samples, n_classes]
            Probability distributions across class labels
        """
        return (self.F_[x,:-1].T / np.sum(self.F_[x,:-1], axis=1)).T

class OMNIProp(Base):
    """OMNI-Prop for GBSSL

    Parameters
    ----------
    lamb : float > 0 (default = 1.0)
      Define importance between prior and evidence from neighbors
    max_iter : float
      maximum number of iterations allowed

    Attributes
    ----------
    x_ : array, shape = [n_samples]
        Input array of node IDs.

    Examples
    --------
    <<<

    References
    ----------
    Yamaguchi, Y., Faloutsos, C., & Kitagawa, H. (2015, February).
    OMNI-Prop: Seamless Node Classification on Arbitrary Label Correlation.
    In Twenty-Ninth AAAI Conference on Artificial Intelligence.
    """

    def __init__(self,graph,lamb=1.0,max_iter=30):
        super(OMNIProp,self).__init__(graph,max_iter)
        self.lamb = lamb

    def _build_propagation_matrix(self):
        d = np.array(self.graph.sum(1).T)[0]
        dT = np.array(self.graph.sum(0))[0]
        Q = (sparse.diags(1.0/(d+self.lamb)).dot(self.graph)).dot(sparse.diags(1.0/(dT+self.lamb)).dot(self.graph.T))
        Q[self.x_] = 0
        return Q

    def _build_base_matrix(self):
        n_samples = self.graph.shape[0]
        n_classes = self.y_.max()+1
        unlabeled = np.setdiff1d(np.arange(n_samples),self.x_)

        dU = np.array(self.graph[unlabeled].sum(1).T)[0]
        dT = np.array(self.graph.sum(0))[0]
        n_samples = self.graph.shape[0]
        r = sparse.diags(1.0/(dU+self.lamb)).dot(self.lamb*self.graph[unlabeled].dot(sparse.diags(1.0/(dT+self.lamb))).dot(np.ones(n_samples))+self.lamb)

        b = np.ones(n_classes) / float(n_classes)

        B = np.zeros((n_samples,n_classes))
        B[unlabeled] = np.outer(r,b)
        B[self.x_,self.y_] = 1
        return B

class CAMLP(Base):
    """Confidence-Aware Modulated Label Propagation (CAMLP) for GBSSL

    Parameters
    ----------
    beta : float > 0 (default = 0.1)
      Define importance between prior and evidence from neighbors
    H : array_like, shape = [n_classes, n_classes]
      Define affinities between labels
      if None, identity matrix is set
    max_iter : float
      maximum number of iterations allowed

    Attributes
    ----------
    x_ : array, shape = [n_samples]
        Input array of node IDs.

    Examples
    --------
    <<<

    References
    ----------
    Yamaguchi, Y., Faloutsos, C., & Kitagawa, H. (2016, May).
    CAMLP: Confidence-Aware Modulated Label Propagation.
    In SIAM International Conference on Data Mining.
    """

    def __init__(self,graph,beta=0.1,H=None,max_iter=30):
        super(CAMLP,self).__init__(graph,max_iter)
        self.beta=beta
        self.H=H

    def _arrange_params(self):
        if self.H == None:
            n_classes = self.y_.max()+1
            self.H = np.identity(n_classes)

    def _propagate(self):
        return self.P_.dot(self.F_).dot(self.H) + self.B_

    def _build_normalization_term(self):
        d = np.array(self.graph.sum(1).T)[0]
        return sparse.diags(1.0/(1.0+d*self.beta))

    def _build_propagation_matrix(self):
        Z = self._build_normalization_term()
        return Z.dot(self.beta*self.graph)

    def _build_base_matrix(self):
        n_samples = self.graph.shape[0]
        n_classes = self.y_.max()+1
        B = np.ones((n_samples,n_classes))/float(n_classes)
        B[self.x_] = 0
        B[self.x_,self.y_] = 1
        Z = self._build_normalization_term()
        return Z.dot(B)


In [0]:
!pip install -q xlrd
!git clone https://github.com/mansisinha/taddhita.git
  

[31mtwisted 18.7.0 requires PyHamcrest>=1.9.0, which is not installed.[0m
[31mgrin 1.2.1 requires argparse>=1.1, which is not installed.[0m
[33mYou are using pip version 10.0.1, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
fatal: destination path 'taddhita' already exists and is not an empty directory.


In [0]:
!ls taddhita/newPairs.txt

taddhita/newPairs.txt


In [0]:
import pandas as pd
url='https://raw.githubusercontent.com/mansisinha/taddhita/master/allPairs.csv'

df=pd.read_csv(url)
df.drop(['affix','word','sense','meaning','Source Word Count','Derived word count','word_y'],axis=1,inplace=True)
#df['n']=1
df
#df.groupby('Affix').count()

Unnamed: 0,Affix,Source Word,Derived Word
0,ly,selfish,selfishly
1,ly,virulent,virulently
2,ly,gentle,gently
3,ly,harmonious,harmoniously
4,ly,guarded,guardedly
5,ly,gushing,gushingly
6,ly,habitual,habitually
7,ly,handy,handily
8,ly,haphazard,haphazardly
9,ly,happy,happily


In [0]:
import pandas as pd
df2=pd.read_csv('taddhita/newPairs.txt',header=None)
df2

Unnamed: 0,0,1,2
0,Free,Freeman,man
1,Free,Freedom,dom
2,of,ofor,or
3,of,ofer,er
4,of,ofan,an
5,the,theic,ic
6,the,theism,ism
7,the,thester,ster
8,the,theist,ist
9,the,theo,o


In [0]:
df2.drop(df2.index[13544], inplace=True)
df2

Unnamed: 0,0,1,2
0,Free,Freeman,man
1,Free,Freedom,dom
2,of,ofor,or
3,of,ofer,er
4,of,ofan,an
5,the,theic,ic
6,the,theism,ism
7,the,thester,ster
8,the,theist,ist
9,the,theo,o


In [0]:
df2.groupby(2).count()

Unnamed: 0_level_0,0,1
2,Unnamed: 1_level_1,Unnamed: 2_level_1
affix,3,3
age,214,214
al,377,377
ally,123,123
an,648,648
ation,118,118
dom,118,118
ee,206,206
en,996,996
er,1289,1289


In [0]:
df2['n']=0
df2

Unnamed: 0,0,1,2,n
0,Free,Freeman,man,0
1,Free,Freedom,dom,0
2,of,ofor,or,0
3,of,ofer,er,0
4,of,ofan,an,0
5,the,theic,ic,0
6,the,theism,ism,0
7,the,thester,ster,0
8,the,theist,ist,0
9,the,theo,o,0


In [0]:
df['indf']=1
df3=pd.merge(df2,df,how='left',left_on=[0,1,2],right_on=['Source Word','Derived Word','Affix'])
df3['n']=df3['n']+df3['indf'].fillna(0)
df3.drop(['Source Word','Derived Word','Affix','indf'],axis=1,inplace=True)
df3.columns=['Source Word','Derived Word','Affix','n']
df.drop(['indf'],axis=1,inplace=True)
df3

Unnamed: 0,Source Word,Derived Word,Affix,n
0,Free,Freeman,man,0.0
1,Free,Freedom,dom,0.0
2,of,ofor,or,0.0
3,of,ofer,er,0.0
4,of,ofan,an,0.0
5,the,theic,ic,0.0
6,the,theism,ism,0.0
7,the,thester,ster,0.0
8,the,theist,ist,0.0
9,the,theo,o,0.0


In [0]:
df3.groupby('n').count()

Unnamed: 0_level_0,Source Word,Derived Word,Affix
n,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,13446,13446,13446
1.0,1038,1038,1038


In [0]:
df['n']=1
comb = [df, df3]
df_3 = pd.concat(comb).drop_duplicates().reset_index(drop=True)
df_3

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Affix,Derived Word,Source Word,n
0,ly,selfishly,selfish,1.0
1,ly,virulently,virulent,1.0
2,ly,gently,gentle,1.0
3,ly,harmoniously,harmonious,1.0
4,ly,guardedly,guarded,1.0
5,ly,gushingly,gushing,1.0
6,ly,habitually,habitual,1.0
7,ly,handily,handy,1.0
8,ly,haphazardly,haphazard,1.0
9,ly,happily,happy,1.0


In [0]:
df_3['counts']=df_3.groupby('Affix')['n'].transform('count')
df_3['m']=1
cols=['Affix','n','counts']
cols1=['m']
df_3[['{}_sum'.format(x) for x in cols1]] = df_3.groupby(cols)[cols1].transform('sum')
df_3=df_3.sort_values('counts').reset_index(drop=True)
df_3

Unnamed: 0,Affix,Derived Word,Source Word,n,counts,m,m_sum
0,affix,nonaffix,non,0.0,3,1,3
1,affix,treeaffix,tree,0.0,3,1,3
2,affix,reaffix,re,0.0,3,1,3
3,proof,starproof,star,0.0,101,1,84
4,proof,shrinkproof,shrink,0.0,101,1,84
5,proof,pigproof,pig,0.0,101,1,84
6,proof,snakeproof,snake,0.0,101,1,84
7,proof,stabproof,stab,0.0,101,1,84
8,proof,dustproof,dust,0.0,101,1,84
9,proof,smokeproof,smoke,0.0,101,1,84


In [0]:
df_3.groupby('n').count()

Unnamed: 0_level_0,Affix,Derived Word,Source Word,counts,m,m_sum
n,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.0,13446,13446,13446,13446,13446,13446
1.0,7515,7515,7515,7515,7515,7515


In [0]:
df_3.loc[~(df_3['counts']<20)]
df4=df_3.loc[~(df_3['m_sum']<10)]
#df4=df_3.loc[~(df_3['Derived Word']=='nan')]
#df4=df_3.loc[~(df_3['Source Word']=='nan')]
df_final=df4.drop(['counts','m','m_sum'],axis=1).reset_index(drop=True)
df_final
    

Unnamed: 0,Affix,Derived Word,Source Word,n
0,proof,starproof,star,0.0
1,proof,shrinkproof,shrink,0.0
2,proof,pigproof,pig,0.0
3,proof,snakeproof,snake,0.0
4,proof,stabproof,stab,0.0
5,proof,dustproof,dust,0.0
6,proof,smokeproof,smoke,0.0
7,proof,touchproof,touch,0.0
8,proof,coldproof,cold,0.0
9,proof,heelproof,heel,0.0


In [0]:
df

Unnamed: 0,Affix,Source Word,Derived Word,n
0,ly,selfish,selfishly,1
1,ly,virulent,virulently,1
2,ly,gentle,gently,1
3,ly,harmonious,harmoniously,1
4,ly,guarded,guardedly,1
5,ly,gushing,gushingly,1
6,ly,habitual,habitually,1
7,ly,handy,handily,1
8,ly,haphazard,haphazardly,1
9,ly,happy,happily,1


In [0]:
df_final

Unnamed: 0,Affix,Derived Word,Source Word,n
0,proof,starproof,star,0.0
1,proof,shrinkproof,shrink,0.0
2,proof,pigproof,pig,0.0
3,proof,snakeproof,snake,0.0
4,proof,stabproof,stab,0.0
5,proof,dustproof,dust,0.0
6,proof,smokeproof,smoke,0.0
7,proof,touchproof,touch,0.0
8,proof,coldproof,cold,0.0
9,proof,heelproof,heel,0.0


In [0]:
df=df.drop(['n'],axis=1)
df_final=df_final.drop(['n'],axis=1)
df_final

Unnamed: 0,Affix,Derived Word,Source Word
0,proof,starproof,star
1,proof,shrinkproof,shrink
2,proof,pigproof,pig
3,proof,snakeproof,snake
4,proof,stabproof,stab
5,proof,dustproof,dust
6,proof,smokeproof,smoke
7,proof,touchproof,touch
8,proof,coldproof,cold
9,proof,heelproof,heel


In [0]:
from gensim.models.wrappers import FastText
model = FastText.load_fasttext_format('cc.en.300.bin', encoding="utf8")

In [0]:
y_label=np.zeros(df_final.shape[0],dtype=int)
cols = ['Source Word','Derived Word','Affix']
merged = pd.merge(df_final[cols], df, on=cols, how='outer', indicator=True)
for i in range (df_final.shape[0]):
    if merged['_merge'][i]=='both':
        y_label[i]=1
print(y_label)

[0 0 0 ... 1 1 1]


In [0]:
def substract(a, b):                              
    return a[len(b):]
def compute(x,y):
    for i in range (min((len(x)),len(y))):
        if x[i]!=y[i]:
            i-=1
            break
    return (substract(x, x[:i+1]),substract(y, x[:i+1]))

In [0]:
pattern=[]
for i in range (df_final.shape[0]):
    pattern.append(compute(df_final['Source Word'][i], df_final['Derived Word'][i]))
df_final['pattern'] = pattern
df_final.head(50)

Unnamed: 0,Affix,Derived Word,Source Word,pattern
0,proof,starproof,star,"(, proof)"
1,proof,shrinkproof,shrink,"(, proof)"
2,proof,pigproof,pig,"(, proof)"
3,proof,snakeproof,snake,"(, proof)"
4,proof,stabproof,stab,"(, proof)"
5,proof,dustproof,dust,"(, proof)"
6,proof,smokeproof,smoke,"(, proof)"
7,proof,touchproof,touch,"(, proof)"
8,proof,coldproof,cold,"(, proof)"
9,proof,heelproof,heel,"(, proof)"


In [0]:
df_final, y_label = shuffle(df_final, y_label, random_state=0)

In [0]:
sourcewords = []
derivedwords=[]
embedding1 = np.array([])
embedding2 = np.array([])
i = 0
k=0
words =[]
for word in df_final['Source Word']:
    # Getting token 
    # Appending the vectors
    try:
        embedding1 = np.append(embedding1, model[word])
    except KeyError:
        print(word)
        k +=1
        sourcewords.append(word)
        embedding1 = np.append(embedding1, np.zeros(300))
    i += 1

print(k)
print(i)
i = 0
k=0
for word in df_final['Derived Word']:
    # Getting token 
    # Appending the vectors 
    try:
        embedding2 = np.append(embedding2, model[word])
    except KeyError:
        k+=1
        print(word)
        derivedwords.append(word)
        embedding2 = np.append(embedding2, np.zeros(300))
    i += 1
print(k)
print(i)

# Reshaping the embedding vector 
embedding1 = embedding1.reshape(-1, 300)
embedding2 = embedding2.reshape(-1, 300)

ädel


  ngrams = [ng for ng in ngrams if ng in self.ngrams]


Tomé
hår
juiz
täv
dygn
xamül
œil
bäled
9
20958
nuen
ieie
teie
anuo
hocly
Xism
qor
IRCer
Aive
Toméan
DVDen
OAer
nuie
syy
håren
KSer
fyal
foxo
duic
qen
eoan
SIer
mopo
qism
xamülan
sawo
begy
ICQer
deie
gnuen
jish
pipy
myie
voyo
vifo
zive
vaen
sewo
yaal
Jewdom
taoy
sevy
pezo
FBIer
cior
45
20958


In [0]:
from collections import defaultdict
myset = set(pattern)
#print ((myset))
datadict = dict.fromkeys(myset,[])
#print(datadict)
for i in range(df_final.shape[0]):
    datadict[(compute(df_final.iloc[i]['Source Word'], df_final.iloc[i]['Derived Word']))]=(datadict[(compute(df_final.iloc[i]['Source Word'], df_final.iloc[i]['Derived Word']))]+[([i]+df_final.iloc[i].tolist())])
print('finished')


finished


In [0]:
def cosine_similarity(x1,x2):
    return np.dot(x1,x2.T)/(np.linalg.norm(x1)*np.linalg.norm(x2))
embedding3 = embedding2 - embedding1 
i=0
G = lil_matrix((df_final.shape[0],df_final.shape[0]), dtype='float32')
for key, values in datadict.iteritems():
    for value in values:
        maxind1 =-1
        maxval1 = -10.0
        maxind2 = -2
        maxval2 = -11.0
        ind=value[0]
        valo=value
        for val in values:
            if val==value:
                continue
            tempind=val[0]
            cos = cosine_similarity(embedding3[ind],embedding3[tempind])
            if cos > maxval2:
                maxind2 = tempind
                maxval2 = cos
            if cos > maxval1:
                maxind2 = maxind1
                maxval2 = maxval1
                maxind1 = tempind
                maxval1 =cos
        i+=1
        print(i)
        if maxind1>-1 and maxind2>-1 :
            G[ind,maxind1] = maxval1
            G[ind,maxind2] = maxval2
            print(df_final.iloc[ind].tolist(),df_final.iloc[maxind1].tolist(),df_final.iloc[maxind2].tolist())
        elif maxind1>-1 and maxind2<0 :
            G[ind,maxind1] = maxval1
            print(df_final.iloc[ind].tolist(),df_final.iloc[maxind1].tolist())

1
2
3
4
5
(['ful', 'sapful', 'sap', ('', 'ful')], ['ful', 'treeful', 'tree', ('', 'ful')], ['ful', 'honeyful', 'honey', ('', 'ful')])
6
(['ful', 'disdainful', 'disdain', ('', 'ful')], ['ful', 'fearful', 'fear', ('', 'ful')], ['ful', 'remorseful', 'remorse', ('', 'ful')])
7
(['ful', 'basketful', 'basket', ('', 'ful')], ['ful', 'bucketful', 'bucket', ('', 'ful')], ['ful', 'bagful', 'bag', ('', 'ful')])
8
(['ful', 'kindful', 'kind', ('', 'ful')], ['ful', 'typeful', 'type', ('', 'ful')], ['ful', 'greatful', 'great', ('', 'ful')])
9
(['ful', 'balful', 'bal', ('', 'ful')], ['ful', 'koful', 'ko', ('', 'ful')], ['ful', 'batful', 'bat', ('', 'ful')])
10
(['ful', 'artful', 'art', ('', 'ful')], ['ful', 'healthful', 'health', ('', 'ful')], ['ful', 'designful', 'design', ('', 'ful')])
11
(['ful', 'diseaseful', 'disease', ('', 'ful')], ['ful', 'increaseful', 'increase', ('', 'ful')], ['ful', 'heartful', 'heart', ('', 'ful')])
12
(['ful', 'gameful', 'game', ('', 'ful')], ['ful', 'playful', 'play', ('

  


1309
(['o', 'kamio', 'kami', ('', 'o')], ['o', 'balato', 'balat', ('', 'o')], ['o', 'baiko', 'baik', ('', 'o')])
1310
(['o', 'Malio', 'Mali', ('', 'o')], ['o', 'Senegalo', 'Senegal', ('', 'o')], ['o', 'Sudano', 'Sudan', ('', 'o')])
1311
(['o', 'millo', 'mill', ('', 'o')], ['o', 'ballo', 'ball', ('', 'o')], ['o', 'cemento', 'cement', ('', 'o')])
1312
(['o', 'amethysto', 'amethyst', ('', 'o')], ['o', 'rubyo', 'ruby', ('', 'o')], ['o', 'violeto', 'violet', ('', 'o')])
1313
(['o', 'Uniono', 'Union', ('', 'o')], ['o', 'uniono', 'union', ('', 'o')], ['o', 'republico', 'republic', ('', 'o')])
1314
(['o', 'capitalismo', 'capitalism', ('', 'o')], ['o', 'socialismo', 'socialism', ('', 'o')], ['o', 'fascismo', 'fascism', ('', 'o')])
1315
(['o', 'Albaniano', 'Albanian', ('', 'o')], ['o', 'Slovako', 'Slovak', ('', 'o')], ['o', 'Greeko', 'Greek', ('', 'o')])
1316
(['o', 'coloro', 'color', ('', 'o')], ['o', 'traffico', 'traffic', ('', 'o')], ['o', 'automatico', 'automatic', ('', 'o')])
1317
(['o', 's

In [0]:
#from scipy import sparse
#from gbssl import LGC,HMN,PARW,MAD,OMNIProp,CAMLP


In [0]:
y =y_label[:3000]
x=np.arange(3000)

In [0]:
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

lambs=np.array([0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0])
max_iters=np.array([5,10,15,20,25,30])
#grid={'lamb':lamb,'max_iter':max_iter}
best_params={'lamb':lambs[0],'max_iter':max_iters[0],'f1_score':0,'precision':0,'recall':0}
max_f1_score=0
for lamb in lambs:
    for max_iter in max_iters:
        omni = OMNIProp(graph=G,lamb=lamb,max_iter=max_iter)
        omni.fit(x,y)
        y_pred_omni = omni.predict_proba(np.arange(df_final.shape[0]))
        y_pred1_omni = np.zeros(df_final.shape[0])
        y_pred1_omni = [1 if (y_pred_omni[i][1] > y_pred_omni[i][0]) else 0 for i in range(df_final.shape[0]) ]
        temp_f1_score=f1_score(y_label, y_pred1_omni, average='macro')
        print(temp_f1_score,lamb,max_iter)
        if(max_f1_score<temp_f1_score):
            max_f1_score=temp_f1_score
            best_params['lamb']=lamb
            best_params['max_iter']=max_iter
            best_params['f1_score']=max_f1_score
            best_params['precision']=precision_score(y_label, y_pred1_omni, average='macro')
            best_params['recall']=recall_score(y_label, y_pred1_omni, average='macro')
            print('update',temp_f1_score,lamb,max_iter)
print("omni :")
print("f1_score is ", best_params['f1_score'] )
print("precision is ", best_params['precision'] )
print("recall is  ", best_params['recall'] )
print("lamb is ", best_params['lamb'] )
print("max_iter is ", best_params['max_iter'] )

(0.8159075559131939, 0.1, 5)
('update', 0.8159075559131939, 0.1, 5)
(0.8192032891458332, 0.1, 10)
('update', 0.8192032891458332, 0.1, 10)
(0.8191932763723568, 0.1, 15)
(0.8189608230935326, 0.1, 20)
(0.8189847805433585, 0.1, 25)
(0.8188685006949856, 0.1, 30)
(0.8163031612284626, 0.2, 5)
(0.8184797705813516, 0.2, 10)
(0.8185618280344398, 0.2, 15)
(0.8184923794038652, 0.2, 20)
(0.8184923794038652, 0.2, 25)
(0.8184923794038652, 0.2, 30)
(0.8166986197010533, 0.3, 5)
(0.8177110761356526, 0.3, 10)
(0.8180948108733204, 0.3, 15)
(0.8180948108733204, 0.3, 20)
(0.8180948108733204, 0.3, 25)
(0.8180948108733204, 0.3, 30)
(0.8164072701519258, 0.4, 5)
(0.8176961716143443, 0.4, 10)
(0.8176267985238479, 0.4, 15)
(0.8176267985238479, 0.4, 20)
(0.8176267985238479, 0.4, 25)
(0.8176267985238479, 0.4, 30)
(0.8162092402402767, 0.5, 5)
(0.8170555721021115, 0.5, 10)
(0.8171602313858062, 0.5, 15)
(0.8171602313858062, 0.5, 20)
(0.8171602313858062, 0.5, 25)
(0.8171602313858062, 0.5, 30)
(0.8159997323921608, 0.6, 

In [0]:
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

beta_random=np.array([0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0])
max_iter_camlp=np.array([10,15,20,25,30,35])
#grid={'beta':beta,'max_iter':max_iter}
best_params={'beta':beta_random[0],'max_iter':max_iter_camlp[0],'f1_score':0,'precision':0,'recall':0}
max_f1_score=0
for beta in beta_random:
    for max_iter in max_iter_camlp:
        camlp = CAMLP(graph=G,beta=beta,max_iter=max_iter)
        camlp.fit(x,y)
        #omni = OMNIProp(graph=G,lamb=lamb,max_iter=max_iter)
        #omni.fit(x,y)
        y_pred_camlp = camlp.predict_proba(np.arange(df_final.shape[0]))
        y_pred1_camlp = np.zeros(df_final.shape[0])
        y_pred1_camlp = [1 if (y_pred_camlp[i][1] > y_pred_camlp[i][0]) else 0 for i in range(df_final.shape[0]) ]
        temp_f1_score=f1_score(y_label, y_pred1_camlp, average='macro')
        print(temp_f1_score,beta,max_iter)
        if(max_f1_score<temp_f1_score):
            max_f1_score=temp_f1_score
            best_params['beta']=beta
            best_params['max_iter']=max_iter
            best_params['f1_score']=max_f1_score
            best_params['precision']=precision_score(y_label, y_pred1_camlp, average='macro')
            best_params['recall']=recall_score(y_label, y_pred1_camlp, average='macro')
            print('update',temp_f1_score,beta,max_iter)
print("camlp :")
print("f1_score is ", best_params['f1_score'] )
print("precision is ", best_params['precision'] )
print("recall is  ", best_params['recall'] )
print("beta is ", best_params['beta'] )
print("max_iter is ", best_params['max_iter'] )

(0.80070252144882, 0.1, 10)
('update', 0.80070252144882, 0.1, 10)
(0.8010611440417095, 0.1, 15)
('update', 0.8010611440417095, 0.1, 15)
(0.80098821662219, 0.1, 20)
(0.80098821662219, 0.1, 25)
(0.80098821662219, 0.1, 30)
(0.80098821662219, 0.1, 35)
(0.8004245528410108, 0.2, 10)
(0.8009023976622889, 0.2, 15)
(0.8009023976622889, 0.2, 20)
(0.8009023976622889, 0.2, 25)
(0.8009023976622889, 0.2, 30)
(0.8009023976622889, 0.2, 35)
(0.8005572882510386, 0.3, 10)
(0.801034866151098, 0.3, 15)
(0.801034866151098, 0.3, 20)
(0.801034866151098, 0.3, 25)
(0.801034866151098, 0.3, 30)
(0.801034866151098, 0.3, 35)
(0.8010015165376931, 0.4, 10)
(0.8014187534835286, 0.4, 15)
('update', 0.8014187534835286, 0.4, 15)
(0.8014187534835286, 0.4, 20)
(0.8014187534835286, 0.4, 25)
(0.8014187534835286, 0.4, 30)
(0.8014187534835286, 0.4, 35)
(0.8013391882972596, 0.5, 10)
(0.8017695509367743, 0.5, 15)
('update', 0.8017695509367743, 0.5, 15)
(0.8017695509367743, 0.5, 20)
(0.8017695509367743, 0.5, 25)
(0.80176955093677

In [0]:
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

alpha_random=np.arange(0.0,1.0,0.01)
max_iter_lgc=np.array([2,5,10,15,20,25,30])
best_params={'alpha':alpha_random[0],'max_iter':max_iter_lgc[0],'f1_score':0,'precision':0,'recall':0}
max_f1_score=0
for alpha in alpha_random:
    for max_iter in max_iter_lgc:
        lgc = LGC(graph=G,alpha=alpha,max_iter=max_iter)
        lgc.fit(x,y)
        y_pred_lgc = lgc.predict_proba(np.arange(df_final.shape[0]))
        y_pred1_lgc = np.zeros(df_final.shape[0])
        y_pred1_lgc = [1 if (y_pred_lgc[i][1] > y_pred_lgc[i][0]) else 0 for i in range(df_final.shape[0]) ]
        temp_f1_score=f1_score(y_label, y_pred1_lgc, average='macro')
        print(temp_f1_score,alpha,max_iter)
        if(max_f1_score<temp_f1_score):
            max_f1_score=temp_f1_score
            best_params['alpha']=alpha
            best_params['max_iter']=max_iter
            best_params['f1_score']=max_f1_score
            best_params['precision']=precision_score(y_label, y_pred1_lgc, average='macro')
            best_params['recall']=recall_score(y_label, y_pred1_lgc, average='macro')
            print('update',temp_f1_score,alpha,max_iter)
print("lgc :")
print("f1_score is ", best_params['f1_score'] )
print("precision is ", best_params['precision'] )
print("recall is  ", best_params['recall'] )
print("alpha is ", best_params['alpha'] )
print("max_iter is ", best_params['max_iter'] )

(0.5312524536937486, 0.0, 2)
('update', 0.5312524536937486, 0.0, 2)
(0.5312524536937486, 0.0, 5)
(0.5312524536937486, 0.0, 10)
(0.5312524536937486, 0.0, 15)
(0.5312524536937486, 0.0, 20)
(0.5312524536937486, 0.0, 25)
(0.5312524536937486, 0.0, 30)
(0.8007416781297663, 0.01, 2)
('update', 0.8007416781297663, 0.01, 2)
(0.8007416781297663, 0.01, 5)
(0.8007416781297663, 0.01, 10)
(0.8007416781297663, 0.01, 15)
(0.8007416781297663, 0.01, 20)
(0.8007416781297663, 0.01, 25)
(0.8007416781297663, 0.01, 30)
(0.8008014075394297, 0.02, 2)
('update', 0.8008014075394297, 0.02, 2)
(0.8008014075394297, 0.02, 5)
(0.8008014075394297, 0.02, 10)
(0.8008014075394297, 0.02, 15)
(0.8008014075394297, 0.02, 20)
(0.8008014075394297, 0.02, 25)
(0.8008014075394297, 0.02, 30)
(0.8007416781297663, 0.03, 2)
(0.8007416781297663, 0.03, 5)
(0.8007416781297663, 0.03, 10)
(0.8007416781297663, 0.03, 15)
(0.8007416781297663, 0.03, 20)
(0.8007416781297663, 0.03, 25)
(0.8007416781297663, 0.03, 30)
(0.8007818241057056, 0.04, 2

In [0]:
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

arr=[]
b=[]
for x in [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]:
    for y in [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]:
        for z in [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]:
            b=[x,y,z]
            arr.append(b)
max_iter_mad=np.array([20,30,40,50])
best_params={'mu':arr[0],'max_iter':max_iter_mad[0],'f1_score':0,'precision':0,'recall':0}
max_f1_score=0
for mu in arr:
    for max_iter in max_iter_mad:
        mad = MAD(graph=G,mu=mu,max_iter=max_iter)
        mad.fit(x,y)
        y_pred_mad = mad.predict_proba(np.arange(df_final.shape[0]))
        y_pred1_mad = np.zeros(df_final.shape[0])
        y_pred1_mad = [1 if (y_pred_mad[i][1] > y_pred_mad[i][0]) else 0 for i in range(df_final.shape[0]) ]
        temp_f1_score=f1_score(y_label, y_pred1_mad, average='macro')
        print(temp_f1_score,mu,max_iter)
        if(max_f1_score<temp_f1_score):
            max_f1_score=temp_f1_score
            best_params['mu']=mu
            best_params['max_iter']=max_iter
            best_params['f1_score']=max_f1_score
            best_params['precision']=precision_score(y_label, y_pred1_lgc, average='macro')
            best_params['recall']=recall_score(y_label, y_pred1_lgc, average='macro')
            print('update',temp_f1_score,mu,max_iter)
print("mad :(when tuning mu) ")
print("f1_score is ", best_params['f1_score'] )
print("precision is ", best_params['precision'] )
print("recall is  ", best_params['recall'] )
print("mu is ", best_params['mu'] )
print("max_iter is ", best_params['max_iter'] )

  logP.data = np.log(logP.data)


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [0]:
beta_mad=np.array([1,2,3,4,5])
#print((beta_mad))
max_iter_mad=np.array([20,30,40,50])
best_params={'beta':beta_mad[0],'max_iter':max_iter_mad[0],'f1_score':0,'precision':0,'recall':0}
max_f1_score=0
for beta in beta_mad:
    for max_iter in max_iter_mad:
        mad = MAD(graph=G,beta=beta,max_iter=max_iter)
        mad.fit(x,y)
        y_pred_mad = mad.predict_proba(np.arange(df_final.shape[0]))
        y_pred1_mad = np.zeros(df_final.shape[0])
        y_pred1_mad= [1 if (y_pred_mad[i][1] > y_pred_mad[i][0]) else 0 for i in range(df_final.shape[0]) ]
        temp_f1_score=f1_score(y_label, y_pred1_mad, average='macro')
        print(temp_f1_score,beta,max_iter)
        if(max_f1_score<temp_f1_score):
            max_f1_score=temp_f1_score
            best_params['beta']=beta
            best_params['max_iter']=max_iter
            best_params['f1_score']=max_f1_score
            best_params['precision']=precision_score(y_label, y_pred1_mad, average='macro')
            best_params['recall']=recall_score(y_label, y_pred1_mad, average='macro')
            print('update',temp_f1_score,beta,max_iter)
print("mad :(when tuning beta) ")
print("f1_score is ", best_params['f1_score'] )
print("precision is ", best_params['precision'] )
print("recall is  ", best_params['recall'] )
print("beta is ", best_params['beta'] )
print("max_iter is ", best_params['max_iter'] )

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices