# Data TP

## Import librairies and databases

In [1]:
import pandas
import numpy

In [2]:
dataset = pandas.read_csv('.\\google\\data\\train.txt', delimiter=" ", names=["heads","tails","relation"])

In [3]:
dataset.describe()

Unnamed: 0,heads,tails,relation
count,573,573,573
unique,474,506,14
top,slow,california,capital-world
freq,5,12,116


In [4]:
dataset.shape

(573, 3)

In [5]:
dataset.head()

Unnamed: 0,heads,tails,relation
0,madrid,spain,capital-common-countries
1,ottawa,canada,capital-common-countries
2,cairo,egypt,capital-common-countries
3,paris,france,capital-common-countries
4,bern,switzerland,capital-common-countries


## Load Vectors

In [6]:
def load_vectors(file_path):
    resultDict = dict()
    with open(file_path) as f:
        raw_text = f.read()
        for raw_line in raw_text.split("\n")[:-1]:
            ind = raw_line.index(" ")
            resultDict[raw_line[:ind]] = numpy.array(raw_line[ind+1:].split(" "), dtype=numpy.float64)
        
    return resultDict

def formatPattern(algo, columns_value, file_name):
    return '.\\google\\{}\\subspaces\\{}\\{}.txt'.format(algo, columns_value, file_name)

In [7]:
vectors = {"heads": dict(), "tails": dict()}

for file_name in vectors.keys():
    for columns_value in set(dataset['relation'].values):
        file_dir = formatPattern('gloveCC', columns_value, file_name)
        vectors[file_name][columns_value] = load_vectors(file_dir)


## Replace values in dataframe

In [8]:
def minus_of_two_vector(v1, v2):
    return v2 - v1

def replace_values(dataframe, vectorsDict):
    resultdf = dataframe.to_dict()
    resultdf['tail-head'] = dict()
    to_remove = list()
    for indice in range(len(resultdf['relation'])):
        head = resultdf['heads'][indice]
        tail = resultdf['tails'][indice]
        relation = resultdf['relation'][indice]
        
        try:
            vh = vectorsDict['heads'][relation][head]
            resultdf['heads'][indice] = vh
            
            try:
                vt = vectorsDict['tails'][relation][tail]
                resultdf['tails'][indice] = vt
                
                resultdf['tail-head'][indice]  = minus_of_two_vector(vt, vh)
            except:
                print("no vector for (relation:%s, tail:%s) " % (relation, tail))
                to_remove.append(indice)
        except:
            print("no vector for (relation:%s, head:%s) " % (relation, head))
            to_remove.append(indice)
    
    for indice in to_remove:
        del(resultdf['heads'][indice])
        del(resultdf['tails'][indice])
        del(resultdf['relation'][indice])
        
    return resultdf


In [9]:
dataset2 = replace_values(dataset, vectors)

no vector for (relation:capital-world, head:funafuti) 
no vector for (relation:capital-world, head:ashgabat) 
no vector for (relation:capital-world, head:belmopan) 
no vector for (relation:capital-world, head:thimphu) 
no vector for (relation:capital-world, head:niamey) 
no vector for (relation:capital-world, head:nouakchott) 
no vector for (relation:capital-world, head:nuuk) 
no vector for (relation:capital-world, head:podgorica) 
no vector for (relation:capital-world, head:vaduz) 


In [10]:
dataset2.keys()

['tail-head', 'tails', 'heads', 'relation']

In [11]:
dataset2pd = pandas.DataFrame({
    'tails': dataset2['tails'].values(),
    'heads': dataset2['heads'].values(),
    'tail-head': dataset2['tail-head'].values(),
    'relation': dataset2['relation'].values()
})

In [12]:
dataset2pd.shape

(564, 4)

In [13]:
dataset2pd.head()

Unnamed: 0,heads,relation,tail-head,tails
0,"[0.26001, -0.53344, 0.22119, -0.16826, 0.58323...",capital-common-countries,"[0.37982000000000005, -0.544826, 0.07153999999...","[-0.11981, 0.011386, 0.14965, -0.22285, 0.7112..."
1,"[0.14219, 0.01274, -0.5254, -0.60931, 1.263, 0...",capital-common-countries,"[0.62044, -0.14634, -0.25031, 0.03849000000000...","[-0.47825, 0.15908, -0.27509, -0.6478, 0.76022..."
2,"[0.56629, -0.20355, -0.093943, -0.45198, 0.301...",capital-common-countries,"[0.6181589999999999, 0.061919999999999975, -0....","[-0.051869, -0.26547, 0.034687, -0.45404, 0.26..."
3,"[0.35213, -0.074228, -0.23725, -0.32726, 0.539...",capital-common-countries,"[0.51519, -0.527148, -0.09086999999999998, 0.3...","[-0.16306, 0.45292, -0.14638, -0.64332, 0.7901..."
4,"[0.57031, 0.077812, 0.062999, -0.23876, 0.1227...",capital-common-countries,"[0.45715, -0.088938, 0.34333899999999995, -0.0...","[0.11316, 0.16675, -0.28034, -0.18965, 0.61872..."


## Compute Sum of substraction of heads_i and tails_i for each relation_j

In [14]:
def sum_of_vectors(th, rel):
    tmp_vector = numpy.zeros(300, dtype=numpy.float64)
    tmp_relation = "capital-common-countries"
    resultDict = dict()
    
    for indice in range(len(rel)):
        if tmp_relation != rel[indice]:
            resultDict[tmp_relation] = tmp_vector
            tmp_vector = numpy.zeros(300, dtype=numpy.float64)
            tmp_relation = rel[indice]
        
        tmp_vector += th[indice]
    
    return resultDict
        

In [15]:
sigmoid_vectors_dict = sum_of_vectors(dataset2pd["tail-head"], dataset2pd["relation"])
sigmoid_vectors = pandas.DataFrame({
    "relation": sigmoid_vectors_dict.keys(),
    "tail-head": sigmoid_vectors_dict.values()
})

sigmoid_vectors

Unnamed: 0,relation,tail-head
0,gram3-comparative,"[3.8964215, 9.063753200000003, -3.39187844, 8...."
1,gram8-plural,"[-0.2853519999999997, 2.744684, 7.056154499999..."
2,capital-common-countries,"[14.132558, -5.571179, -1.0889667000000005, -0..."
3,city-in-state,"[23.971326700000006, -20.49283730000001, 3.294..."
4,family,"[-2.0846617999999997, -1.9598501999999993, 0.6..."
5,gram2-opposite,"[5.606883399999999, 3.4610384000000005, -10.07..."
6,currency,"[-1.0740174000000002, 11.681611100000001, -10...."
7,gram4-superlative,"[-0.5581699999999998, 7.434804200000002, -9.40..."
8,gram6-nationality-adjective,"[1.4133598999999997, 7.619784900000003, 5.9648..."
9,gram7-past-tense,"[-1.2233522, 1.5909859000000008, -9.7138505000..."


## Split Dataset into Xset and Yset

In [28]:
xset = sigmoid_vectors.iloc[:,[1]]
yset = sigmoid_vectors.iloc[:,[0]]

In [29]:
xset.head()

Unnamed: 0,tail-head
0,"[3.8964215, 9.063753200000003, -3.39187844, 8...."
1,"[-0.2853519999999997, 2.744684, 7.056154499999..."
2,"[14.132558, -5.571179, -1.0889667000000005, -0..."
3,"[23.971326700000006, -20.49283730000001, 3.294..."
4,"[-2.0846617999999997, -1.9598501999999993, 0.6..."


In [30]:
yset.head()

Unnamed: 0,relation
0,gram3-comparative
1,gram8-plural
2,capital-common-countries
3,city-in-state
4,family


## Use sigmoid function to awser the membership of one RDF tuple

In [31]:
def is_sigmoid(rfd_array, vectors_dict, rvector):
    
    try:
        rv = vectors_dict["heads"][rfd_array[2]][rfd_array[0]]
    except: 
        print("there is no vector equivalance for head %s: %s" % (rfd_array[0], rfd_array[2]))
        return 0

    try:
        fv = vectors_dict["tails"][rfd_array[2]][rfd_array[1]]
    except: 
        print("there is no vector equivalance for tail %s: %s" % (rfd_array[1], rfd_array[2]))
        return 0
    
    return (rv + rvector) * fv

### Some checks

In [32]:
print(xset.shape)
print(yset.shape)
print("there is exactly the same number of rows ? %r" % (yset.shape[0] == xset.shape[0]))

xsetLen = set()
for line in xset.iterrows():
    xsetLen.add(len(line[1]['tail-head']))
    
print("All rows contains vectors of 300 dim ?: %r" % (xsetLen == set([300])))


(13, 1)
(13, 1)
there is exactly the same number of rows ? True
All rows contains vectors of 300 dim ?: True


## Transform numpy array into dataframe row

In [46]:
def nparray_to_pandasrow(rows_list):
    xset = pandas.DataFrame(columns=range(300))
    
    for row in rows_list:
        xset.loc[len(xset)] = row[0]

    return xset

In [52]:
xset_formated = nparray_to_pandasrow(xset.values.tolist())

In [53]:
xset_formated.shape

(13, 300)

In [54]:
xset_formated.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,3.896422,9.063753,-3.391878,8.99829,4.755328,9.429323,-6.520929,-4.921704,4.22101,11.22974,...,4.62391,8.694409,-3.506263,-3.933668,-3.172144,1.785734,10.161419,5.556124,-9.589117,-7.443369
1,-0.285352,2.744684,7.056154,9.485303,5.100366,0.080983,3.867823,-6.730273,2.143074,-3.955902,...,4.099639,9.810066,0.222105,1.452095,6.597358,-4.978817,-1.703652,6.796394,0.063209,5.433376


## 10 Fold Cross Validation, SVM & PRFS

In [66]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.cross_validation import KFold
from sklearn.svm import SVC

def get_mean_f1_from_svm(svm, xtest, ytest):
    return numpy.mean(precision_recall_fscore_support(ytest, svm.predict(xtest))[2])

def get_svm_class(xtrain,ytrain,c=1):
    return SVC(C=c).fit(xtrain,ytrain)

def get_for_svc_c(xtrain,ytrain,xtest,ytest,C=[0.1,1,10,100]):
    return [get_mean_f1_from_svm(get_svm_class(xtrain,ytrain,c),xtest,ytest) for c in C]

def split_and_get_mean_f1(xset,yset,n_folds=10,C=1):
    resultSet = list()
    kf = KFold(n=len(yset['relation']), n_folds=n_folds, shuffle=True, random_state=None)
    for train_index, test_index in kf:
        xtrain, xtest = xset.iloc[train_index], xset.iloc[test_index]
        ytrain, ytest = yset.iloc[train_index], yset.iloc[test_index]
        resultSet.append(get_for_svc_c(xtrain,ytrain,xtest,ytest))
    return resultSet

In [68]:
f1_array = split_and_get_mean_f1(xset_formated,yset)