In [1]:
#Importing library
import warnings
warnings.filterwarnings("ignore")

import csv
import pandas as pd
import datetime 
import time 
import numpy as np
import matplotlib
import matplotlib.pylab as plt
import seaborn as sns
from matplotlib import rcParams
from sklearn.cluster import MiniBatchKMeans, KMeans
import math
import pickle
import os
import xgboost as xgb
import warnings
import networkx as nx
import pdb
import pickle
from pandas import HDFStore,DataFrame
from pandas import read_hdf
from scipy.sparse.linalg import svds, eigs
import gc
from tqdm import tqdm

<h3>1. Reading Data</h3>


In [2]:
# Load the directed training graph from the positive-edge CSV produced by the EDA step.
if os.path.isfile('data/after_eda/train_pos_after_eda.csv'):
    train_graph=nx.read_edgelist('data/after_eda/train_pos_after_eda.csv', delimiter=',',create_using=nx.DiGraph(), nodetype=int)
    # nx.info() was removed in NetworkX 3.0; build the same one-line summary
    # from methods that exist in every NetworkX release.
    print("{} with {} nodes and {} edges".format(
        type(train_graph).__name__,
        train_graph.number_of_nodes(),
        train_graph.number_of_edges()))
else:
    print("File is not stored in folder, perform EDA and store it in folder.")

DiGraph with 1780722 nodes and 7550015 edges


<h3>2. Similarity Measures</h3>


<h4>2.1. Jaccard Similarity </h4>
\begin{equation}
jaccard = \frac{|X\cap Y|}{|X \cup Y|} 
\end{equation}

In [17]:
#For followees
def jaccard_for_followees(a,b,graph=None):
    """Jaccard similarity between the followee (successor) sets of two nodes.

    Parameters
    ----------
    a, b : node identifiers.
    graph : optional directed graph exposing ``has_node`` and ``successors``;
        defaults to the notebook-level ``train_graph``.

    Returns
    -------
    ``|X & Y| / |X | Y|`` as a float, or ``0`` when either node is missing
    from the graph or follows nobody.
    """
    if graph is None:
        graph=train_graph
    # A missing node used to be swallowed by a bare ``except``; check explicitly.
    if not (graph.has_node(a) and graph.has_node(b)):
        return 0
    followees_a=set(graph.successors(a))
    followees_b=set(graph.successors(b))
    # The original ``len(A)==0 | len(B)`` parsed as ``len(A) == (0 | len(B))``,
    # i.e. it returned 0 whenever both sets had the same size.
    if len(followees_a)==0 or len(followees_b)==0:
        return 0
    return len(followees_a & followees_b)/len(followees_a | followees_b)

In [18]:
# Spot-check the followee Jaccard score on two node pairs.
for src, dst in [(273084,1505602), (1,255)]:
    print(jaccard_for_followees(src,dst))

0.0
0.0


In [19]:
#for followers
def jaccard_for_followers(a,b,graph=None):
    """Jaccard similarity between the follower (predecessor) sets of two nodes.

    Parameters
    ----------
    a, b : node identifiers.
    graph : optional directed graph exposing ``has_node`` and ``predecessors``;
        defaults to the notebook-level ``train_graph``.

    Returns
    -------
    ``|X & Y| / |X | Y|`` as a float, or ``0`` when either node is missing
    from the graph or has no followers.
    """
    if graph is None:
        graph=train_graph
    # A missing node used to be swallowed by a bare ``except``; check explicitly.
    if not (graph.has_node(a) and graph.has_node(b)):
        return 0
    followers_a=set(graph.predecessors(a))
    followers_b=set(graph.predecessors(b))
    # The original ``len(A)==0 | len(B)`` parsed as ``len(A) == (0 | len(B))``,
    # i.e. it returned 0 whenever both sets had the same size.
    if len(followers_a)==0 or len(followers_b)==0:
        return 0
    return len(followers_a & followers_b)/len(followers_a | followers_b)


In [20]:
# Spot-check the follower Jaccard score on two node pairs.
for src, dst in [(273084,1505602), (253084,1605602)]:
    print(jaccard_for_followers(src,dst))

0.0
0.0


<h4>2.2. Cosine Similarity (Otsuka-Ochiai Coefficient) </h4>
\begin{equation}
CosineSimilarity =\frac{|X\cap Y|}{\sqrt{|X|\cdot|Y|}} 
\end{equation}

In [21]:
#For followees
def cosine_for_followees(a,b,graph=None):
    """Set-based cosine similarity between the followee sets of two nodes.

    Parameters
    ----------
    a, b : node identifiers.
    graph : optional directed graph exposing ``has_node`` and ``successors``;
        defaults to the notebook-level ``train_graph``.

    Returns
    -------
    ``|X & Y| / sqrt(|X| * |Y|)`` as a float, or ``0`` when either node is
    missing from the graph or follows nobody.
    """
    if graph is None:
        graph=train_graph
    # A missing node used to be swallowed by a bare ``except``; check explicitly.
    if not (graph.has_node(a) and graph.has_node(b)):
        return 0
    followees_a=set(graph.successors(a))
    followees_b=set(graph.successors(b))
    # The original ``len(A)==0 | len(B)`` parsed as ``len(A) == (0 | len(B))``,
    # i.e. it returned 0 whenever both sets had the same size.
    if len(followees_a)==0 or len(followees_b)==0:
        return 0
    return len(followees_a & followees_b)/math.sqrt(len(followees_a)*len(followees_b))

In [24]:
# Spot-check the followee cosine score on three node pairs.
for src, dst in [(273084,1505602), (273084,1635354), (1,255)]:
    print(cosine_for_followees(src,dst))

0.0
0
0.0


In [27]:
#For followers
def cosine_for_followers(a,b,graph=None):
    """Set-based cosine similarity between the follower sets of two nodes.

    Parameters
    ----------
    a, b : node identifiers.
    graph : optional directed graph exposing ``has_node`` and ``predecessors``;
        defaults to the notebook-level ``train_graph``.

    Returns
    -------
    ``|X & Y| / sqrt(|X| * |Y|)`` as a float, or ``0`` when either node is
    missing from the graph or has no followers.
    """
    if graph is None:
        graph=train_graph
    # A missing node used to be swallowed by a bare ``except``; check explicitly.
    if not (graph.has_node(a) and graph.has_node(b)):
        return 0
    followers_a=set(graph.predecessors(a))
    followers_b=set(graph.predecessors(b))
    # The original ``len(A)==0 | len(B)`` parsed as ``len(A) == (0 | len(B))``,
    # i.e. it returned 0 whenever both sets had the same size.
    if len(followers_a)==0 or len(followers_b)==0:
        return 0
    return len(followers_a & followers_b)/math.sqrt(len(followers_a)*len(followers_b))

In [31]:
# Spot-check the follower cosine score on three node pairs.
for src, dst in [(273084,1505602), (273084,1635354), (2,25)]:
    print(cosine_for_followers(src,dst))

0.0
0
0.0


In [33]:
# Two more follower cosine spot-checks, both starting from node 2.
for dst in (470294, 470264):
    print(cosine_for_followers(2,dst))

0.12909944487358055
0


<h3>3. Ranking Measure</h3>

PageRank computes a ranking of the nodes in the graph G based on the structure of the incoming links.

<h4>3.1. Page Ranking</h4>

In [34]:
# Compute PageRank once and cache it on disk; later runs reload the pickle.
if not os.path.isfile('data/fea_sample/page_rank.p'):
    pr=nx.pagerank(train_graph, alpha=0.85)
    # Context managers make sure the file is flushed and closed.
    with open('data/fea_sample/page_rank.p', 'wb') as cache_file:
        pickle.dump(pr, cache_file)
else:
    # NOTE(review): pickle.load can execute arbitrary code; only load caches
    # that were produced by this notebook.
    with open('data/fea_sample/page_rank.p', 'rb') as cache_file:
        pr=pickle.load(cache_file)

In [36]:
# Summary statistics of the PageRank scores; mean_pr is reused later as the
# default score for nodes missing from `pr`.
mean_pr=float(sum(pr.values())/len(pr))
print('min: ', min(pr.values()))
print('max: ', max(pr.values()))
print('mean: ', mean_pr)
print(mean_pr)

min:  1.6556497245737814e-07
max:  2.7098251341935827e-05
mean:  5.615699699389075e-07
5.615699699389075e-07


<h3>4. Other Graph Features</h3>

<h4>4.1. Shortest Path</h4>
Get the shortest path between two nodes. If the nodes are directly connected by an edge, we remove that edge first and then compute the shortest path.

In [37]:
def compute_shortest_path_length(a,b):
    """Length of the shortest directed path from ``a`` to ``b``, ignoring a direct edge.

    If ``train_graph`` contains the edge ``a -> b`` it is removed for the
    duration of the search and always put back afterwards.

    Returns
    -------
    The path length, or ``-1`` when no path exists or a node is not in the
    graph.
    """
    edge_removed=False
    try:
        if train_graph.has_edge(a,b):
            train_graph.remove_edge(a,b)
            edge_removed=True
        return nx.shortest_path_length(train_graph, source=a, target=b)
    except nx.NetworkXException:
        # Covers NetworkXNoPath, NodeNotFound and NetworkXError.
        return -1
    finally:
        # The original skipped this when the search raised, which permanently
        # deleted the edge from the shared graph.
        if edge_removed:
            train_graph.add_edge(a,b)
            

In [40]:
# Spot-check the shortest-path feature on three node pairs.
for src, dst in [(77697,826021), (669354,1635354), (1,1189)]:
    print(compute_shortest_path_length(src, dst))

10
-1
8


<h4>4.2. Checking for Same Community</h4>

In [41]:
#Getting the weakly connected components of the graph, materialised as a list
#so it can be scanned repeatedly by belongs_to_same_wcc
wcc=list(nx.weakly_connected_components(train_graph))


In [42]:
def belongs_to_same_wcc(a,b):
    """Return 1 if ``a`` and ``b`` count as being in the same community, else 0.

    Rules, in order:
    * ``b -> a`` exists: 1.
    * ``b`` is not in the weakly connected component (from ``wcc``) holding ``a``: 0.
    * ``a -> b`` does not exist: 1 (same component).
    * ``a -> b`` exists: 1 only if ``b`` is still reachable from ``a`` after
      temporarily removing that edge.
    """
    if train_graph.has_edge(b,a):
        return 1
    # First component containing `a`; empty tuple when `a` is in none of them.
    component=next((nodes for nodes in wcc if a in nodes), ())
    if b not in component:
        return 0
    if not train_graph.has_edge(a,b):
        return 1
    train_graph.remove_edge(a,b)
    try:
        still_connected=compute_shortest_path_length(a,b)!=-1
    finally:
        # Always restore the edge, even if the path search is interrupted.
        train_graph.add_edge(a,b)
    return 1 if still_connected else 0
        
                
            
    

In [43]:
# Spot-check the same-community feature on three node pairs.
for src, dst in [(861,1659750), (669354,1659750), (669354,1635354)]:
    print(belongs_to_same_wcc(src, dst))

0
0
0


<h4>4.3. Adamic/Adar Index. </h4>
The Adamic/Adar index is defined as the sum, over the common neighbors of two given vertices, of the inverse logarithm of each common neighbor's degree.
$$Adar(x,y)=\sum_{u \in N(x) \cap N(y)}\frac{1}{log(|N(u)|)}$$

In [53]:
def calc_adar_in(a,b,graph=None):
    """Adamic/Adar index over the common followees of ``a`` and ``b``.

    For every node followed by both ``a`` and ``b`` this adds
    ``1 / log10(number of followers of that node)``.

    Parameters
    ----------
    a, b : node identifiers.
    graph : optional directed graph exposing ``has_node``, ``successors`` and
        ``predecessors``; defaults to the notebook-level ``train_graph``.

    Returns
    -------
    The index, or ``0`` when a node is missing or there is no common followee.
    """
    if graph is None:
        graph=train_graph
    # A missing node used to be swallowed by a bare ``except``; check explicitly.
    if not (graph.has_node(a) and graph.has_node(b)):
        return 0
    common_followees=list(set(graph.successors(a)).intersection(set(graph.successors(b))))
    if len(common_followees)==0:
        return 0
    adar=0
    for node in common_followees:
        n_followers=len(list(graph.predecessors(node)))
        # log10(1) == 0 would make the term infinite (possible when a == b),
        # so such nodes contribute nothing.
        if n_followers>1:
            adar=adar+(1/np.log10(n_followers))
    return adar
    

In [54]:
# Spot-check the Adamic/Adar feature on two node pairs.
for src, dst in [(1,189226), (669354,1635354)]:
    print(calc_adar_in(src,dst))

0
0


<h4>4.4. Does the Person Follow Back</h4>

In [55]:
def follows_back(a,b):
    """Return 1 when train_graph has the reverse edge b -> a, otherwise 0."""
    return 1 if train_graph.has_edge(b,a) else 0

In [56]:
# Spot-check the follow-back feature on three node pairs.
for src, dst in [(1,189226), (669354,1635354), (189,18926)]:
    print(follows_back(src,dst))

1
0
0


<h4>4.5.Katz Centrality </h4>
Katz centrality computes the centrality of a node based on the centrality of its neighbors. It is a generalization of eigenvector centrality. The Katz centrality for node i is:
$$x_i = \alpha \sum_{j} A_{ij} x_j + \beta,$$
where `A` is the adjacency matrix of the graph G 
with eigenvalues $\lambda$. The parameter $\beta$ controls the initial centrality and $\alpha < \frac{1}{\lambda_{max}}.$

In [58]:
# Compute Katz centrality once and cache it on disk; later runs reload the pickle.
if not os.path.isfile('data/fea_sample/katz.p'):
    # Use the documented top-level entry point instead of reaching through the
    # `nx.katz` submodule attribute.
    katz=nx.katz_centrality(train_graph, alpha=0.005, beta=1)
    with open('data/fea_sample/katz.p','wb') as cache_file:
        pickle.dump(katz, cache_file)
else:
    # NOTE(review): pickle.load can execute arbitrary code; only load caches
    # that were produced by this notebook.
    with open('data/fea_sample/katz.p','rb') as cache_file:
        katz=pickle.load(cache_file)

In [59]:
# Summary statistics of the Katz scores; mean_katz is reused later as the
# default score for nodes missing from `katz`.
mean_katz=float(sum(katz.values())/len(katz))
print('min: ', min(katz.values()))
print('max: ', max(katz.values()))
print('mean: ', mean_katz)
print(mean_katz)

min:  0.0007313532484065916
max:  0.003394554981699122
mean:  0.0007483800935562018
0.0007483800935562018


<h4>4.6. HITS Score </h4>
This algorithm computes two numbers for each node. The authority score estimates the node's value based on its incoming links; the hub score estimates the node's value based on its outgoing links.

In [60]:
# Compute HITS once and cache it on disk; later runs reload the pickle.
# `hits` is the pair returned by nx.hits: index 0 holds the hub scores and
# index 1 the authority scores.
if not os.path.isfile('data/fea_sample/hits.p'):
    hits=nx.hits(train_graph, max_iter=100, tol=1e-08, nstart=None, normalized=True)
    with open('data/fea_sample/hits.p','wb') as cache_file:
        pickle.dump(hits, cache_file)
else:
    # NOTE(review): pickle.load can execute arbitrary code; only load caches
    # that were produced by this notebook.
    with open('data/fea_sample/hits.p','rb') as cache_file:
        hits=pickle.load(cache_file)

In [62]:
# Summary statistics of the first HITS score dictionary (hits[0]).
mean_hits=float(sum(hits[0].values())/len(hits[0]))
print('min: ', min(hits[0].values()))
print('max: ', max(hits[0].values()))
print('mean: ', mean_hits)
print(mean_hits)

min:  0.0
max:  0.004868653378780953
mean:  5.615699699344123e-07
5.615699699344123e-07


<h3>5. Featurizations</h3>

<h4>5.1. Reading a Sample of Data from Both Train and Test </h4>

In [63]:
# The import and the `if` were fused onto one line ("import randomif ..."),
# which is a SyntaxError; they are separate statements.
import random

# Choose which line numbers of the train file to skip so that only `s` of the
# `n_train` sampled line numbers remain when the file is read with
# pd.read_csv(skiprows=skip_train).
# NOTE(review): no random seed is set, so the sample differs between runs.
if os.path.isfile('data/after_eda/train_after_eda.csv'):
    file_name="data/after_eda/train_after_eda.csv"
    n_train=15100028
    s=100000
    skip_train=sorted(random.sample(range(1,n_train+1),n_train-s))


In [66]:
# Choose which line numbers of the test file to skip so that only `s` of the
# `n_test` sampled line numbers remain.
if os.path.isfile('data/after_eda/test_after_eda.csv'):
    file_name="data/after_eda/test_after_eda.csv"
    n_test, s = 3775006, 50000
    skip_test=random.sample(range(1,n_test+1),n_test-s)
    skip_test.sort()

In [67]:
# Report the hard-coded file sizes and how many rows will be skipped when
# sampling the train and test files.
print("Number of rows in the train data file: ", n_train)
print("Number of rows we are going to eliminate/skip in train data are: ", len(skip_train))
print("Number of rows in the test data file: ", n_test)
print("Number of rows we are going to eliminate/skip in test data are: ", len(skip_test))

Number of rows in the train data file:  15100028
Number of rows we are going to eliminate/skip in train data are:  15000028
Number of rows in the test data file:  3775006
Number of rows we are going to eliminate/skip in test data are:  3725006


In [69]:
# Read the sampled train pairs and attach their labels; both files are read
# with the same skip list.
df_final_train=pd.read_csv(
    'data/after_eda/train_after_eda.csv',
    names=['source_node','destination_node'],
    skiprows=skip_train,
)
df_final_train['indicator_link']=pd.read_csv(
    'data/train_y.csv',
    names=['indicator_link'],
    skiprows=skip_train,
)
print("Our train matrix size is: ",df_final_train.shape)
df_final_train.head(2)


Our train matrix size is:  (100002, 3)


Unnamed: 0,source_node,destination_node,indicator_link
0,273084,1505602,1
1,34377,1540570,1


In [70]:
# Read the sampled test pairs and attach their labels; both files are read
# with the same skip list.
df_final_test=pd.read_csv('data/after_eda/test_after_eda.csv',skiprows=skip_test, names=['source_node','destination_node'])
df_final_test['indicator_link']=pd.read_csv('data/test_y.csv',skiprows=skip_test,names=['indicator_link'])
# The message previously said "train" although this is the test frame.
print("Our test matrix size is: ",df_final_test.shape)
df_final_test.head(2)

Our train matrix size is:  (50002, 3)


Unnamed: 0,source_node,destination_node,indicator_link
0,848424,784690,1
1,109806,1262519,1


<h4>5.2. Adding a set of Features</h4>
Creating following features for train and test data:
<ul>
    <li>jaccard_followers</li>
    <li>jaccard_followees</li>
    <li>cosine_followers</li>
    <li>cosine_followees</li>
    <li>num_followers_s</li>
    <li>num_followees_s</li>
    <li>num_followers_d</li>
    <li>num_followees_d</li>
    <li>inter_followers</li>
    <li>inter_followees</li>
</ul>

In [71]:
# Add the four similarity columns to both frames unless the stage-1 cache
# already exists (in which case a later cell loads it).
if not os.path.isfile('data/fea_sample/storage_sample_stage1.h5'):
    for feature_name, similarity in (
        ('jaccard_followers', jaccard_for_followers),
        ('jaccard_followees', jaccard_for_followees),
        ('cosine_followers', cosine_for_followers),
        ('cosine_followees', cosine_for_followees),
    ):
        for frame in (df_final_train, df_final_test):
            frame[feature_name]=frame.apply(
                lambda row: similarity(row['source_node'],row['destination_node']),axis=1)

In [72]:
def _follower_followee_sets(graph, node):
    """Return (followers, followees) of ``node``; two empty sets if it is not in ``graph``."""
    if not graph.has_node(node):
        return set(), set()
    return set(graph.predecessors(node)), set(graph.successors(node))


def compute_features_stage1(df_final, graph=None):
    """Per-row follower/followee counts and overlaps for source/destination pairs.

    Parameters
    ----------
    df_final : DataFrame with ``source_node`` and ``destination_node`` columns.
    graph : optional directed graph exposing ``has_node``, ``predecessors`` and
        ``successors``; defaults to the notebook-level ``train_graph``.

    Returns
    -------
    Six lists, one entry per row of ``df_final``, in this order:
    ``num_followers_s, num_followees_s, num_followers_d, num_followees_d,
    inter_followers, inter_followees``.
    """
    if graph is None:
        graph=train_graph
    num_followers_s=[]
    num_followees_s=[]
    num_followers_d=[]
    num_followees_d=[]
    inter_followers=[]
    inter_followees=[]
    for source, destination in zip(df_final['source_node'], df_final['destination_node']):
        s1,s2=_follower_followee_sets(graph, source)
        d1,d2=_follower_followee_sets(graph, destination)
        num_followers_s.append(len(s1))
        num_followees_s.append(len(s2))
        num_followers_d.append(len(d1))
        num_followees_d.append(len(d2))
        inter_followers.append(len(s1.intersection(d1)))
        inter_followees.append(len(s2.intersection(d2)))
    # The return used to sit inside the loop, so only the first row was processed.
    return num_followers_s,num_followees_s,num_followers_d,num_followees_d,inter_followers,inter_followees
        
            
        

In [73]:
# Add the count/overlap columns and cache both frames, or reload the cache.
if not os.path.isfile('data/fea_sample/storage_sample_stage1.h5'):
    # Column names must be strings; the original used bare, undefined names
    # (df_final_train[num_followers_s]) and raised NameError.
    stage1_columns=['num_followers_s','num_followees_s',
                    'num_followers_d','num_followees_d',
                    'inter_followers','inter_followees']
    for frame in (df_final_train, df_final_test):
        for column, values in zip(stage1_columns, compute_features_stage1(frame)):
            frame[column]=values

    # The context manager closes the store even if a put() fails.
    with HDFStore('data/fea_sample/storage_sample_stage1.h5') as hdf:
        hdf.put('train_df',df_final_train,format='table',data_columns=True)
        hdf.put('test_df',df_final_test,format='table',data_columns=True)
else:
    df_final_train=read_hdf('data/fea_sample/storage_sample_stage1.h5','train_df',mode='r')
    df_final_test=read_hdf('data/fea_sample/storage_sample_stage1.h5','test_df',mode='r')
    

<h4>5.3. Adding New Set of Features</h4>
Creating following features for train and test data points:
<ul>
    <li>adar_index</li>
    <li>follows_back</li>
    <li>same_comp</li>
    <li>shortest_path</li>
</ul>

In [74]:
# Add the stage-2 pairwise features and cache both frames, or reload the cache.
if not os.path.isfile('data/fea_sample/storage_sample_stage2.h5'):
    # (column name, function of (source_node, destination_node)); for each
    # feature the train frame is filled before the test frame.
    pair_features=(
        ('adar_index', calc_adar_in),
        ('follows_back', follows_back),
        ('same_comp', belongs_to_same_wcc),
        ('shortest_path', compute_shortest_path_length),
    )
    for column, feature_fn in pair_features:
        for frame in (df_final_train, df_final_test):
            frame[column]=frame.apply(
                lambda row: feature_fn(row['source_node'],row['destination_node']),axis=1)

    # The context manager closes the store even if a put() fails.
    with HDFStore('data/fea_sample/storage_sample_stage2.h5') as hdf:
        hdf.put('train_df',df_final_train,format='table',data_columns=True)
        hdf.put('test_df',df_final_test,format='table',data_columns=True)
else:
    df_final_train=read_hdf('data/fea_sample/storage_sample_stage2.h5','train_df',mode='r')
    df_final_test=read_hdf('data/fea_sample/storage_sample_stage2.h5','test_df',mode='r')
    
    

<h4>5.4. Adding New set of Features</h4>
Following features are added to the train and test data:
<ul>
    <li>Weight Feature<br/>
    <ul>
      <li>Weight of incoming edge</li>
     <li>Weight of outgoing edge</li>
     <li>Weight of incoming edge + Weight of outgoing edge</li>
    <li>Weight of incoming edge * Weight of outgoing edge</li>
    <li>2*Weight of incoming edge + Weight of outgoing edge</li>
    <li>Weight of incoming edge + 2*Weight of outgoing edge</li>
    </ul>
    </li>
    <li>Page Rank</li>
    <li>Katz</li>
    <li>Hubs</li>
    <li>Authorities</li>
</ul>
Weight feature of the node i is given by:
\begin{equation}
W_i = \frac{1}{\sqrt{1+|N|}}
\end{equation}
where N is the number of neighbors of the node i.

In [75]:
# Per-node weights 1/sqrt(1+N): weight_in uses the number of predecessors,
# weight_out the number of successors. The means are used later as defaults
# for nodes that are not in the training graph.
weight_in, weight_out = {}, {}
for node in tqdm(train_graph.nodes()):
    n_followers=len(set(train_graph.predecessors(node)))
    n_followees=len(set(train_graph.successors(node)))
    weight_in[node]=1.0/(np.sqrt(1+n_followers))
    weight_out[node]=1.0/(np.sqrt(1+n_followees))
mean_weight_in=np.mean(list(weight_in.values()))
mean_weight_out=np.mean(list(weight_out.values()))
    

100%|██████████████████████████████| 1780722/1780722 [01:35<00:00, 18680.37it/s]


In [76]:
# Add the edge-weight features to both frames unless the stage-3 cache exists.
if not os.path.isfile('data/fea_sample/storage_sample_stage3.h5'):
    for frame in (df_final_train, df_final_test):
        # Incoming weight comes from the destination node, outgoing weight from
        # the source node; unknown nodes get the mean weight.
        frame['weight_in']=frame.destination_node.apply(lambda x: weight_in.get(x, mean_weight_in))
        frame['weight_out']=frame.source_node.apply(lambda x: weight_out.get(x, mean_weight_out))

    #feature engineering on these weight features
    # Column names must be strings; the original used bare, undefined names
    # (df_final_train[weight_f1]) and raised NameError.
    for frame in (df_final_train, df_final_test):
        frame['weight_f1']=frame.weight_in + frame.weight_out
        frame['weight_f2']=frame.weight_in * frame.weight_out
        frame['weight_f3']=2*frame.weight_in + frame.weight_out
        frame['weight_f4']=frame.weight_in + 2*frame.weight_out
    
    

In [77]:
# Add per-node ranking scores for source (_s) and destination (_d) and cache
# both frames, or reload the cache.
if not os.path.isfile('data/fea_sample/storage_sample_stage3.h5'):
    # (column prefix, node -> score mapping, default for unknown nodes)
    node_scores=(
        ('page_rank', pr, mean_pr),
        ('katz', katz, mean_katz),
        ('hubs', hits[0], 0),
        ('authorities', hits[1], 0),
    )
    for frame in (df_final_train, df_final_test):
        for prefix, scores, default in node_scores:
            frame[prefix+'_s']=frame.source_node.apply(lambda x: scores.get(x, default))
            frame[prefix+'_d']=frame.destination_node.apply(lambda x: scores.get(x, default))

    # The context manager closes the store even if a put() fails.
    with HDFStore('data/fea_sample/storage_sample_stage3.h5') as hdf:
        hdf.put('train_df',df_final_train,format='table',data_columns=True)
        hdf.put('test_df',df_final_test,format='table',data_columns=True)
else:
    df_final_train=read_hdf('data/fea_sample/storage_sample_stage3.h5','train_df',mode='r')
    df_final_test=read_hdf('data/fea_sample/storage_sample_stage3.h5','test_df',mode='r')
    
    

<h4>5.5. Adding Some More Features</h4>
1. SVD for both source and destination 

In [78]:
# Map every node id to its position in the sorted node list.
sadj_col=sorted(train_graph.nodes())
sadj_dict=dict(zip(sadj_col, range(len(sadj_col))))
def svd(x, S, index=None):
    """Return the row of ``S`` that belongs to node ``x``.

    Parameters
    ----------
    x : node identifier.
    S : 2-D array with one row per node.
    index : optional mapping from node id to row number; defaults to the
        notebook-level ``sadj_dict``.

    Returns
    -------
    ``S[index[x]]``, or a list of zeros as wide as ``S`` when ``x`` has no row.
    """
    if index is None:
        index=sadj_dict
    try:
        return S[index[x]]
    except (KeyError, TypeError, IndexError):
        # Unknown node: fall back to a zero vector that matches the number of
        # columns instead of a hard-coded six.
        return [0]*S.shape[1]
    

In [80]:
# Truncated SVD (k=6) of the adjacency matrix, with rows/columns ordered by
# sorted node id.
# asfptype() is deprecated/removed in recent SciPy releases; astype(np.float64)
# gives svds the floating-point matrix it needs on every version.
Adj=nx.adjacency_matrix(train_graph,nodelist=sorted(train_graph.nodes())).astype(np.float64)
# svds returns the right singular vectors as rows, so V has shape (6, n_nodes).
U,s,V=svds(Adj,k=6)
print("Adjacency Matrix Shape:",Adj.shape)
print("U Matrix Shape:",U.shape)
print("V Matrix Shape:",V.shape)
print("s Matrix Shape:",s.shape)

Adjacency Matrix Shape: (1780722, 1780722)
U Matrix Shape: (1780722, 6)
V Matrix Shape: (6, 1780722)
s Matrix Shape: (6,)


In [81]:
# Add the SVD columns and cache both frames, or reload the cache.
if not os.path.isfile('data/fea_sample/storage_sample_stage4.h5'):
    # Columns are named svd_<u|v>_<s|d>_<1..6>: u/v selects U or V.T, s/d the
    # source or destination node. The loop order matches the original column
    # order (u_s, u_d, v_s, v_d).
    for frame in (df_final_train, df_final_test):
        for matrix_tag, matrix in (('u', U), ('v', V.T)):
            for node_tag, node_column in (('s', 'source_node'), ('d', 'destination_node')):
                columns=['svd_{}_{}_{}'.format(matrix_tag, node_tag, k) for k in range(1, 7)]
                frame[columns]=\
                frame[node_column].apply(lambda x: svd(x,matrix)).apply(pd.Series)

    # The context manager closes the store even if a put() fails.
    with HDFStore('data/fea_sample/storage_sample_stage4.h5') as hdf:
        hdf.put('train_df',df_final_train,format='table',data_columns=True)
        hdf.put('test_df',df_final_test,format='table',data_columns=True)
else:
    df_final_train=read_hdf('data/fea_sample/storage_sample_stage4.h5','train_df',mode='r')
    df_final_test=read_hdf('data/fea_sample/storage_sample_stage4.h5','test_df',mode='r')