In [127]:
import pandas as pd
import itertools
import os
from io import StringIO
from csv import writer 
import regex as re


In [2]:
# Load the base data
raw_df = pd.read_pickle('Data_Out/auto_channels_data.pkl')

In [3]:
# Get some summary stats and a set of all nodes that we can refer to
# word_frequency_dict is for later use in making graphs
unique_words_set = set()
word_frequency_dict = {}

for row in raw_df.VIDEO_TAGS:
    for word in row:
        unique_words_set.add(word)
        word_frequency_dict[word] = word_frequency_dict.get(word, 0) + 1
del word_frequency_dict['']
unique_words_set.remove('')

print('unique_words_set: ',len(unique_words_set))
print('word_frequency_dict: ',len(word_frequency_dict))


unique_words_set:  125708
word_frequency_dict:  125708


In [128]:
reg1 = re.compile('[\n]')
nodes = {}
Counter=1
for word in unique_words_set:
    word = re.sub(reg1, '', word)     
    nodes.update({word:Counter})
    Counter+=1
nodes

{'jeepspeed3700': 1,
 'idiot move': 2,
 'maybach 2021 v12': 3,
 '富士急ハイランド': 4,
 'tata nexon car review': 5,
 'オムニボット': 6,
 'noisy brakes': 7,
 'ram limited': 8,
 'Tulip Mania (Event)': 9,
 'Ford Focus RS RX': 10,
 'ford focus st': 11,
 'josh kalis': 12,
 'porschecayenne': 13,
 'who makes the best luxury car': 14,
 'rocky mountain race week 2017': 15,
 'نيسان اكستريل 2018': 16,
 'H Greg': 17,
 'camber alignment': 18,
 'car cleaning tips': 19,
 'Kia Seltos Rear Space check': 20,
 'video_de_caminhao': 21,
 'cutters': 22,
 'kona': 23,
 'شيفروليه كمارو': 24,
 'yezdi scrambler review': 25,
 'заводится': 26,
 'fazilat': 27,
 'koenigsegg agera rs1': 28,
 'crazy guy vs biker': 29,
 'ирпень': 30,
 '2er Coupé': 31,
 'store closing': 32,
 'atees': 33,
 'the Hoonigans': 34,
 'electric harley': 35,
 'supra': 36,
 'car craft week to wicked': 37,
 'Honda mô tô': 38,
 'fiat 500 abarth': 39,
 'corolla EVAP': 40,
 'Rio Haryanto': 41,
 'пины': 42,
 'fwd burnout': 43,
 'JR Garage': 44,
 'brake calipers stu

In [121]:
# If I end up needing a dict that is the reverse of nodes
nodes_reversed = {}
for x,y in nodes.items():
    nodes_reversed.update({y:x})
#nodes_reversed

In [122]:
nodes_reversed[6444]

'P0107B\nhow to'

In [91]:
# function to match the nodes in the edges df to the keys in the nodes dict
def match_nodes(row):
    return nodes[row['source']], nodes[row['target']]

def match_node(node):
    return nodes[node]

In [None]:
'''node_df=pd.DataFrame.from_dict(nodes,orient='index')
node_df.to_pickle('Data_Out/nodes.pkl')
node_df'''

In [11]:
dod = {}
for row in raw_df.VIDEO_TAGS:
    for subset in itertools.combinations(row, 2):
        if subset[0] not in dod:
            dod[subset[0]] = {}
        dod[subset[0]][subset[1]] = dod[subset[0]].get(subset[1], 0) + 1

In [None]:
# This method took 4 HOURS to get to 1,040,886 rows, approximately 1/4 complete
# Realistic to expect 16 hours for full completion!!!

'''edges = pd.DataFrame(columns=['source', 'target', 'weight'])
for word1, word_dict in dod.items():
    for word2, count in word_dict.items():
                edges = edges.append({'source': word1, 'target': word2, 'weight': count}, ignore_index=True)'''

In [132]:
# This method takes ~5 seconds to complete all 3,927,518 rows without the regex
## Here we are creating an in memory csv file to write to, then reading it back into a dataframe
# with the regex it takes ~45 seconds to complete all 3,927,518 rows
# ultimately the regex is necessary because there is at least one row with a newline in the word


output = StringIO()
csv_writer = writer(output)
reg1 = '\n'

for word1, word_dict in dod.items():
    for word2, count in word_dict.items():
        word1 = re.sub(reg1, '', word1)            
        word2 = re.sub(reg1, '', word2)     
        csv_writer.writerow((word1, word2, count))

output.seek(0) # we need to get back to the start of the BytesIO
edges = pd.read_csv(output, header=None, names=['source', 'target', 'weight'])
edges

Unnamed: 0,source,target,weight
0,menstruation,period,1
1,menstruation,menstruation_cup,1
2,menstruation,menstrual_cup,1
3,menstruation,menstrual_cups,1
4,menstruation,reusable_menstrual_products,1
...,...,...,...
3927514,baleno silver,baleno top selling car,1
3927515,baleno silver,baleno premium hatchback,1
3927516,baleno 2019 facelift,baleno top selling car,1
3927517,baleno 2019 facelift,baleno premium hatchback,1


In [134]:
edges.to_pickle('Data_Out/edges.pkl')
edges.to_csv('Data_Out/edges.csv', index=False)

In [138]:
# delete rows with weight > 2
edges_gt_2 = edges[edges.weight > 2].copy()
edges_gt_2.dropna(inplace=True)
print(len(edges)-len(edges_gt_2), ' edges were removed')
print(len(edges_gt_2), ' edges remain')

3556986  edges were removed
370533  edges remain


## OMG its the NWB

In [137]:
# Creating an nwb file
with open("nodes.nwb", 'w') as f:  
    f.write("*Nodes\nid*int label*string\n")
    for key, value in nodes.items():  
        f.write('%s    "%s"\n' % (value, key))
    f.write("*UndirectedEdges\nsource*int target*int weight*int\n")
    for index, row in edges_gt_2.iterrows():
        source, target = match_nodes(row)
        f.write('%s    %s    %s\n' % (source, target, row['weight']))
   