In [2]:
import pandas as pd
import itertools
import os
from io import StringIO
from csv import writer 
import regex as re


In [3]:
# Load in the data for working with. 
# This only loads the automotive channels due to time limitations in the project.
# Future progress would be to finish collecting and processing all categories, then combine them into one large file before this part.
raw_df = pd.read_pickle('Data_Out/auto_channels_data.pkl')

In [4]:
# Get some summary stats and a set of all nodes that we can refer to
# word_frequency_dict is for later use in making graphs
unique_words_set = set()
word_frequency_dict = {}

for row in raw_df.VIDEO_TAGS:
    for word in row:
        unique_words_set.add(word)
        word_frequency_dict[word] = word_frequency_dict.get(word, 0) + 1
del word_frequency_dict['']
unique_words_set.remove('')

print('unique_words_set: ',len(unique_words_set))
print('word_frequency_dict: ',len(word_frequency_dict))


unique_words_set:  125708
word_frequency_dict:  125708


In [5]:
# Create a Dictionary of all nodes
# Keys are the nodes
# Values are an incrementally assigned ID number for use in our NWB file
reg1 = re.compile('[\n]') # There is at least one tag with a newline in it. 'P0107B\nhow to'
nodes = {}
Counter=1

for word in unique_words_set:
    word = re.sub(reg1, '', word)     
    nodes.update({word:Counter})
    Counter+=1
nodes

{'beschleunigungsmonster': 1,
 'fastest car 2020': 2,
 'coloridas': 3,
 'aventador 2020': 4,
 'bentley bentayga 2020 review': 5,
 '2004 Toyota RAV4': 6,
 'mach 1 mustang': 7,
 'minecraft_surgeon': 8,
 'jp camaro': 9,
 'Laguna Seca': 10,
 '488_pista_drag_race': 11,
 'в топе': 12,
 'GAD-Motors': 13,
 'grip mustang eleanor gegen sls amg': 14,
 'all_wheel_drive_bmw': 15,
 'a-class sedan': 16,
 'nagmani pane ka tarika': 17,
 'smart controller': 18,
 '2022 lexus nx': 19,
 'best of rotary sounds': 20,
 'siezed brake drums': 21,
 'Toyota Camry.': 22,
 'Maschio': 23,
 'ls600hl': 24,
 'audi_sedan': 25,
 'bumpy road': 26,
 'nitrous fire': 27,
 'máy chạy tại nhà': 28,
 'Akrapovic Exhaust': 29,
 'tata mexon bs6': 30,
 'droomrit voor het leven': 31,
 'WOT': 32,
 'Quester': 33,
 'creta 2020 india accident': 34,
 '2019 mercedes cls review': 35,
 'Maruti Scross accident': 36,
 'Chevrolet Duramax': 37,
 'dino': 38,
 'best SUV': 39,
 'Arnold Schwarzenegger': 40,
 'new maserati': 41,
 'coolant reservoir t

In [121]:
# If I end up needing a dict that is the reverse of nodes
nodes_reversed = {}
for x,y in nodes.items():
    nodes_reversed.update({y:x})
#nodes_reversed

In [122]:
nodes_reversed[6444]

'P0107B\nhow to'

In [11]:
# function to match the nodes in the edges df to the keys in the nodes dict
def match_nodes(row):
    return nodes[row['source']], nodes[row['target']]

def match_node(node):
    return nodes[node]

In [6]:
dod = {}
for row in raw_df.VIDEO_TAGS:
    for subset in itertools.combinations(row, 2):
        if subset[0] not in dod:
            dod[subset[0]] = {}
        dod[subset[0]][subset[1]] = dod[subset[0]].get(subset[1], 0) + 1

In [None]:
# This method took 4 HOURS to get to 1,040,886 rows, approximately 1/4 complete
# Realistic to expect 16 hours for full completion!!!
# Only left this in for posterity

'''edges = pd.DataFrame(columns=['source', 'target', 'weight'])
for word1, word_dict in dod.items():
    for word2, count in word_dict.items():
                edges = edges.append({'source': word1, 'target': word2, 'weight': count}, ignore_index=True)'''

In [7]:
# Here we are creating an in memory csv file to write to, then reading it back into a dataframe
# This method takes ~5 seconds to complete all 3,927,518 rows

output = StringIO()
csv_writer = writer(output)

for word1, word_dict in dod.items():
    for word2, count in word_dict.items():    
        csv_writer.writerow((word1, word2, count))

output.seek(0) # we need to get back to the start of the BytesIO
edges = pd.read_csv(output, header=None, names=['source', 'target', 'weight'])
edges

Unnamed: 0,source,target,weight
0,menstruation,period,1
1,menstruation,menstruation_cup,1
2,menstruation,menstrual_cup,1
3,menstruation,menstrual_cups,1
4,menstruation,reusable_menstrual_products,1
...,...,...,...
3927514,baleno silver,baleno top selling car,1
3927515,baleno silver,baleno premium hatchback,1
3927516,baleno 2019 facelift,baleno top selling car,1
3927517,baleno 2019 facelift,baleno premium hatchback,1


In [134]:
edges.to_pickle('Data_Out/edges.pkl')
edges.to_csv('Data_Out/edges.csv', index=False)

In [9]:
# delete rows with weight <= 2
edges_gt_2 = edges[edges.weight > 2].copy()
edges_gt_2.dropna(inplace=True)
print(len(edges)-len(edges_gt_2), ' edges were removed')
print(len(edges_gt_2), ' edges remain')

3556986  edges were removed
370533  edges remain


## Export network graph to NWB format
This is basically justa .txt file with specific formatting and the extension changed to .nwb  
The file can be opened in a text editor to get a better idea of the formatting.  
The next step with this file is to load it into Sci2 and utilize the tools there for refining and visualizing the graph.

If you are on a Mac, then you cannot install Sci2, you will need to do one of the following:
1. Load a windows/linux VM and install Sci2
2. Install windows with bootcamp and install Sci2
3. Install Docker and run the container here: 
https://github.com/CIShell/sci2-docker-vnc

In [13]:
# Creating an nwb file
with open("Data_Out/graph.nwb", 'w') as f:  
    f.write("*Nodes\nid*int label*string\n")
    for key, value in nodes.items():  
        f.write('%s    "%s"\n' % (value, key))
    f.write("*UndirectedEdges\nsource*int target*int weight*int\n")
    for index, row in edges_gt_2.iterrows():
        source, target = match_nodes(row)
        f.write('%s    %s    %s\n' % (source, target, row['weight']))
   