In [1]:
import tensorflow
import stellargraph
import numpy
import matplotlib
import os.path
import pandas
import datetime
import re

Please notice, to open jupyter in custom directory use:
>jupyter notebook --notebook-dir=[your_directory]

Expecting file hierarchy:
    <ul>root/
        <li>Antarctica_research_2020/
            <ul>.git/</ul>
            <ul>.gitignore</ul>
            <ul>readme.md</ul>
            <ul>notebook.ipynb</ul></li>
        <li>data/
            <ul>Sesja 1.xlsx</ul>
            <ul>.</ul>
            <ul>.</ul>
            <ul>Sesja 26.xlsx</ul></li>
    </ul>

In [2]:
# Let me write in advantage examined peoples' occupations:
ppl_professions = {1: "to be filled up",
                  2: "to be filled up",
                  3: "to be filled up",
                  4: "to be filled up",
                  5: "to be filled up",
                  6: "to be filled up",   # UWAGA - nr 6 przybył na stację 25.04.79r
                  7: "to be filled up",
                  8: "to be filled up",
                  9: "to be filled up",
                  10: "to be filled up",
                  11: "to be filled up",
                  12: "to be filled up",
                  13: "to be filled up",
                  14: "to be filled up",
                  15: "to be filled up",
                  16: "to be filled up",
                  17: "to be filled up",
                  18: "to be filled up",
                  19: "to be filled up",
                  20: "to be filled up",
                  21: "to be filled up"
                  }

In [3]:
data_path = "../data/"
file_list = [name for name in os.listdir(data_path)]
print(file_list)

['Sesja 1.xlsx', 'Sesja 10.xlsx', 'Sesja 11.xlsx', 'Sesja 12.xlsx', 'Sesja 13.xlsx', 'Sesja 14.xlsx', 'Sesja 15.xlsx', 'Sesja 16.xlsx', 'Sesja 17.xlsx', 'Sesja 18.xlsx', 'Sesja 19.xlsx', 'Sesja 2.xlsx', 'Sesja 20.xlsx', 'Sesja 21.xlsx', 'Sesja 22.xlsx', 'Sesja 23.xlsx', 'Sesja 24.xlsx', 'Sesja 25.xlsx', 'Sesja 26.xlsx', 'Sesja 3.xlsx', 'Sesja 4.xlsx', 'Sesja 5.xlsx', 'Sesja 6.xlsx', 'Sesja 7.xlsx', 'Sesja 8.xlsx', 'Sesja 9.xlsx']


In [4]:
# Sessions order is important, let me get sorted file info
sorted_filenames = dict()
for name in file_list:
    list_of_digits = list(map(int, filter(str.isdigit, name)))
    sorted_filenames[name] = list_of_digits[0] if len(list_of_digits) == 1 else list_of_digits[1] + list_of_digits[0]*10
sorted_filenames = sorted(sorted_filenames.items(), key=lambda el: el[1])
sorted_filenames = {val: key for key, val in sorted_filenames}
print(sorted_filenames)

{1: 'Sesja 1.xlsx', 2: 'Sesja 2.xlsx', 3: 'Sesja 3.xlsx', 4: 'Sesja 4.xlsx', 5: 'Sesja 5.xlsx', 6: 'Sesja 6.xlsx', 7: 'Sesja 7.xlsx', 8: 'Sesja 8.xlsx', 9: 'Sesja 9.xlsx', 10: 'Sesja 10.xlsx', 11: 'Sesja 11.xlsx', 12: 'Sesja 12.xlsx', 13: 'Sesja 13.xlsx', 14: 'Sesja 14.xlsx', 15: 'Sesja 15.xlsx', 16: 'Sesja 16.xlsx', 17: 'Sesja 17.xlsx', 18: 'Sesja 18.xlsx', 19: 'Sesja 19.xlsx', 20: 'Sesja 20.xlsx', 21: 'Sesja 21.xlsx', 22: 'Sesja 22.xlsx', 23: 'Sesja 23.xlsx', 24: 'Sesja 24.xlsx', 25: 'Sesja 25.xlsx', 26: 'Sesja 26.xlsx'}


In [5]:
# Loading data
data_dict = dict()
for key, val in sorted_filenames.items():
    data_dict[key] = pandas.read_excel(data_path + val, header=None, dtype=object)

In [6]:
rand_int = int(numpy.random.rand() * 26)
data_dict[rand_int].head(4)
# An example:

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,,Pozycja osoby,,,,,,,,,...,N = 20,,,1979-03-12 00:00:00,,,,ZIM6,,
1,Nr O.B.,1,2.0,3.0,4.0,5.0,6,7.0,8.0,9.0,...,12,13.0,14.0,15,16.0,17,18.0,19,20.0,21.0
2,1,2420,2417.0,2430.0,2730.0,3027.0,x,3217.0,2425.0,1423.0,...,1613,2010.0,1832.0,2121,2422.0,x,1628.0,1918,2306.0,2908.0
3,2,2313,2300.0,2509.0,1807.0,1715.0,x,2010.0,2014.0,2619.0,...,2130,3022.0,2814.0,2403,1428.0,2624,2124.0,1723,1222.0,1718.0


The target format for data is a stellargraph graph for each session (session -> data recording occuring every two week). But before that, let me read the data into intermediate objects:

In [7]:
class Session(object):
    def __init__(self, raw_session_data):
        self.date = self.find_date(raw_session_data)
        self.relations = self.get_relations(raw_session_data) # dictionary of lists of tuples - column_number : [(x1, y1), (x2, y2), ...]
    
    def get_relations(self, raw_session_data) -> dict:
        rels = dict()
        col_no = len(ppl_professions)
        rels = {x: [] for x in range(1, col_no + 1)}
        for person in range(2, col_no + 1):
            for entry in range(1, col_no):
                string_value = str(raw_session_data.iloc[person, entry])
                subject_no = raw_session_data.iloc[person, 0]
                data_point = self.tuple_from_string(string_value)
                rels[subject_no].append(data_point)
                
        return rels
    
    def tuple_from_string(self, string) -> tuple:
        if string == 'nan':
            return None
        elif len(string) == 4:
            x = int(string[1]) + int(string[0]) * 10
            y = int(string[3]) + int(string[2]) * 10
            return (x, y)
        elif len(string) == 3:
            x = int(string[0])
            y = int(string[2]) + int(string[1]) * 10
            return (x, y)
        else:
            return None
    
    def find_date(self, raw_session_data) -> str:
        for key, col in raw_session_data.iteritems():
            for val in col:
                regex = re.search(r"\d{1,2}.\d{1,2}.\d{4}|\d{4}.\d{1,2}.\d{1,2}", str(val))
                if regex:
                    return regex.group()
                
    def __str__(self):
        random = int(numpy.random.rand() * len(ppl_professions))
        return "Session from " + self.date + " example entry: " + str(self.relations[random]) + " for " + str(random)

In [9]:
session_data = [Session(e) for e in data_dict.values()]
print(session_data[0])
print(session_data[-1])

Session from 1978-12-29 example entry: [(18, 12), (5, 20), (34, 33), (32, 28), (25, 36), (27, 15), (31, 33), (11, 23), (8, 9), (2, 22), (18, 20), (31, 8), (27, 30), (16, 31), (9, 18), (20, 29), (2, 7), (15, 18), (6, 32), (13, 21)] for 11
Session from 1980-01-10 example entry: [(34, 35), (23, 22), (32, 29), (32, 33), (30, 31), (37, 4), (28, 29), (27, 27), (25, 25), (23, 16), (25, 14), (17, 23), (20, 17), (21, 19), (35, 6), (18, 13), (15, 26), (38, 10), (39, 38), (39, 3)] for 14


In [None]:
class Person(object):
    def __init__(self, person_no, particular_person_profession):
        self.person_no = person_no
        self.profession = particular_person_profession
        self.sessions_dict = self.get_all_relations()
        
    def get_all_relations(self) -> dict:
        pass
    
    def get_all_relative_vectors(self) -> dict:
        pass
    
    def get_all_distances(self) -> dict:
        pass
    
    def get_particular_rel_list(self, other: Person) - list:
        pass
    
    def __str__(self):
        return ""