In [5]:
# Config related imports
import config
from config import load_users_dataframe
unique_users = load_users_dataframe()
from config import load_networkx_friends
network_friends = load_networkx_friends()
from config import dump_ml_data

# Other imports
from tqdm import tqdm
import pandas as pd
import numpy as np

Loading data file from path /Users/lzhou/git/github/uclresearchanalysis/data/givenchy/pickle/users.dat
'Loaded 5011 entires'
Loading data file from path /Users/lzhou/git/github/uclresearchanalysis/data/givenchy/pickle/networkx_friends.dat
'Loaded 5011 entires'


In [2]:
# Settings
intervals = [60, 30, 15, 7]
parametersVector = ["tCurrent", "kIn", "kOut0", "t0", "kOut-1", "t-1", "kOutMax", "kOutMin", "kOutAverage", "tAverage", "nNodes", "label"]

# Calculating in and out degrees
nodeInDegreeDict = network_friends.in_degree()
nodeOutDegreeDict = network_friends.out_degree()

# Self defined function for mean value calculation
def mean(numbers):
    return float(sum(numbers)) / max(len(numbers), 1)

In [26]:
def process_data(interval):
    # Prepare dataframe
    processed_dataframe = pd.DataFrame(columns=parametersVector)
    # Order the columns
    processed_dataframe = processed_dataframe[parametersVector]

    with tqdm(total=len(list(unique_users.iterrows()))) as pbar: 
        for index, user_row in unique_users.iterrows():
            if user_row['source_index'] is not None:
                source_candidates = user_row['source_candidates']
                source_first = source_candidates[0]
                source_first_time_lapsed = unique_users.iloc[source_first].time_lapsed
                start_bar = int(source_first_time_lapsed / interval) + 1

                label = []
                t0 = []
                t_1 = []
                kIn = []
                kOut0 = []
                kOut_1 = []
                kOutMax = []
                kOutMin = []
                kOutAverage = []
                nNodes = []
                tAverage = []

                bars = list(np.arange(start_bar * interval, 24 * 60, interval))

                for current_time in bars:
                    # all sources up to the current time
                    sources = [x for x in source_candidates if unique_users.iloc[x].time_lapsed <= current_time]
                    degreeList = [nodeOutDegreeDict[x] for x in sources]
                    timeList = [current_time - unique_users.iloc[x].time_lapsed for x in sources]

                    label.append(int(current_time >= user_row['time_lapsed']))
                    t0.append(round(timeList[0], 1))
                    t_1.append(round(timeList[-1], 1))
                    kIn.append(nodeInDegreeDict[index])
                    kOut0.append(nodeOutDegreeDict[sources[0]])
                    kOut_1.append(nodeOutDegreeDict[sources[-1]])
                    kOutMax.append(max(degreeList))
                    kOutMin.append(min(degreeList))
                    kOutAverage.append(round(mean(degreeList), 1))
                    nNodes.append(len(sources))
                    tAverage.append(round(mean(timeList), 1))

                processed_dataframe = processed_dataframe.append(pd.DataFrame({
                    'label': label,
                    'tCurrent': bars,
                    't0': t0,
                    't-1': t_1,
                    'kIn': kIn,
                    'kOut0': kOut0,
                    'kOut-1': kOut_1,
                    'kOutMax': kOutMax,
                    'kOutMin': kOutMin,
                    'kOutAverage': kOutAverage,
                    'nNodes': nNodes,
                    'tAverage': tAverage
                    }))
            pbar.update(1)
    return processed_dataframe

for interval in intervals:
    df = process_data(interval)
    print(df.head())
    dump_ml_data(df, interval)

100%|██████████| 5011/5011 [07:03<00:00,  8.49it/s]


  kIn kOut-1 kOut0  kOutAverage kOutMax kOutMin label nNodes    t-1     t0  \
0   1    183   183        183.0     183     183     1      1   60.0   60.0   
1   1    183   183        183.0     183     183     1      1  120.0  120.0   
2   1    183   183        183.0     183     183     1      1  180.0  180.0   
3   1    183   183        183.0     183     183     1      1  240.0  240.0   
4   1    183   183        183.0     183     183     1      1  300.0  300.0   

   tAverage tCurrent  
0      60.0       60  
1     120.0      120  
2     180.0      180  
3     240.0      240  
4     300.0      300  
Dumping data to path /Users/lzhou/git/github/uclresearchanalysis/data/givenchy/pickle/60_data.dat
('Finished dumping data to path '
 '/Users/lzhou/git/github/uclresearchanalysis/data/givenchy/pickle/60_data.dat')


100%|██████████| 5011/5011 [14:01<00:00,  4.46it/s]


  kIn kOut-1 kOut0  kOutAverage kOutMax kOutMin label nNodes    t-1     t0  \
0   1    183   183        183.0     183     183     1      1   30.0   30.0   
1   1    183   183        183.0     183     183     1      1   60.0   60.0   
2   1    183   183        183.0     183     183     1      1   90.0   90.0   
3   1    183   183        183.0     183     183     1      1  120.0  120.0   
4   1    183   183        183.0     183     183     1      1  150.0  150.0   

   tAverage tCurrent  
0      30.0       30  
1      60.0       60  
2      90.0       90  
3     120.0      120  
4     150.0      150  
Dumping data to path /Users/lzhou/git/github/uclresearchanalysis/data/givenchy/pickle/30_data.dat
('Finished dumping data to path '
 '/Users/lzhou/git/github/uclresearchanalysis/data/givenchy/pickle/30_data.dat')


100%|██████████| 5011/5011 [27:37<00:00,  1.83it/s]


  kIn kOut-1 kOut0  kOutAverage kOutMax kOutMin label nNodes   t-1    t0  \
0   1    183   183        183.0     183     183     1      1  15.0  15.0   
1   1    183   183        183.0     183     183     1      1  30.0  30.0   
2   1    183   183        183.0     183     183     1      1  45.0  45.0   
3   1    183   183        183.0     183     183     1      1  60.0  60.0   
4   1    183   183        183.0     183     183     1      1  75.0  75.0   

   tAverage tCurrent  
0      15.0       15  
1      30.0       30  
2      45.0       45  
3      60.0       60  
4      75.0       75  
Dumping data to path /Users/lzhou/git/github/uclresearchanalysis/data/givenchy/pickle/15_data.dat
('Finished dumping data to path '
 '/Users/lzhou/git/github/uclresearchanalysis/data/givenchy/pickle/15_data.dat')


100%|██████████| 5011/5011 [56:58<00:00,  1.06s/it]


  kIn kOut-1 kOut0  kOutAverage kOutMax kOutMin label nNodes   t-1    t0  \
0   1    183   183        183.0     183     183     1      1   7.0   7.0   
1   1    183   183        183.0     183     183     1      1  14.0  14.0   
2   1    183   183        183.0     183     183     1      1  21.0  21.0   
3   1    183   183        183.0     183     183     1      1  28.0  28.0   
4   1    183   183        183.0     183     183     1      1  35.0  35.0   

   tAverage tCurrent  
0       7.0        7  
1      14.0       14  
2      21.0       21  
3      28.0       28  
4      35.0       35  
Dumping data to path /Users/lzhou/git/github/uclresearchanalysis/data/givenchy/pickle/7_data.dat
('Finished dumping data to path '
 '/Users/lzhou/git/github/uclresearchanalysis/data/givenchy/pickle/7_data.dat')
