#Script to create dummy data that can be used to visualize dashboard instead of sensitive PROSIT data

In [1]:
import pandas as pd
import os
import numpy as np
import datetime
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# ref for Dim reduction
# https://towardsdatascience.com/visualising-high-dimensional-datasets-using-pca-and-t-sne-in-python-8ef87e7915b

In [67]:
datapath = os.path.abspath(os.path.join(os.getcwd(), "../../data"))

participants = [ "dummy00"+str(i+1) for i in range(100) ]

screenFeatures = [ "noScreenLocks", "firstScreenUnlock", "lastScreenLock", "maxScreenTime", "totalScreenTime" ]
callingFeatures = [ "noMissed", "noDialled", "noIncoming",  "minDurationIncoming",  "maxDurationIncoming", "noOutgoing", 
                    "minDurationOutgoing", "maxDurtaionOutgoing", "totalNoCalls", "totalDurationIncoming", "totalDurationOutgoing", "TotalDurationCalls" ]
mobilityFeatures = [ "stepCount", "totalDistance" , "noStayPoints", "locationVariance", "locationEntropy"]
sleepFeatures = [ "sleepStartTime1", "sleepEndTime1", "sleepDuration1", "sleepStartTime2", "sleepEndTime2", "sleepDuration2", "totalSleepDuration", "noSleepInterruptions" ]

featureNames = screenFeatures + callingFeatures + mobilityFeatures + sleepFeatures

<ul> Data files needed
<li> Personality score
<li> Depression Score
<li> Features extracted from Mobile sensor data
<li> Brightness
<li> Acceleromter
<li> Gyroscope

In [68]:
%%time
personalityDummyData = []
for participant in participants:
    personalityDummyData.append(
            [participant] + [np.random.choice(["ios", "android"])] + list(np.random.randint(1, 10, 5)) #random personality scores
        )       

CPU times: user 9.61 ms, sys: 8 µs, total: 9.61 ms
Wall time: 8.63 ms


In [69]:
%%time
featuresDummyData = []
dateBegin = datetime.date(2020, 1, 1)
dateEnd = datetime.date(2020, 12, 1)
brightnessDummyData = []
accelerometerDummyData = []
gyrscopeDummyData = []

for participant in participants:
    randDateNum = np.random.randint((dateEnd - dateBegin).days)
    startDate = dateBegin + datetime.timedelta(days=randDateNum)

    for dateNum in range(30):
        currDate = (startDate + datetime.timedelta(days=dateNum)).strftime("%Y-%m-%d")
        noScreenLocks = np.random.randint(1, 50)    # max 50 screen locks/unlocks
        firstScreenUnlock = np.random.randint(1, 12*60)   # random time from first half of the day
        lastScreenLock = firstScreenUnlock + np.random.randint(12*60, 24*60)  #random time from second half of the day
        maxScreenTime = np.random.randint(1, 3*60)  #3hrs
        totalScreenTime = np.random.randint(maxScreenTime, 12*60)  #half of the day
        noMissed = np.random.randint(1, 10) # max 10 missed calls
        noDialled = np.random.randint(1, 10)    # max 10 dialled calls
        noIncoming = np.random.randint(1, 10)   # max 10 incoming calls
        minDurationIncoming = np.random.randint(1, 10)  #max 10mins
        maxDurationIncoming = np.random.randint(minDurationIncoming, 3*60)  #max 3hrs
        noOutgoing = np.random.randint(1, 10)   # max 10mins
        minDurationOutgoing = np.random.randint(1, 10)  # max 10mins
        maxDurtaionOutgoing = np.random.randint(minDurationOutgoing, 3*60)  # max 3hrs
        totalNoCalls = noMissed + noDialled + noIncoming + noOutgoing
        totalDurationIncoming = np.random.randint(minDurationIncoming, 3*60)
        totalDurationOutgoing = np.random.randint(minDurationOutgoing, 3*60)
        TotalDurationCalls = totalDurationIncoming + totalDurationOutgoing
        stepCount = np.round(np.random.randint(1, 10*1312.33595801), 0)      # 10kms
        totalDistance = np.round(stepCount/1312.33595801, 2)
        noStayPoints = np.random.randint(1, 10)
        locationVariance = np.round(np.random.rand(), 2)
        locationEntropy = np.round(np.random.rand(), 2)
        sleepStartTime1 = np.random.randint(1, 12*60)
        sleepEndTime1 = np.random.randint(sleepStartTime1, sleepStartTime1 + 12*60)
        sleepDuration1 = sleepEndTime1 - sleepStartTime1
        sleepStartTime2 = np.random.randint(sleepEndTime1, sleepEndTime1 + 24*60-sleepDuration1)
        sleepEndTime2 = np.random.randint(sleepStartTime2, sleepStartTime2 + 12*60 -sleepDuration1)
        sleepDuration2 = sleepEndTime2 - sleepStartTime2
        totalSleepDuration = sleepDuration1 + sleepDuration2
        noSleepInterruptions = np.random.randint(1, 10)
        depressionScore = np.random.randint(1, 10)

        featuresDummyData.append(
            [participant, currDate] 
            + [ noScreenLocks,
                firstScreenUnlock,
                lastScreenLock,
                maxScreenTime,
                totalScreenTime,
                noMissed,
                noDialled,
                noIncoming,
                minDurationIncoming,
                maxDurationIncoming,
                noOutgoing,
                minDurationOutgoing,
                maxDurtaionOutgoing,
                totalNoCalls,
                totalDurationIncoming,
                totalDurationOutgoing,
                TotalDurationCalls,
                stepCount,
                totalDistance,
                noStayPoints,
                locationVariance,
                locationEntropy,
                sleepStartTime1,
                sleepEndTime1,
                sleepDuration1,
                sleepStartTime2,
                sleepEndTime2,
                sleepDuration2,
                totalSleepDuration,
                noSleepInterruptions,
                depressionScore ]
            ) 

        for i in range(24*60):
            brightnessDummyData.append(
                [participant, currDate, i, np.round(np.random.random(), 2)] 
            )
            accelerometerDummyData.append(
                [participant, currDate, i, np.round(np.random.random(), 2)] 
            )
            gyrscopeDummyData.append(
                [participant, currDate, i, np.round(np.random.random(), 2)] 
            )

CPU times: user 1min 24s, sys: 528 ms, total: 1min 25s
Wall time: 1min 25s


In [70]:
featuresDummy = pd.DataFrame(data = featuresDummyData, columns = ["participantId", "date"] + featureNames + ["depressionScore"])
personalityScoresDummy = pd.DataFrame(data = personalityDummyData, columns = ["participantId", "device", "open", "con", "agree", "extra", "neuro"])
brightnessDummy = pd.DataFrame(data = brightnessDummyData, columns = ["participantId", "date", "minuteOfTheDay" ,"brt"])
accelerometerDummy = pd.DataFrame(data = accelerometerDummyData, columns = ["participantId", "date", "minuteOfTheDay" ,"acc"])
gyroscopeDummy = pd.DataFrame(data = gyrscopeDummyData, columns = ["participantId", "date", "minuteOfTheDay" ,"gyr"])

In [71]:
# pca = PCA(n_components=2)
# pca_result = pca.fit_transform(personalityScoresDummy.iloc[:,2:])
# personalityScoresDummy['x0'] = pca_result[:,0]
# personalityScoresDummy['x1'] = pca_result[:,1]

In [72]:
tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
tsne_result = tsne.fit_transform(personalityScoresDummy.iloc[:,2:])
personalityScoresDummy['x0'] = tsne_result[:,0]
personalityScoresDummy['x1'] = tsne_result[:,1]

[t-SNE] Computing 99 nearest neighbors...
[t-SNE] Indexed 100 samples in 0.000s...
[t-SNE] Computed neighbors for 100 samples in 0.004s...
[t-SNE] Computed conditional probabilities for sample 100 / 100
[t-SNE] Mean sigma: 3.999590
[t-SNE] KL divergence after 250 iterations with early exaggeration: 53.220734
[t-SNE] KL divergence after 300 iterations: 0.673920


In [73]:
personalityScoresDummy.to_csv(os.path.join(datapath, "dummyPersonalityScores"), index=False, header=True)
featuresDummy.to_csv(os.path.join(datapath, "dummyFeatureData"), index=False, header=True)
brightnessDummy.to_csv(os.path.join(datapath, "dummyBrightness"), index=False, header=True)
accelerometerDummy.to_csv(os.path.join(datapath, "dummyAccelerometer"), index=False, header=True)
gyroscopeDummy.to_csv(os.path.join(datapath, "dummyGyroscope"), index=False, header=True)

In [65]:
personalityScoresDummy1 = pd.read_csv(os.path.join(datapath, "dummyPersonalityScores"))
featuresDummy1 = pd.read_csv(os.path.join(datapath, "dummyFeatureData"))
brightnessDummy1 = pd.read_csv(os.path.join(datapath, "dummyBrightness"))
featuresDummy1 = pd.read_csv(os.path.join(datapath, "dummyAccelerometer"))
featuresDummy1 = pd.read_csv(os.path.join(datapath, "dummyGyroscope"))

In [22]:
df = pd.read_csv(os.path.join("../../data", "dummyFeatureData"))

In [26]:
df.groupby("participantId").mean().reset_index()

Unnamed: 0,participantId,noScreenLocks,firstScreenUnlock,lastScreenLock,maxScreenTime,totalScreenTime,noMissed,noDialled,noIncoming,minDurationIncoming,...,locationEntropy,sleepStartTime1,sleepEndTime1,sleepDuration1,sleepStartTime2,sleepEndTime2,sleepDuration2,totalSleepDuration,noSleepInterruptions,depressionScore
0,dummy001,23.166667,398.933333,1493.433333,86.000000,429.166667,4.733333,5.633333,5.066667,5.433333,...,0.514000,304.700000,671.066667,366.366667,1214.866667,1385.366667,170.500000,536.866667,5.033333,5.333333
1,dummy0010,21.900000,308.966667,1373.533333,83.700000,411.566667,4.700000,5.033333,4.800000,4.966667,...,0.447000,334.366667,634.100000,299.733333,1187.333333,1399.500000,212.166667,511.900000,5.233333,5.200000
2,dummy00100,24.600000,349.933333,1415.033333,98.133333,407.466667,5.266667,4.766667,5.033333,5.033333,...,0.605667,372.700000,744.900000,372.200000,1215.900000,1432.633333,216.733333,588.933333,4.533333,4.900000
3,dummy0011,28.466667,368.633333,1467.166667,106.600000,373.700000,4.266667,4.966667,4.133333,4.833333,...,0.550333,365.500000,776.866667,411.366667,1336.533333,1537.100000,200.566667,611.933333,5.066667,4.700000
4,dummy0012,28.900000,285.733333,1409.566667,94.600000,397.000000,4.800000,4.866667,4.733333,5.000000,...,0.472000,426.866667,785.166667,358.300000,1295.466667,1493.400000,197.933333,556.233333,5.100000,4.833333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,dummy0095,25.433333,339.900000,1478.400000,96.566667,429.733333,5.633333,5.233333,4.966667,5.066667,...,0.550667,355.533333,718.366667,362.833333,1279.900000,1460.366667,180.466667,543.300000,4.966667,5.266667
96,dummy0096,25.233333,391.366667,1491.800000,92.066667,366.666667,4.933333,4.966667,4.933333,5.533333,...,0.495667,324.266667,669.300000,345.033333,1183.700000,1410.533333,226.833333,571.866667,5.066667,5.466667
97,dummy0097,25.533333,397.533333,1536.733333,89.433333,432.566667,5.100000,5.300000,4.366667,4.766667,...,0.423667,400.066667,755.466667,355.400000,1327.433333,1525.166667,197.733333,553.133333,6.000000,4.933333
98,dummy0098,20.266667,426.933333,1494.600000,99.266667,387.266667,4.866667,5.800000,5.566667,4.700000,...,0.494667,398.200000,731.133333,332.933333,1170.366667,1315.466667,145.100000,478.033333,4.400000,4.400000


In [21]:
df.to_csv(os.path.join("../../data", "dummyFeatureData"), index=False)