#Script to create dummy data that can be used to visualize dashboard instead of sensitive PROSIT data

In [34]:
import pandas as pd
import os
import numpy as np
import datetime
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
# ref for Dim reduction
# https://towardsdatascience.com/visualising-high-dimensional-datasets-using-pca-and-t-sne-in-python-8ef87e7915b

In [35]:
datapath = os.path.abspath(os.path.join(os.getcwd(), "../../data"))

participants = [ "dummy00"+str(i+1) for i in range(100) ]

screenFeatures = [ "noScreenLocks", "firstScreenUnlock", "lastScreenLock", "maxScreenTime", "totalScreenTime" ]
callingFeatures = [ "noMissed", "noDialled", "noIncoming",  "minDurationIncoming",  "maxDurationIncoming", "totalDurationIncoming", "noOutgoing", 
                    "minDurationOutgoing", "maxDurtaionOutgoing", "totalDurationOutgoing", "totalNoCalls", "totalDurationCalls" ]
mobilityFeatures = [ "stepCount", "totalDistance" , "noStayPoints", "locationVariance", "locationEntropy"]
sleepFeatures = [ "sleepStartTime1", "sleepEndTime1", "sleepDuration1", "sleepStartTime2", "sleepEndTime2", "sleepDuration2", "totalSleepDuration", "noSleepInterruptions" ]

featureNames = screenFeatures + callingFeatures + mobilityFeatures + sleepFeatures

<ul> Data files needed
<li> Personality score
<li> Depression Score
<li> Features extracted from Mobile sensor data
<li> Brightness
<li> Acceleromter
<li> Gyroscope

In [36]:
%%time
personalityDummyData = []
for participant in participants:
    personalityDummyData.append(
            [participant] + [np.random.choice(["ios", "android"])] + list(np.random.randint(1, 10, 5)) #random personality scores
        )

CPU times: user 8.78 ms, sys: 11 µs, total: 8.79 ms
Wall time: 7.9 ms


In [37]:
%%time
featuresDummyData = []
dateBegin = datetime.date(2020, 1, 1)
dateEnd = datetime.date(2020, 12, 1)
brightnessDummyData = []
accelerometerDummyData = []
gyrscopeDummyData = []

for participant in participants:
    randDateNum = np.random.randint((dateEnd - dateBegin).days)
    startDate = dateBegin + datetime.timedelta(days=randDateNum)

    for dateNum in range(30):
        currDate = (startDate + datetime.timedelta(days=dateNum)).strftime("%Y-%m-%d")
        noScreenLocks = np.random.randint(1, 50)    # max 50 screen locks/unlocks
        firstScreenUnlock = np.random.randint(1, 12*60)   # random time from first half of the day
        lastScreenLock = firstScreenUnlock + np.random.randint(12*60, 24*60)  #random time from second half of the day
        maxScreenTime = np.random.randint(1, 3*60)  #3hrs
        totalScreenTime = np.random.randint(maxScreenTime, 12*60)  #half of the day
        noMissed = np.random.randint(1, 10) # max 10 missed calls
        noDialled = np.random.randint(1, 10)    # max 10 dialled calls
        noIncoming = np.random.randint(1, 10)   # max 10 incoming calls
        minDurationIncoming = np.random.randint(1, 10)  #max 10mins
        maxDurationIncoming = np.random.randint(minDurationIncoming, 3*60)  #max 3hrs
        totalDurationIncoming = np.random.randint(minDurationIncoming, 3*60)
        noOutgoing = np.random.randint(1, 10)   # max 10mins
        minDurationOutgoing = np.random.randint(1, 10)  # max 10mins
        maxDurtaionOutgoing = np.random.randint(minDurationOutgoing, 3*60)  # max 3hrs
        totalDurationOutgoing = np.random.randint(minDurationOutgoing, 3*60)
        totalNoCalls = noMissed + noDialled + noIncoming + noOutgoing
        totalDurationCalls = totalDurationIncoming + totalDurationOutgoing
        stepCount = np.round(np.random.randint(1, 10*1312.33595801), 0)      # 10kms
        totalDistance = np.round(stepCount/1312.33595801, 2)
        noStayPoints = np.random.randint(1, 10)
        locationVariance = np.round(np.random.rand(), 2)
        locationEntropy = np.round(np.random.rand(), 2)
        sleepStartTime1 = np.random.randint(1, 12*60)
        sleepEndTime1 = np.random.randint(sleepStartTime1, sleepStartTime1 + 12*60)
        sleepDuration1 = sleepEndTime1 - sleepStartTime1
        sleepStartTime2 = np.random.randint(sleepEndTime1, sleepEndTime1 + 24*60-sleepDuration1)
        sleepEndTime2 = np.random.randint(sleepStartTime2, sleepStartTime2 + 12*60 -sleepDuration1)
        sleepDuration2 = sleepEndTime2 - sleepStartTime2
        totalSleepDuration = sleepDuration1 + sleepDuration2
        noSleepInterruptions = np.random.randint(1, 10)
        depressionScore = np.random.randint(1, 10)

        featuresDummyData.append(
            [participant, currDate] 
            + [ noScreenLocks,
                firstScreenUnlock,
                lastScreenLock,
                maxScreenTime,
                totalScreenTime,
                noMissed,
                noDialled,
                noIncoming,
                minDurationIncoming,
                maxDurationIncoming,
                noOutgoing,
                minDurationOutgoing,
                maxDurtaionOutgoing,
                totalNoCalls,
                totalDurationIncoming,
                totalDurationOutgoing,
                totalDurationCalls,
                stepCount,
                totalDistance,
                noStayPoints,
                locationVariance,
                locationEntropy,
                sleepStartTime1,
                sleepEndTime1,
                sleepDuration1,
                sleepStartTime2,
                sleepEndTime2,
                sleepDuration2,
                totalSleepDuration,
                noSleepInterruptions,
                depressionScore ]
            ) 

        for i in range(24*60):
            brightnessDummyData.append(
                [participant, currDate, i, np.round(np.random.random(), 2)] 
            )
            accelerometerDummyData.append(
                [participant, currDate, i, np.round(np.random.random(), 2)] 
            )
            gyrscopeDummyData.append(
                [participant, currDate, i, np.round(np.random.random(), 2)] 
            )

CPU times: user 1min 25s, sys: 360 ms, total: 1min 25s
Wall time: 1min 25s


In [38]:
featuresDummy = pd.DataFrame(data = featuresDummyData, columns = ["participantId", "date"] + featureNames + ["depressionScore"])
personalityScoresDummy = pd.DataFrame(data = personalityDummyData, columns = ["participantId", "device", "open", "con", "agree", "extra", "neuro"])
brightnessDummy = pd.DataFrame(data = brightnessDummyData, columns = ["participantId", "date", "minuteOfTheDay" ,"brt"])
accelerometerDummy = pd.DataFrame(data = accelerometerDummyData, columns = ["participantId", "date", "minuteOfTheDay" ,"acc"])
gyroscopeDummy = pd.DataFrame(data = gyrscopeDummyData, columns = ["participantId", "date", "minuteOfTheDay" ,"gyr"])

In [39]:
# pca = PCA(n_components=2)
# pca_result = pca.fit_transform(personalityScoresDummy.iloc[:,2:])
# personalityScoresDummy['x0'] = pca_result[:,0]
# personalityScoresDummy['x1'] = pca_result[:,1]

In [40]:
tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
tsne_result = tsne.fit_transform(personalityScoresDummy.iloc[:,2:])
personalityScoresDummy['x'] = tsne_result[:,0]
personalityScoresDummy['y'] = tsne_result[:,1]

[t-SNE] Computing 99 nearest neighbors...
[t-SNE] Indexed 100 samples in 0.000s...
[t-SNE] Computed neighbors for 100 samples in 0.004s...
[t-SNE] Computed conditional probabilities for sample 100 / 100
[t-SNE] Mean sigma: 3.894831
[t-SNE] KL divergence after 250 iterations with early exaggeration: 53.148167
[t-SNE] KL divergence after 300 iterations: 0.648216


In [41]:
personalityScoresDummy.to_csv(os.path.join(datapath, "dummyPersonalityScores"), index=False, header=True)
featuresDummy.to_csv(os.path.join(datapath, "dummyFeatureData"), index=False, header=True)
brightnessDummy.to_csv(os.path.join(datapath, "dummyBrightness"), index=False, header=True)
accelerometerDummy.to_csv(os.path.join(datapath, "dummyAccelerometer"), index=False, header=True)
gyroscopeDummy.to_csv(os.path.join(datapath, "dummyGyroscope"), index=False, header=True)

In [14]:
personalityScoresDummy1 = pd.read_csv(os.path.join(datapath, "dummyPersonalityScores"))
featuresDummy1 = pd.read_csv(os.path.join(datapath, "dummyFeatureData"))
brightnessDummy1 = pd.read_csv(os.path.join(datapath, "dummyBrightness"))
featuresDummy1 = pd.read_csv(os.path.join(datapath, "dummyAccelerometer"))
featuresDummy1 = pd.read_csv(os.path.join(datapath, "dummyGyroscope"))

In [22]:
df = pd.read_csv(os.path.join("../../data", "dummyFeatureData"))

In [17]:
print(brightnessDummy1.info)

<bound method DataFrame.info of          participantId        date  minuteOfTheDay   brt
0             dummy001  2020-01-08               0  0.40
1             dummy001  2020-01-08               1  0.38
2             dummy001  2020-01-08               2  0.72
3             dummy001  2020-01-08               3  0.10
4             dummy001  2020-01-08               4  0.16
...                ...         ...             ...   ...
25919995    dummy00600  2020-07-29            1435  0.80
25919996    dummy00600  2020-07-29            1436  0.99
25919997    dummy00600  2020-07-29            1437  0.80
25919998    dummy00600  2020-07-29            1438  0.01
25919999    dummy00600  2020-07-29            1439  0.97

[25920000 rows x 4 columns]>


In [21]:
df.to_csv(os.path.join("../../data", "dummyFeatureData"), index=False)