In [1]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides any warning raised by the pandas/dask cells below;
# consider restricting it to the specific categories that are known to be harmless.
warnings.filterwarnings("ignore")

In [2]:
import dask.dataframe as dd
import numpy as np
import pandas as pd
import json
import sqlite3

from pathlib import Path 

In [3]:
# Open the SQLite database file (created if it does not exist yet) that stores the processed Dribbble tables.
conn = sqlite3.connect("dribbble_temporary.db")
# Cursor shared by the cells below for bulk inserts and schema changes.
c = conn.cursor()

# Likes

This file contains information about the **likes** given to shots (posts) only. Likes on comments are not listed individually: the 'comments' table stores only the number of likes that each comment received.

The corresponding raw data is the file *likes.tsv*.

### Raw data visualization

In [4]:
# Lazily open the raw tab-separated file with dask to inspect it.
# The file has no header row, so the two column names are supplied explicitly.
likes = dd.read_csv(
    Path("raw_data/likes.tsv"),
    sep = "\t",
    header = None,
    names = ["shot_id", "info"],
)

In [5]:
# Preview the first rows of the raw data.
likes.head()

Unnamed: 0,shot_id,info
0,345360,[]
1,587156,[]
2,1590969,[]
3,1786592,[]
4,1217250,[]


In [6]:
# Preview the last rows of the raw data.
# NOTE(review): the index labels shown are presumably local to the last dask partition, not global row numbers.
likes.tail()

Unnamed: 0,shot_id,info
6041,4057382,"[[106071677, 1514878296, ""zhaoxiangan""], [1060..."
6042,4045712,"[[105678380, 1514323145, ""khatib""], [105651598..."
6043,4057420,"[[106134902, 1514915239, ""IvanNikolow""], [1061..."
6044,4049411,"[[105915954, 1514564126, ""CauMardegan""], [1058..."
6045,4057416,"[[106076513, 1514881164, ""_philmillward""], [10..."


In [7]:
# Count the rows of the raw file (this triggers a full pass over the data).
len(likes)

2484405

Each shot has a list of lists. Each inner list contains, in order, the ID of the like, the time (Unix time) at which the like was left, and the username of the user who left it.

### Unpack information from raw data

In [8]:
# (Re)create the 'likes' table in the SQLite database.
# The database file persists between kernel sessions, so a plain CREATE TABLE fails with
# "table likes already exists" when the notebook is re-run from the top. Dropping the table first
# makes this cell repeatable; the cells below repopulate it entirely from the raw file.
conn.execute("DROP TABLE IF EXISTS likes")
conn.execute("CREATE TABLE likes (shot_id INT, like_id INT, created_at_unix INT, author_like TEXT)");

In [9]:
# Load the mapper that collapses the multiple usernames found in the 'users' table into a single value.
mapper_path = Path("users_mapper_username.json")
map_users_multiple_profiles = json.loads(mapper_path.read_text())

In [10]:
# Iterate over the large .tsv file in chunks of `chunksize` rows so that it is never loaded all at once.
# Passing `names` already makes pandas treat the file as header-less; `header = None` only states it explicitly.
chunksize = 10000
likes_chunk = pd.read_csv(
    Path("raw_data/likes.tsv"),
    sep = "\t",
    header = None,
    names = ["shot_id", "info"],
    chunksize = chunksize,
)

In [11]:
def canonical_username(username, mapping):
    """Return the single username that `mapping` assigns to `username`.

    Usernames that are not keys of `mapping` are returned unchanged.
    """
    return mapping.get(username, username)


shots_no_likes = list()

# Each chunk is a dataframe holding at most `chunksize` raw rows.
for i, chunk in enumerate(likes_chunk):
    has_no_likes = chunk["info"] == "[]"
    # Store the IDs of the shots with no likes: one array per chunk, empty when every shot has likes.
    shots_no_likes.append(chunk.loc[has_no_likes, "shot_id"].values)

    # Keep only the shots that received at least one like.
    liked = chunk.loc[~has_no_likes]
    if liked.empty:
        continue
    print("Cumulative not empty chunk: %d" % (chunksize*(i+1)))

    # Parse each JSON string into a list of [like_id, created_at_unix, author_like] lists, then give
    # every like its own row. `assign` builds a new frame instead of writing into a slice of `chunk`.
    liked = liked.assign(info = liked["info"].map(json.loads)).explode("info")

    # Build plain-Python rows (shot_id, like_id, created_at_unix, author_like).
    # The username is mapped with a dictionary lookup rather than np.vectorize, which infers its
    # output dtype from the first element and would coerce every value to that type.
    rows = [
        (shot_id, like_id, created_at_unix, canonical_username(author_like, map_users_multiple_profiles))
        for shot_id, (like_id, created_at_unix, author_like) in zip(liked["shot_id"].tolist(), liked["info"])
    ]

    # Bulk-insert the rows of this chunk and commit them before reading the next one.
    c.executemany("INSERT INTO likes (shot_id, like_id, created_at_unix, author_like) VALUES (?, ?, ?, ?)", rows)
    conn.commit()

Cumulative not empty chunk: 30000
Cumulative not empty chunk: 40000
Cumulative not empty chunk: 50000
Cumulative not empty chunk: 60000
Cumulative not empty chunk: 70000
Cumulative not empty chunk: 80000
Cumulative not empty chunk: 90000
Cumulative not empty chunk: 100000
Cumulative not empty chunk: 110000
Cumulative not empty chunk: 120000
Cumulative not empty chunk: 130000
Cumulative not empty chunk: 140000
Cumulative not empty chunk: 150000
Cumulative not empty chunk: 160000
Cumulative not empty chunk: 170000
Cumulative not empty chunk: 180000
Cumulative not empty chunk: 190000
Cumulative not empty chunk: 200000
Cumulative not empty chunk: 210000
Cumulative not empty chunk: 220000
Cumulative not empty chunk: 230000
Cumulative not empty chunk: 240000
Cumulative not empty chunk: 250000
Cumulative not empty chunk: 260000
Cumulative not empty chunk: 270000
Cumulative not empty chunk: 280000
Cumulative not empty chunk: 290000
Cumulative not empty chunk: 300000
Cumulative not empty chunk:

Cumulative not empty chunk: 2340000
Cumulative not empty chunk: 2350000
Cumulative not empty chunk: 2360000
Cumulative not empty chunk: 2370000
Cumulative not empty chunk: 2380000
Cumulative not empty chunk: 2390000
Cumulative not empty chunk: 2400000
Cumulative not empty chunk: 2410000
Cumulative not empty chunk: 2420000
Cumulative not empty chunk: 2430000
Cumulative not empty chunk: 2440000
Cumulative not empty chunk: 2450000
Cumulative not empty chunk: 2460000
Cumulative not empty chunk: 2470000
Cumulative not empty chunk: 2480000
Cumulative not empty chunk: 2490000


In [12]:
# Turn the per-chunk arrays of like-less shot IDs into one flat Series with a fresh 0..n-1 index.
shots_no_likes = pd.Series(shots_no_likes).explode(ignore_index = True)

### Preprocessing

In [13]:
# Count the missing values. They are an artefact of the flattening step: every chunk without any
# like-less shot contributed an empty array, and exploding an empty array yields NaN.
shots_no_likes.isna().sum()

240

In [14]:
# Discard the NaN placeholders left by the empty per-chunk arrays.
shots_no_likes = shots_no_likes.dropna()

In [15]:
# Count the shot IDs that appear more than once among the shots with no likes.
shots_no_likes.duplicated().sum()

0

In [16]:
# Cast the shot IDs to integers.
# NOTE(review): presumably needed because the Series is object-typed after explode/dropna; confirm.
shots_no_likes = shots_no_likes.astype(int)

In [17]:
# Number of shots that received no likes.
len(shots_no_likes)

23874

In [18]:
# Load the IDs of the shots that have at least one like.
# Only the distinct IDs are needed for the check below, so SQLite deduplicates them instead of
# pandas loading one row per like into memory.
likes = pd.read_sql("SELECT DISTINCT shot_id FROM likes", conn)

In [19]:
# Sanity check: a shot ID must not appear both among the shots with at least one like and among
# the shots with no likes. An empty list means the two groups are disjoint.
list(set(likes.shot_id.unique()) & set(shots_no_likes))

[]

In [18]:
# Remove duplicated rows, keeping for every like_id only the row with the highest rowid (slow statement).
# NOTE: GROUP BY places all NULL like_id values in one group, so this cell must run before the
# placeholder rows for shots without likes (NULL like_id) are inserted further below; otherwise
# all but one of those placeholders would be deleted.
c.execute("""DELETE FROM likes WHERE rowid NOT IN (SELECT max(rowid) FROM likes GROUP BY like_id)""")
conn.commit()

In [19]:
# Add a 'created_at' column holding the like time as a UTC datetime string derived from the Unix time.
c.execute("""ALTER TABLE likes ADD COLUMN created_at TIMESTAMP""")
# datetime(x, 'unixepoch') interprets x as seconds since 1970-01-01 00:00:00 UTC.
# A single UPDATE is enough: copying created_at_unix into the column first would be overwritten
# for every row by this statement, at the cost of an extra full-table write.
c.execute("""UPDATE likes SET created_at = datetime(created_at_unix, 'unixepoch')""")
conn.commit()

In [20]:
# Stream the table back in chunks of `chunksize` rows, which is enough for a quick look at the
# result without loading the whole table.
chunksize = 10000
likes_chunk = pd.read_sql(sql = "SELECT * FROM likes", con = conn, chunksize = chunksize)

In [21]:
# Fetch and display the first chunk of the table (this advances the iterator by one chunk).
next(likes_chunk)

Unnamed: 0,shot_id,like_id,created_at_unix,author_like,created_at
0,2686725,100555000,1509635279,sexysev,2017-11-02 15:07:59
1,2686725,99914321,1509022719,KseniaProkopova,2017-10-26 12:58:39
2,2686725,97342964,1506537160,citrusbyte,2017-09-27 18:32:40
3,2686725,94318870,1503522249,zapadenko,2017-08-23 21:04:09
4,2686725,73483218,1481441298,xt0rted,2016-12-11 07:28:18
...,...,...,...,...,...
9995,3246627,82762150,1491629036,divanraj,2017-04-08 05:23:56
9996,3246627,82652073,1491495650,brycejacobson,2017-04-06 16:20:50
9997,3246627,82645905,1491490544,florencechevalier,2017-04-06 14:55:44
9998,3246627,81909285,1490728814,krasotin,2017-03-28 19:20:14


In [22]:
# Add to the 'likes' table one placeholder row per shot with no likes.
# Only shot_id is supplied; the remaining columns (like_id, created_at_unix, author_like,
# created_at) are left NULL. Inserting the IDs directly avoids pulling a whole chunk from the
# read iterator just to obtain the column names.
rows_no_likes = [(int(shot_id),) for shot_id in shots_no_likes]

# Bulk-insert the placeholder rows.
c.executemany("INSERT INTO likes (shot_id) VALUES (?)", rows_no_likes)
conn.commit()

In [23]:
# Close the chunk iterator, which is no longer needed.
likes_chunk.close()

In [26]:
# Read back the two key columns of the 'likes' table to verify its final content.
# NOTE(review): like_id is displayed as float, presumably because the placeholder rows of the
# shots without likes carry a NULL like_id; confirm.
likes = pd.read_sql("SELECT shot_id, like_id FROM likes", conn)
likes.head()

Unnamed: 0,shot_id,like_id
0,2686725,100555000.0
1,2686725,99914321.0
2,2686725,97342964.0
3,2686725,94318870.0
4,2686725,73483218.0


In [27]:
# True when no (shot_id, like_id) pair occurs more than once in the table.
not likes.duplicated(subset = ["shot_id", "like_id"]).any()

True

In [24]:
# VACUUM rebuilds the database file so that it takes less space on disk, which is especially
# useful after the deletions carried out above.
c.execute("VACUUM");

In [25]:
# Close the database connection; all changes were committed in the cells above.
conn.close()