In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import dask.dataframe as dd
import pandas as pd
import json
import sqlite3

from pathlib import Path 

In [3]:
# Open the SQLite database in which the information about the Dribbble dataset is stored.
conn = sqlite3.connect(database = "dribbble_temporary.db")
# NOTE(review): this cursor is not used by any of the cells shown — confirm it is still needed.
c = conn.cursor()

# Comments

This file contains information about the **comments** on the shots (posts) of Dribbble. 

The corresponding raw data is the file *comments.tsv*.

### Raw data visualization

In [4]:
# Open the raw tab-separated file with dask to take a first look at it.
# No header row is read from the file (header = None); the two column names are supplied explicitly.
comments = dd.read_csv(
    Path("raw_data/comments.tsv"),
    sep = "\t",
    header = None,
    names = ["shot_id", "info"],
)

In [5]:
# Preview the first five rows of the raw data.
comments.head(n = 5)

Unnamed: 0,shot_id,info
0,26,[]
1,30,[]
2,35,[]
3,49,[]
4,51,[]


In [6]:
# Preview the last five rows of the raw data.
comments.tail(n = 5)

Unnamed: 0,shot_id,info
114,1109343,"[[2641668, 1370933862, 1370933862, ""jakubspitz..."
115,550172,"[[1330211, 1336471790, 1336471790, ""gadzhi"", ""..."
116,561492,"[[1358483, 1337102790, 1337102790, ""jon_patter..."
117,77356,"[[199460, 1289403530, 1289403530, ""ThisIsKonra..."
118,1489592,"[[3501565, 1396369121, 1396369121, ""roguepixl""..."


In [7]:
# Get the number of rows of the dataframe.
# `comments` is a dask dataframe at this point, hence the explicit `.compute()` to obtain the actual number.
comments.shape[0].compute()

2483477

Each shot has a list of lists, one inner list per comment. Each inner list contains, in order: the ID of the comment, the date the comment was published (Unix time), the date the comment was last modified (Unix time), the author of the comment, the content of the comment and the number of likes it received.

### Unpack information from raw data

In [8]:
# Re-read the large .tsv file with pandas, this time in fixed-size chunks,
# so that it can be preprocessed piece by piece instead of all at once.
chunksize = 10000
comments_chunk = pd.read_csv(
    Path("raw_data/comments.tsv"),
    sep = "\t",
    names = ["shot_id", "info"],
    chunksize = chunksize,
)

In [9]:
chunk_list = list()
shots_no_comments = list()

# Each chunk is a dataframe with the columns 'shot_id' and 'info'.
for i, chunk in enumerate(comments_chunk):
    # A shot has no comments when its 'info' is the literal string "[]".
    has_no_comments = chunk["info"] == "[]"
    # Store the IDs of the shots with no comments (one array per chunk, possibly empty).
    shots_no_comments.append(chunk.loc[has_no_comments, "shot_id"].values)
    # Keep only the rows with at least one comment.
    chunk = chunk.loc[~has_no_comments]
    if not chunk.empty:
        print("Cumulative not empty chunk: %d" % (chunksize*(i+1)))
        # Parse each JSON string into a list of comments, then flatten to one comment per row.
        # A new Series indexed by shot ID is built instead of assigning back into the filtered
        # slice, which would be a write on a possible copy of the original chunk.
        comment_rows = chunk.set_index("shot_id")["info"].map(json.loads).explode()
        # Spread the fields of every comment over dedicated columns and bring 'shot_id' back as a column.
        chunk = pd.DataFrame(comment_rows.tolist(), index = comment_rows.index,
                             columns = ["comment_id", "created_at", "updated_at", "author_comment", "comment", "likes_count"]).reset_index()

        # Save the current chunk of data into list.
        chunk_list.append(chunk)

Cumulative not empty chunk: 1370000
Cumulative not empty chunk: 1380000
Cumulative not empty chunk: 1390000
Cumulative not empty chunk: 1400000
Cumulative not empty chunk: 1410000
Cumulative not empty chunk: 1420000
Cumulative not empty chunk: 1430000
Cumulative not empty chunk: 1440000
Cumulative not empty chunk: 1450000
Cumulative not empty chunk: 1460000
Cumulative not empty chunk: 1470000
Cumulative not empty chunk: 1480000
Cumulative not empty chunk: 1490000
Cumulative not empty chunk: 1500000
Cumulative not empty chunk: 1510000
Cumulative not empty chunk: 1520000
Cumulative not empty chunk: 1530000
Cumulative not empty chunk: 1540000
Cumulative not empty chunk: 1550000
Cumulative not empty chunk: 1560000
Cumulative not empty chunk: 1570000
Cumulative not empty chunk: 1580000
Cumulative not empty chunk: 1590000
Cumulative not empty chunk: 1600000
Cumulative not empty chunk: 1610000
Cumulative not empty chunk: 1620000
Cumulative not empty chunk: 1630000
Cumulative not empty chunk: 

In [10]:
# Stack the per-chunk dataframes (shots with at least one comment) under a fresh 0..n-1 index.
comments = pd.concat(chunk_list, ignore_index = True)

In [11]:
# Preview the first five unpacked comments.
comments.head(n = 5)

Unnamed: 0,shot_id,comment_id,created_at,updated_at,author_comment,comment,likes_count
0,295643,738715,1318942035,1318942035,liammckay,<p>Clever stuff!</p>,1
1,1696672,3994002,1412889025,1412889025,chadfullerton,<p>Love this. Great work!</p>,1
2,1696672,3994160,1412899630,1412899630,sammer,"<p>Thanks <a href=""https://dribbble.com/18675""...",0
3,1691148,3889476,1408474083,1408474083,jalenconner,<p>really diggin' the three columns. Are you d...,1
4,1691148,3891158,1408542426,1408542426,arnaudschlupp,"<p><a href=""https://dribbble.com/357364"">@Jale...",1


### Preprocessing

In [12]:
# Load the JSON file that maps the multiple usernames of the 'users' table to a single unique value.
map_users_multiple_profiles = json.loads(Path("users_mapper_username.json").read_text())

In [13]:
# Replace the usernames that have multiple profiles in the 'users' table with their single mapped value.
# 'author_comment'
needs_remap = comments["author_comment"].isin(map_users_multiple_profiles.keys())
author_map = comments.loc[needs_remap, "author_comment"].map(map_users_multiple_profiles, na_action = "ignore")
print("n° of 'author_comment' usernames remapped: %d" % len(author_map))
# Write back with a single .loc call on the frame: assigning through `comments.author_comment.loc[...]`
# is a chained assignment and is not guaranteed to modify `comments` itself.
comments.loc[author_map.index, "author_comment"] = author_map

n° of 'author_comment' usernames remapped: 1515


In [14]:
# Count the rows that are exact duplicates of an earlier row.
comments.duplicated(keep = "first").sum()

0

In [15]:
# Count the NaN values of each column.
comments.isna().sum(axis = 0)

shot_id           0
comment_id        0
created_at        0
updated_at        0
author_comment    0
comment           0
likes_count       0
dtype: int64

In [16]:
# Flatten the per-chunk arrays of shot IDs with no comments into a single Series with a fresh index.
no_comment_ids_per_chunk = pd.Series(shots_no_comments)
shots_no_comments = no_comment_ids_per_chunk.explode().reset_index(drop = True)

In [17]:
# Check if there are NaN shot IDs among the shots with no comments.
# NOTE(review): presumably these NaNs come from chunks that contributed an empty array,
# which `explode` turns into a NaN entry, rather than from missing IDs in the raw file — confirm.
shots_no_comments.isna().sum()

24

In [18]:
# Discard the NaN entries found above.
shots_no_comments = shots_no_comments.dropna()

In [19]:
# Count the shot IDs that appear more than once among the shots with no comments.
shots_no_comments.duplicated(keep = "first").sum()

0

In [20]:
# Cast the shot IDs to integers now that the NaN entries have been removed.
shots_no_comments = shots_no_comments.astype(dtype = int)

In [21]:
# Check whether any shot ID appears both among the shots with at least one comment
# and among the shots with no comments (an empty list means the two groups are disjoint).
list(set(comments["shot_id"].unique()).intersection(set(shots_no_comments)))

[]

In [22]:
# Append one row per shot without comments: only 'shot_id' is filled, every other column is left empty.
no_comments = pd.DataFrame(columns = comments.columns)
no_comments["shot_id"] = shots_no_comments.values

# Stack both groups under a fresh 0..n-1 index.
comments = pd.concat([comments, no_comments], ignore_index = True)

In [23]:
# Convert the Unix timestamps (in seconds) to datetimes.
# They are parsed as UTC and then made timezone-naive, so the UTC offset (+00:00) is not saved.
for time_col in ("created_at", "updated_at"):
    comments[time_col] = pd.to_datetime(comments[time_col], unit = "s", utc = True).dt.tz_convert(None)

In [24]:
# Count the comments whose 'updated_at' is earlier than 'created_at'.
# A result of 0 means 'updated_at' is never before 'created_at' (the two may be equal).
# Rows without timestamps (NaT) never compare as negative, so they are not counted.
timediff = comments.updated_at - comments.created_at
# Vectorised count; the builtin sum() would iterate over every element in Python.
int((timediff < pd.Timedelta(0)).sum())

0

In [25]:
# Display the resulting dataframe: comments first, then the rows of the shots without comments.
comments

Unnamed: 0,shot_id,comment_id,created_at,updated_at,author_comment,comment,likes_count
0,295643,738715,2011-10-18 12:47:15,2011-10-18 12:47:15,liammckay,<p>Clever stuff!</p>,1
1,1696672,3994002,2014-10-09 21:10:25,2014-10-09 21:10:25,chadfullerton,<p>Love this. Great work!</p>,1
2,1696672,3994160,2014-10-10 00:07:10,2014-10-10 00:07:10,sammer,"<p>Thanks <a href=""https://dribbble.com/18675""...",0
3,1691148,3889476,2014-08-19 18:48:03,2014-08-19 18:48:03,jalenconner,<p>really diggin' the three columns. Are you d...,1
4,1691148,3891158,2014-08-20 13:47:06,2014-08-20 13:47:06,arnaudschlupp,"<p><a href=""https://dribbble.com/357364"">@Jale...",1
...,...,...,...,...,...,...,...
6276987,3044495,,NaT,NaT,,,
6276988,3651916,,NaT,NaT,,,
6276989,1580545,,NaT,NaT,,,
6276990,3151559,,NaT,NaT,,,


In [26]:
# Total number of rows: one per comment plus one per shot without comments.
comments.shape[0]

6276992

In [27]:
# The pair ('shot_id', 'comment_id') is meant to be the unique key of the table: verify that it is.
comments.set_index(keys = ["shot_id", "comment_id"]).index.is_unique

True

In [28]:
# Count the fully duplicated rows once more, after adding the shots without comments.
comments.duplicated(keep = "first").sum()

0

In [29]:
# Distinct shot IDs present in the table.
comments["shot_id"].unique()

array([ 295643, 1696672, 1691148, ..., 1580545, 3151559, 3844445],
      dtype=int64)

In [30]:
# Count the rows with a missing shot ID.
comments["shot_id"].isna().sum()

0

In [31]:
# List the columns that will be written to the database table.
comments.columns

Index(['shot_id', 'comment_id', 'created_at', 'updated_at', 'author_comment',
       'comment', 'likes_count'],
      dtype='object')

In [32]:
# SQL column types to use for each column of the sqlite table.
dtype = {
    "shot_id": "INT",
    "comment_id": "INT",
    "created_at": "TIMESTAMP",
    "updated_at": "TIMESTAMP",
    "author_comment": "TEXT",
    "comment": "TEXT",
    "likes_count": "INT",
}

In [33]:
# Write the dataframe to the 'comments' table of the sql database, without the dataframe index.
# NOTE(review): `if_exists` is left at its default, so re-running this cell against a database
# that already has a 'comments' table is expected to fail — confirm this is intended.
comments.to_sql(name = "comments", con = conn, index = False, dtype = dtype)

In [34]:
# Close the database connection opened at the top of the notebook.
conn.close()