In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import dask.dataframe as dd
import pandas as pd
import json
import sqlite3

from pathlib import Path

In [3]:
# Connect to database sqlite where to store the information regarding the Dribbble dataset.
conn = sqlite3.connect("dribbble_temporary.db")
c = conn.cursor()

# Followers

This file contains information about the **followers** and **followings** of Dribbble's users. 

The corresponding raw data is the file *followers.tsv*.

### Raw data visualization

In [4]:
# Let's try to open the dataframe in order to visualize it.
followers = dd.read_csv(Path("raw_data/followers.tsv"), sep = "\t", header = None, names = ["destination", "info"])

In [5]:
followers.head()

Unnamed: 0,destination,info
0,marklamb,"[[1511756060, ""cmaffuccio""], [1511432388, ""are..."
1,hrvoje-grubisic,"[[1513098860, ""taylor-creative""], [1513095452,..."
2,quaintinc,"[[1513044377, ""Lanaya66""], [1512963960, ""trist..."
3,hugowelke,"[[1498405006, ""bleaning""], [1467312933, ""Krisp..."
4,giantghost,"[[1452303125, ""joaquinlluis""], [1438591511, ""s..."


In [6]:
followers.tail()

Unnamed: 0,destination,info
150865,david_hildie,"[[1476211260, ""groupondesignunion""], [14714393..."
150866,abairagi,"[[1512653185, ""shirinkhara""], [1512131050, ""ab..."
150867,ioanacioc,"[[1511122049, ""alexandraandronache""], [1510048..."
150868,upensh,"[[1513650428, ""andreahock""], [1513571262, ""Arv..."
150869,vasstudio,"[[1514870580, ""manojrajput""], [1514542912, ""as..."


In [7]:
# Get the number of rows of the dataframe.
followers.shape[0].compute()

772074

Each destination (username) has a list of lists. Each list contains a pair of information: the time (Unix Time) at which the follow action takes place and the following user.

### Unpack information from raw data

In [8]:
# Read the large .tsv file with specified chunksize in order to preprocess it not all together.
chunksize = 10000
followers_chunk = pd.read_csv(Path("raw_data/followers.tsv"), sep = "\t", names = ["destination", "info"], chunksize = chunksize)

In [9]:
chunk_list = list()
users_no_followers = list()

# Each chunk is a dataframe.
for i,chunk in enumerate(followers_chunk):
    # Store information about users with no followers. 
    users_no_followers.append(chunk.loc[(chunk["info"] == "[]")]["destination"].values)
    # Remove empty 'info' rows.
    chunk = chunk.loc[~(chunk["info"] == "[]")]
    if not chunk.empty:
        print("Cumulative not empty chunk: %d" % (chunksize*(i+1)))
        # We evaluate each row that contain strings containing Python expressions.
        chunk["info"] = chunk["info"].apply(lambda x: json.loads(x))
        # Flatten list of list over columns.
        chunk = chunk.explode("info")
        # Extract all the information from this chunk of data.
        chunk = pd.DataFrame(chunk["info"].tolist(), index = chunk["destination"], columns = ["created_at", "source"]).reset_index()

        # Save the current chunk of data into list.
        chunk_list.append(chunk)

Cumulative not empty chunk: 10000
Cumulative not empty chunk: 20000
Cumulative not empty chunk: 30000
Cumulative not empty chunk: 40000
Cumulative not empty chunk: 50000
Cumulative not empty chunk: 60000
Cumulative not empty chunk: 70000
Cumulative not empty chunk: 80000
Cumulative not empty chunk: 320000
Cumulative not empty chunk: 330000
Cumulative not empty chunk: 340000
Cumulative not empty chunk: 370000
Cumulative not empty chunk: 380000
Cumulative not empty chunk: 410000
Cumulative not empty chunk: 430000
Cumulative not empty chunk: 440000
Cumulative not empty chunk: 450000
Cumulative not empty chunk: 460000
Cumulative not empty chunk: 490000
Cumulative not empty chunk: 500000
Cumulative not empty chunk: 510000
Cumulative not empty chunk: 520000
Cumulative not empty chunk: 530000
Cumulative not empty chunk: 540000
Cumulative not empty chunk: 550000
Cumulative not empty chunk: 570000
Cumulative not empty chunk: 580000
Cumulative not empty chunk: 590000
Cumulative not empty chunk: 

In [10]:
# Concatenate all data of the users with at least one follower.
followers = pd.concat(chunk_list, ignore_index = False).reset_index(drop = True)
followers.head()

Unnamed: 0,destination,created_at,source
0,marklamb,1511756060,cmaffuccio
1,marklamb,1511432388,arestov_design
2,marklamb,1511368675,space307
3,marklamb,1511081409,Hido
4,marklamb,1511065426,Avagana


### Preprocessing

In [11]:
# Load file that maps the multiple usernames of the 'users' table into an unique information.
with open("users_mapper_username.json") as f:
    map_users_multiple_profiles = json.load(f)

In [12]:
# Flatten list of users with no followers.
users_no_followers = pd.Series(users_no_followers).explode().reset_index(drop = True)

In [13]:
# We map the usernames with multiple profiles into 'users' table to have a single value.
# 'destination'.
destination_remap = followers.destination[followers.destination.isin(map_users_multiple_profiles.keys())].map(map_users_multiple_profiles, na_action = "ignore")
print("n° of 'destination' usernames remapped: %d" % len(destination_remap))
followers.destination.loc[destination_remap.index] = destination_remap

# 'source'.
source_remap = followers.source[followers.source.isin(map_users_multiple_profiles.keys())].map(map_users_multiple_profiles, na_action = "ignore")
print("n° of 'source' usernames remapped: %d" % len(source_remap))
followers.source.loc[source_remap.index] = source_remap

# Users with no followers.
users_no_followers_remap = users_no_followers[users_no_followers.isin(map_users_multiple_profiles.keys())].map(map_users_multiple_profiles, na_action = "ignore")
print("n° of usernames with no followers remapped: %d" % len(users_no_followers_remap))
users_no_followers.loc[users_no_followers_remap.index] = users_no_followers_remap

n° of 'destination' usernames remapped: 56958
n° of 'source' usernames remapped: 42148
n° of usernames with no followers remapped: 371


In [14]:
# Check duplicated rows.
followers.duplicated().sum()

326534

In [15]:
# Remove duplicated rows.
followers.drop_duplicates(inplace = True)

In [16]:
# Check duplicated rows.
users_no_followers.duplicated().sum()

962

In [17]:
# Remove duplicated rows.
users_no_followers.drop_duplicates(inplace = True)

In [18]:
# Check if there are some NaN users ('destination' and 'source').
followers.isna().sum()

destination    2
created_at     0
source         0
dtype: int64

In [19]:
# Remove rows with NaN values.
followers.dropna(axis = 0, how = "any", inplace = True)

In [20]:
# Check if there are some NaN users.
users_no_followers.isna().sum()

1

In [21]:
users_no_followers.dropna(inplace = True)

In [22]:
# Check if there exist an intersection between the users with at least one follower and the users with no followers.
intersection = list(set(followers.destination.unique()) & set(users_no_followers))
intersection

['olemgruber',
 'zvonimircamber',
 'origomez',
 'Knightly',
 'Seven_Song',
 'mirakyns',
 'Palmi',
 'claudiamorales',
 'Shawn_',
 'm4rp79',
 'lekeojo',
 'YoussefEmadEldin',
 'DingLiu',
 'anggapermana',
 'toyfu',
 'Snow911',
 'advarto',
 'DianaGoh',
 'im_abhishekp',
 'mattculbertson',
 'przemob',
 'james-the-designer',
 'MatadorianGray',
 'Ryan77',
 'MoshiMoshiMedia',
 'Anuki',
 'seahuang',
 'ShinDoeun',
 'sarahtrad',
 'larsroed',
 'scottxchilds',
 'xavcz',
 'Sharon_HXQ',
 'Muppets',
 'MeiruiDesign',
 'julialitkevich',
 'Mahui',
 'AyoJon',
 'reallyaditya',
 'borodach',
 'HishamTourbak',
 'sunny-vision',
 'Sukilam',
 'ivv',
 'LaisyWang0912',
 'albertosoft',
 'Thelittleeverything',
 'Gxing',
 'Mansion_Cai',
 'Mister_Undying',
 'isaaclemon',
 'evavidovic_',
 'J_R_Speed',
 'theray',
 'dellfi',
 'lukas-nkz',
 'NANCYGONG',
 'zainalow',
 'BleedingEyes',
 'zhenyabelan',
 'mahmoudemara',
 'chinran',
 'uxwis',
 'ahmedsayed',
 'Lvan_13',
 'joshbaptista',
 'primaua',
 'sharminshima',
 'Leo_deisgn',


In [23]:
len(intersection)

90

In [24]:
# Not consider these users into the users with no followers.
users_no_followers = users_no_followers.loc[~users_no_followers.isin(intersection)]

In [25]:
list(set(followers.destination.unique()) & set(users_no_followers))

[]

In [26]:
# Check if the key 'destination' + 'source' is unique.
followers.set_index(["destination", "source"], inplace = True)

In [27]:
followers.index.is_unique

False

In [28]:
not_unique = followers.index.value_counts()
# The number of couple users with multiple contacts.
print(sum(not_unique > 1))

not_unique = not_unique[not_unique > 1]

not_unique = followers.loc[not_unique.index]
not_unique.head()

24


Unnamed: 0_level_0,Unnamed: 1_level_0,created_at
destination,source,Unnamed: 2_level_1
miindofchris,shakuro,1512382855
miindofchris,shakuro,1514547494
Gushn,BadTaste_,1513431494
Gushn,BadTaste_,1514261805
Gis1on,Bill_Miao,1509611585


In [29]:
followers.reset_index(inplace = True)

In [30]:
# Let's keep the rows less updated regarding 'created_at'.
followers = followers[followers["created_at"] == followers.groupby(["destination", "source"])["created_at"].transform("min")]
followers.head()

Unnamed: 0,destination,source,created_at
0,marklamb,cmaffuccio,1511756060
1,marklamb,arestov_design,1511432388
2,marklamb,space307,1511368675
3,marklamb,Hido,1511081409
4,marklamb,Avagana,1511065426


In [31]:
followers.set_index(["destination", "source"]).index.is_unique

True

In [32]:
len(followers)

22033777

In [33]:
# Add to the dataframe the information regarding the users with no followers.
no_followers = pd.DataFrame(columns = followers.columns)
no_followers["destination"] = users_no_followers.values

followers = pd.concat([followers, no_followers], ignore_index = False).reset_index(drop = True)

In [34]:
# Convert to datetime UTC.
# We decide to not save the UTC offset (+00:00).
followers.created_at = pd.to_datetime(followers.created_at, unit = "s", utc = True).dt.tz_convert(None)

In [35]:
followers

Unnamed: 0,destination,source,created_at
0,marklamb,cmaffuccio,2017-11-27 04:14:20
1,marklamb,arestov_design,2017-11-23 10:19:48
2,marklamb,space307,2017-11-22 16:37:55
3,marklamb,Hido,2017-11-19 08:50:09
4,marklamb,Avagana,2017-11-19 04:23:46
...,...,...,...
22660838,ecorreia__,,NaT
22660839,pkam,,NaT
22660840,manish_minglani,,NaT
22660841,Brandydy,,NaT


In [36]:
followers.destination.isna().sum()

0

In [37]:
# Check if users into 'source' are always contained into 'destination'.
set(followers.source.dropna().unique()).issubset(set(followers.destination.unique()))

False

In [38]:
# Check users into 'source' that not have any information in 'destination'.
difference = list(set(followers.source.dropna().unique()) - set(followers.destination.unique()))
difference

['13932306898yan',
 'leonardofaria',
 'kruttika',
 'sajawalacademy',
 'NA',
 'LeJit',
 'Devi30',
 'l792836',
 'BenH',
 'hollowsphere',
 'jordanbvidrine',
 'nan',
 'null',
 'CarlaBarahona',
 '2is0']

In [39]:
# Keep only users from 'source' that are also in 'destination'.
followers = followers[(followers.source.isin(followers.destination.unique())) | (followers.source.isna())]

In [40]:
# Check if users into 'source' are always contained into 'destination'.
set(followers.source.dropna().unique()).issubset(set(followers.destination.unique()))

True

In [41]:
len(followers)

22660662

In [42]:
# Check duplicated rows.
followers.duplicated().sum()

0

In [43]:
followers.destination.isna().sum()

0

In [44]:
followers.source.isna().sum()

627066

In [45]:
len(followers.destination.unique())

770767

In [46]:
# Save the dataframe into the sql database.
followers.to_sql("followers", conn, index = False, dtype = {"destination": "TEXT", "created_at": "TIMESTAMP", "source": "TEXT"})

In [47]:
conn.close()