In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import ast
import sqlite3
import json

from pathlib import Path 

In [3]:
# Connect to database sqlite where to store the information regarding the Dribbble dataset.
conn = sqlite3.connect("dribbble_temporary.db")
c = conn.cursor()

# Skills

This file contains information about the **skills** of the Dribbble's users.

The corresponding raw data is the file *skills.txt*.

N.B. The file regarding this information has been obtained by Francesco.

### Raw data visualization

In [4]:
# Let's try to open the dataframe in order to visualize it.
skills = pd.read_csv(Path("raw_data/skills.txt"), sep = "\t", header = None, names = ["username", "info"])
skills

Unnamed: 0,username,info
0,cmaffuccio,[]
1,arestov_design,"['art direction', 'banking', 'cryptocurrency'..."
2,space307,"['illustration', 'motion graphics', 'ui', 'ux..."
3,Hido,[]
4,Avagana,[]
...,...,...
772805,thom_ryan,['hand rendered type design']
772806,mcborden88,[]
772807,JoseRivas,[]
772808,bridgetsandoval,[]


### Unpack information from raw data

In [5]:
skills["info"] = skills["info"].apply(lambda x: x.rstrip().lstrip())
skills["username"] = skills["username"].apply(lambda x: x.rstrip().lstrip())

In [6]:
# Unique users with ascertain no skills.
users_no_skills = pd.Series(skills[skills["info"] == "[]"].username.unique())

In [7]:
users_no_skills

0              cmaffuccio
1                    Hido
2                 Avagana
3              citrusbyte
4               bunninies
               ...       
584337             vovain
584338         mcborden88
584339          JoseRivas
584340    bridgetsandoval
584341           GoodWine
Length: 584342, dtype: object

In [8]:
len(users_no_skills)

584342

Each username has a list containing the information about the skills of the corresponding users. Unfortunately, the *temporal information* on when these skills were earned by users is not present in the data.

In [9]:
# Read the .tsv file with specified chunksize in order to preprocess it not all together.
chunksize = 10000
skills_chunk = pd.read_csv(Path("raw_data/skills.txt"), sep = "\t", names = ["username", "info"], chunksize = chunksize)

In [10]:
chunk_list = list()

# Each chunk is a dataframe.
for i,chunk in enumerate(skills_chunk):
    chunk["info"] = chunk["info"].apply(lambda x: x.rstrip().lstrip())
    chunk["username"] = chunk["username"].apply(lambda x: x.rstrip().lstrip())
    # Remove empty 'info' rows.
    chunk = chunk.loc[~(chunk["info"] == "[]")]
    if not chunk.empty:
        print("Cumulative not empty chunk: %d" % (chunksize*(i+1)))
        # We evaluate each row that contain strings containing Python expressions.
        chunk["info"] = chunk["info"].apply(lambda x: ast.literal_eval(x))
        # Flatten list of list over columns.
        chunk = chunk.explode("info")
        # Rename 'info' column.
        chunk.rename({"info": "skill"}, axis = 1, inplace = True)

        # Save the current chunk of data into list.
        chunk_list.append(chunk)

Cumulative not empty chunk: 10000
Cumulative not empty chunk: 20000
Cumulative not empty chunk: 30000
Cumulative not empty chunk: 40000
Cumulative not empty chunk: 50000
Cumulative not empty chunk: 60000
Cumulative not empty chunk: 70000
Cumulative not empty chunk: 80000
Cumulative not empty chunk: 90000
Cumulative not empty chunk: 100000
Cumulative not empty chunk: 110000
Cumulative not empty chunk: 120000
Cumulative not empty chunk: 130000
Cumulative not empty chunk: 140000
Cumulative not empty chunk: 150000
Cumulative not empty chunk: 160000
Cumulative not empty chunk: 170000
Cumulative not empty chunk: 180000
Cumulative not empty chunk: 190000
Cumulative not empty chunk: 200000
Cumulative not empty chunk: 210000
Cumulative not empty chunk: 220000
Cumulative not empty chunk: 230000
Cumulative not empty chunk: 240000
Cumulative not empty chunk: 250000
Cumulative not empty chunk: 260000
Cumulative not empty chunk: 270000
Cumulative not empty chunk: 280000
Cumulative not empty chunk: 2

In [11]:
# Concatenate all data of the users with at least one skill.
skills = pd.concat(chunk_list, ignore_index = False).reset_index(drop = True)
skills

Unnamed: 0,username,skill
0,arestov_design,art direction
1,arestov_design,banking
2,arestov_design,cryptocurrency
3,arestov_design,design systems
4,arestov_design,fintech
...,...,...
1103610,symple,online marketing
1103611,symple,php
1103612,symple,seo
1103613,symple,wordpress


### Preprocessing

In [12]:
# Load file that maps the multiple usernames of the 'users' table into an unique information.
with open("users_mapper_username.json") as f:
    map_users_multiple_profiles = json.load(f)

In [13]:
# We map the usernames with multiple profiles into 'users' table to have a single value.
# 'username'.
username_remap = skills.username[skills.username.isin(map_users_multiple_profiles.keys())].map(map_users_multiple_profiles, na_action = "ignore")
print("n° of 'username' usernames remapped: %d" % len(username_remap))
skills.username.loc[username_remap.index] = username_remap

# Users with no skills.
username_no_skills_remap = users_no_skills[users_no_skills.isin(map_users_multiple_profiles.keys())].map(map_users_multiple_profiles, na_action = "ignore")
print("n° of usernames with no skills remapped: %d" % len(username_no_skills_remap))
users_no_skills.loc[username_no_skills_remap.index] = username_no_skills_remap

n° of 'username' usernames remapped: 399
n° of usernames with no skills remapped: 639


In [14]:
# Check duplicated rows.
skills.duplicated().sum()

5161

In [15]:
# Remove duplicated rows.
skills.drop_duplicates(inplace = True)

In [16]:
# Check duplicated rows.
users_no_skills.duplicated().sum()

352

In [17]:
# Remove duplicated rows.
users_no_skills.drop_duplicates(inplace = True)

In [18]:
# Check NaN values.
skills.isna().sum()

username    0
skill       0
dtype: int64

In [19]:
# Check NaN values.
users_no_skills.isna().sum()

0

In [20]:
# Check if there exist an intersection between the users with at least one skill and the users with no skills.
intersection = list(set(skills.username.unique()) & set(users_no_skills))
intersection

['a2d',
 'NEWFLIX',
 'gustavo_meyer',
 'mcampello',
 'julienperriere',
 'Palmi',
 'przemob',
 'primaua',
 'kuon_yagi',
 'Nadezhda_Lebedeva',
 'janrvu',
 'jaynejosie',
 'micrazvandan',
 'JLax86',
 'UIFoxDesign',
 'makulker',
 'noobitter',
 'iamrishishah',
 'sarahschraderdesign',
 'GianlucaDelgadoPires',
 'kpugovkin',
 'Cipomark',
 'iamcelestah',
 'juancamilovilla',
 'davallan',
 'JeroenSchaper',
 'sayed',
 'ATotalBeardo',
 'LukaBliazze',
 'jagnagra',
 'BadTaste_',
 'helloabalone',
 'ufoo',
 'marusha',
 'bellayan',
 'analemos',
 'madebydanno',
 'dannyshaw',
 'sergiulupse',
 'theray',
 'Tka4enko',
 'nnick',
 'yankodesing',
 'dgtlistru',
 'pejcicnikola',
 'olajaszewska',
 'Jocelyn1332',
 'is567',
 'yuzhen01',
 'ali_zareshahi',
 'Kris_Olak',
 'katayo0o0on',
 'stephenandrewmurrill',
 'lizziedesign',
 'sidneykanoti',
 'ingriddesing',
 'sigra',
 'tibidigital',
 'r4vn',
 'efrespmx',
 'J_R_Speed',
 'erikalam',
 'zoll',
 'bishrantt',
 'bfagans',
 'jonway',
 'sweetie',
 'julialitkevich',
 'bonehau

In [21]:
len(intersection)

318

In [22]:
# Not consider these users into the users with no skills.
users_no_skills = users_no_skills.loc[~users_no_skills.isin(intersection)]

In [23]:
# Add to the dataframe the information regarding the users with no skills.
no_skills = pd.DataFrame(columns = skills.columns)
no_skills["username"] = users_no_skills.values
no_skills["username"] = no_skills["username"].apply(lambda x: x.rstrip().lstrip())

skills = pd.concat([skills, no_skills], ignore_index = False).reset_index(drop = True)

In [24]:
skills

Unnamed: 0,username,skill
0,arestov_design,art direction
1,arestov_design,banking
2,arestov_design,cryptocurrency
3,arestov_design,design systems
4,arestov_design,fintech
...,...,...
1682121,vovain,
1682122,mcborden88,
1682123,JoseRivas,
1682124,bridgetsandoval,


In [25]:
len(skills.username.unique())

770821

In [26]:
# Recheck duplicated rows after this remapping.
skills.duplicated().sum()

0

In [27]:
# Save the dataframe into the sql database.
skills.to_sql("skills", conn, index = False, dtype = {"username": "TEXT", "skill": "TEXT"})

In [28]:
conn.close()