In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import dask.dataframe as dd
import pandas as pd
import json
import sqlite3

from pathlib import Path 

In [3]:
pd.set_option("max_colwidth", 30)

In [4]:
# Connect to the SQLite database where the information regarding the Dribbble dataset is stored.
# A later cell reads the 'users' table from this same connection, so the database file is
# expected to already contain that table.
conn = sqlite3.connect("dribbble_temporary.db")
# NOTE(review): this cursor is not used by any cell visible in this notebook — confirm before removing.
c = conn.cursor()

# Shots

This file contains information about the **shots** (posts) published by the users on Dribbble. 

The corresponding raw data is the file *shots.tsv*.

### Raw data visualization

In [5]:
# Let's try to open the dataframe in order to visualize it.
shots = dd.read_csv(Path("raw_data/shots.tsv"), sep = "\t", header = None, names = ["author_shot", "info"])

In [6]:
shots.head()

Unnamed: 0,author_shot,info
0,cmaffuccio,[]
1,Hido,[]
2,Avagana,[]
3,prem_kumar,[]
4,bunninies,[]


In [7]:
shots.tail()

Unnamed: 0,author_shot,info
135867,Shylash,"[{""height"": 300, ""comments..."
135868,upensh,"[{""height"": 300, ""comments..."
135869,JeroenSchaper,"[{""height"": 300, ""comments..."
135870,barrios_tony,"[{""height"": 300, ""comments..."
135871,vasstudio,"[{""height"": 300, ""comments..."


In [8]:
# Get the number of rows of the dataframe.
shots.shape[0].compute()

771264

Each author has a list of dictionaries. Each dictionary contains the information related to a shot published by the author.

### Unpack information from raw data

In [9]:
# Read the large .tsv file with specified chunksize in order to preprocess it not all together.
chunksize = 10000
shots_chunk = pd.read_csv(Path("raw_data/shots.tsv"), sep = "\t", names = ["author_shot", "info"], chunksize = chunksize)

In [10]:
chunk_list = list()
authors_no_shots = list()

# Each chunk is a dataframe with the columns 'author_shot' and 'info'.
for i, chunk in enumerate(shots_chunk):
    # Rows whose 'info' is the literal string "[]" belong to authors with no shots.
    has_no_shots = chunk["info"] == "[]"
    # Store the usernames of the authors with no shots (one array per chunk).
    authors_no_shots.append(chunk.loc[has_no_shots, "author_shot"].values)
    # Remove empty 'info' rows.
    chunk = chunk.loc[~has_no_shots]
    if not chunk.empty:
        print("Cumulative not empty chunk: %d" % (chunksize*(i+1)))
        # Parse each JSON string into a list of dictionaries (one dictionary per shot) and
        # flatten the lists so that every row holds a single shot. 'assign' builds a new
        # dataframe instead of writing into a filtered slice of the original chunk.
        chunk = chunk.assign(info = chunk["info"].apply(json.loads)).explode("info")
        # Store the information regarding the authors of the shots (one entry per exploded row).
        authors = chunk["author_shot"]
        # Expand every shot dictionary into columns; nested keys become dotted column names.
        chunk = pd.json_normalize(chunk["info"])
        # Reassign the author feature by position: 'json_normalize' returns a fresh index,
        # so the raw values are used instead of index alignment.
        chunk["author_shot"] = authors.values

        # Save the current chunk of data into list.
        chunk_list.append(chunk)

Cumulative not empty chunk: 530000
Cumulative not empty chunk: 540000
Cumulative not empty chunk: 550000
Cumulative not empty chunk: 560000
Cumulative not empty chunk: 570000
Cumulative not empty chunk: 580000
Cumulative not empty chunk: 590000
Cumulative not empty chunk: 600000
Cumulative not empty chunk: 610000
Cumulative not empty chunk: 620000
Cumulative not empty chunk: 630000
Cumulative not empty chunk: 640000
Cumulative not empty chunk: 650000
Cumulative not empty chunk: 670000
Cumulative not empty chunk: 780000


In [11]:
# Stack the per-chunk dataframes (authors with at least one shot published) into a single
# dataframe whose rows are renumbered from 0.
shots = pd.concat(chunk_list, ignore_index = True)

In [12]:
shots.columns

Index(['id', 'title', 'description', 'width', 'height', 'views_count',
       'likes_count', 'comments_count', 'attachments_count', 'rebounds_count',
       'buckets_count', 'created_at', 'updated_at', 'html_url',
       'attachments_url', 'buckets_url', 'comments_url', 'likes_url',
       'projects_url', 'rebounds_url', 'animated', 'tags', 'team',
       'images.hidpi', 'images.normal', 'images.teaser', 'team.id',
       'team.name', 'team.username', 'team.html_url', 'team.avatar_url',
       'team.bio', 'team.location', 'team.links.web', 'team.links.twitter',
       'team.buckets_count', 'team.comments_received_count',
       'team.followers_count', 'team.followings_count', 'team.likes_count',
       'team.likes_received_count', 'team.projects_count',
       'team.rebounds_received_count', 'team.shots_count',
       'team.can_upload_shot', 'team.type', 'team.pro', 'team.buckets_url',
       'team.followers_url', 'team.following_url', 'team.likes_url',
       'team.projects_url', 'tea

In [13]:
# Replace the literal '.' character in all column names with '_'.
# 'regex = False' forces a plain-text match: interpreted as a regular expression, "." would
# match any character, and the default of 'regex' is not the same across pandas versions.
shots.columns = shots.columns.str.replace(".", "_", regex = False)

### Preprocessing

In [14]:
# Load the file that maps the multiple usernames found in the 'users' table to a single username.
# The encoding is passed explicitly so that decoding does not depend on the platform default.
with open("users_mapper_username.json", encoding = "utf-8") as f:
    map_users_multiple_profiles = json.load(f)

In [15]:
# Flatten list of authors with no shots.
authors_no_shots = pd.Series(authors_no_shots).explode().reset_index(drop = True)

In [16]:
# We map the usernames with multiple profiles into 'users' table to have a single value.
# The remapped values are written back with one '.loc[rows, column]' call on the dataframe:
# assigning through 'shots.<column>.loc[...]' is chained indexing and may update a temporary
# copy of the column instead of 'shots' itself.

# 'author_shot'.
author_map = shots.author_shot[shots.author_shot.isin(map_users_multiple_profiles.keys())].map(map_users_multiple_profiles, na_action = "ignore")
print("n° of 'author_shot' usernames remapped: %d" % len(author_map))
shots.loc[author_map.index, "author_shot"] = author_map

# 'team_username'.
team_username_map = shots.team_username[shots.team_username.isin(map_users_multiple_profiles.keys())].map(map_users_multiple_profiles, na_action = "ignore")
print("n° of 'team_username' usernames remapped: %d" % len(team_username_map))
shots.loc[team_username_map.index, "team_username"] = team_username_map

# Authors with no shots ('authors_no_shots' is a Series, so '.loc' already assigns in place).
author_map_no_shots = authors_no_shots[authors_no_shots.isin(map_users_multiple_profiles.keys())].map(map_users_multiple_profiles, na_action = "ignore")
print("n° of 'author no shots' usernames remapped: %d" % len(author_map_no_shots))
authors_no_shots.loc[author_map_no_shots.index] = author_map_no_shots

n° of 'author_shot' usernames remapped: 4559
n° of 'team_username' usernames remapped: 270
n° of 'author no shots' usernames remapped: 440


In [17]:
# We can't check the duplicated rows of 'shots' table due to 'tags' feature that contains lists.

In [18]:
# Check if there are some NaN authors into the authors with at least one shot published.
shots.author_shot.isna().sum()

0

In [19]:
# Check duplicated rows.
authors_no_shots.duplicated().sum()

395

In [20]:
# Remove duplicated rows.
authors_no_shots.drop_duplicates(inplace = True)

In [21]:
# Check if there are some NaN authors into the authors with no shots.
authors_no_shots.isna().sum()

1

In [22]:
# Remove NaN values.
authors_no_shots.dropna(inplace = True)

In [23]:
# Check if there exist an intersection between the users with at least one shot and the users with no shots.
intersection = list(set(shots.author_shot.unique()) & set(authors_no_shots))
intersection

['WongShen',
 'shaun42',
 'M-cony',
 'adityapramana',
 'andy0nly',
 'moveworks',
 'Shawn_',
 'advarto',
 'im_abhishekp',
 'gateui',
 'fishliu',
 'przemob',
 'Tetef',
 'gilsah',
 'skinnersweet',
 'claudiatame7',
 'lekeojo',
 'analemos',
 'Ryan77',
 'dellfi',
 'seahuang',
 'm4rp79',
 'primaua',
 'J_R_Speed',
 'qed42',
 'Gushn',
 'origomez',
 'YoussefEmadEldin',
 'toyfu',
 'isaaclemon',
 'joshbaptista',
 'Mister_Undying',
 'mashenka',
 'julialitkevich',
 'Snow911',
 'Leo_deisgn',
 'hellofello',
 'FabianSM',
 'ivv',
 'noobitter',
 'ShinDoeun',
 'mark41',
 'larsroed',
 'lukas-nkz',
 'humam',
 'Dreamy123',
 'claudiamorales',
 'NANCYGONG']

In [24]:
len(intersection)

48

In [25]:
# Not consider these users into the users with no shots.
authors_no_shots = authors_no_shots.loc[~authors_no_shots.isin(intersection)]

In [26]:
list(set(shots.author_shot.unique()) & set(authors_no_shots))

[]

In [27]:
# We delete all the features containing an 'url' information (not of interest).
shots.drop(["html_url", "attachments_url", "buckets_url", "comments_url", "likes_url", "projects_url", "rebounds_url",
            "team_html_url", "team_avatar_url", "team_links_web", "team_links_twitter", "team_buckets_url", 
            "team_followers_url", "team_following_url", "team_likes_url", "team_projects_url", "team_shots_url", 
            "team_members_url", "team_team_shots_url", "rebound_source_url"], axis = 1, inplace = True)

In [28]:
# We also delete the 'images' url (not of interest).
shots.drop(["images_hidpi", "images_normal", "images_teaser"], axis = 1, inplace = True)

In [29]:
# Rename the 'id' column corresponding to the shot.
shots.rename({"id": "shot_id", "tags": "tag"}, axis = 1, inplace = True)

In [30]:
# Add to the dataframe the information regarding the users with no shots.
no_shots = pd.DataFrame(columns = shots.columns)
no_shots["author_shot"] = authors_no_shots.values

shots = pd.concat([shots, no_shots], ignore_index = False).reset_index(drop = True)

In [31]:
shots.head()

Unnamed: 0,shot_id,title,description,width,height,views_count,likes_count,comments_count,attachments_count,rebounds_count,...,team_projects_count,team_rebounds_received_count,team_shots_count,team_can_upload_shot,team_type,team_pro,team_created_at,team_updated_at,team_members_count,author_shot
0,3549658,Cinema Festival App Part.1,<p>What's up Dribbblers!</...,400,300,1611,51,5,0,0,...,,,,,,,,,,max_palyvoda
1,3254544,Smart Home App Concept,<p>What's up Dribbblers!\n...,400,300,3263,101,7,0,0,...,,,,,,,,,,max_palyvoda
2,3153930,Pre-Order App For Cafe,"<p>Hello, Dribbblers!\n<br...",400,300,5769,192,26,0,0,...,,,,,,,,,,max_palyvoda
3,3732805,Threenity Logo Design,<p>Hellow fellow dribbbler...,400,300,445,44,4,0,0,...,,,,,,,,,,miketanael
4,2686725,Real Estate collaboration ...,<p>ProTitle360 let lawyers...,400,300,1309,27,0,1,0,...,5.0,11.0,121.0,True,Team,False,2014-10-17T01:51:16Z,2017-12-18T17:09:03Z,18.0,citrusbyte


In [32]:
# Check if all the dates end with 'Z': Zulu timezone (UTC). Greenwich zone. No time difference between Greenwich Mean Time and Coordinated Universal Time.
print(shots.team_created_at.dropna().str.endswith("Z").all())
print(shots.team_updated_at.dropna().str.endswith("Z").all())
print(shots.created_at.str.endswith("Z").all())
print(shots.updated_at.str.endswith("Z").all())

True
True
True
True


In [33]:
# Parse the four timestamp columns as UTC datetimes, then drop the timezone information
# so that the UTC offset (+00:00) is not stored.
for column in ["team_created_at", "team_updated_at", "created_at", "updated_at"]:
    shots[column] = pd.to_datetime(shots[column], utc = True).dt.tz_convert(None)

In [34]:
# Check if users into 'team_username' are always contained into 'author_shot'.
set(shots.team_username.dropna().unique()).issubset(set(shots.author_shot.unique()))

False

In [35]:
# There are some team usernames that not have any information about the shots published or not published directly.
difference = list(set(shots.team_username.dropna().unique()) - set(shots.author_shot.unique()))
difference

['uberdigital',
 'plasso',
 'cusy',
 'parasut',
 'dsnmfg',
 'monterail',
 'songkick',
 'brandalmanac',
 'CoSchedule',
 'jivesoftware',
 'needledesignteam',
 'TheVariable',
 'allturtles',
 'btc-media',
 'wearewky',
 'powster',
 'lftechnology',
 'betterup',
 'hellokitka',
 'linitix',
 'piasagames',
 'SAP_PI_TOOLS',
 'paddle',
 'InfoShell',
 'autopilothq',
 'beamery',
 'cprojectpro',
 'naturalcycles',
 'pacifichelm',
 'evermix',
 'nwpropaganda',
 'itomychstudio',
 'EventMobi',
 'inboundfit',
 'microsoftedge',
 'saltedstone',
 'prototypeberlin',
 'Bankin',
 'uxdots',
 'etsy',
 'Wizeline',
 'noaveau',
 'operabrowser',
 'pinsight',
 'webstronauts',
 'codefantasy',
 'stevens',
 'hustledesign',
 'bossanova',
 'DueDilDesigner',
 'builtbygood',
 'EDITED',
 'mytaxi',
 'superluckyboy',
 'zhizai',
 'meural',
 'fasten',
 'sillynessco',
 'mirumee',
 'pie_mappping',
 'ammunitionagency',
 'FormAssembly',
 'xo',
 'bithound',
 'tremendousness',
 'sprk',
 'cameo',
 'bigcartel',
 'kayako',
 'MoneySuperMark

In [36]:
len(difference)

445

In [37]:
# Keep only users from 'team_username' that are also in 'author'.
shots = shots[(shots.team_username.isin(shots.author_shot.unique())) | (shots.team_username.isna())]

In [38]:
list(set(shots.team_username.dropna().unique()) - set(shots.author_shot.unique()))

[]

In [39]:
shots.head()

Unnamed: 0,shot_id,title,description,width,height,views_count,likes_count,comments_count,attachments_count,rebounds_count,...,team_projects_count,team_rebounds_received_count,team_shots_count,team_can_upload_shot,team_type,team_pro,team_created_at,team_updated_at,team_members_count,author_shot
0,3549658,Cinema Festival App Part.1,<p>What's up Dribbblers!</...,400,300,1611,51,5,0,0,...,,,,,,,NaT,NaT,,max_palyvoda
1,3254544,Smart Home App Concept,<p>What's up Dribbblers!\n...,400,300,3263,101,7,0,0,...,,,,,,,NaT,NaT,,max_palyvoda
2,3153930,Pre-Order App For Cafe,"<p>Hello, Dribbblers!\n<br...",400,300,5769,192,26,0,0,...,,,,,,,NaT,NaT,,max_palyvoda
3,3732805,Threenity Logo Design,<p>Hellow fellow dribbbler...,400,300,445,44,4,0,0,...,,,,,,,NaT,NaT,,miketanael
4,2686725,Real Estate collaboration ...,<p>ProTitle360 let lawyers...,400,300,1309,27,0,1,0,...,5.0,11.0,121.0,True,Team,False,2014-10-17 01:51:16,2017-12-18 17:09:03,18.0,citrusbyte


In [40]:
len(shots.shot_id.dropna().unique())

2480532

In [41]:
# Check if the 'updated_at' feature is always greater in time than 'created_at' feature.
timediff = shots.updated_at - shots.created_at
# Count the negative differences with the vectorized Series sum; the built-in 'sum' would
# iterate over every row in Python. Missing differences compare as False and are not counted.
(timediff < pd.Timedelta(0)).sum()

3

In [42]:
anomalies = shots.loc[timediff < pd.Timedelta(0)]
anomalies[["created_at", "updated_at"]]

Unnamed: 0,created_at,updated_at
673241,2017-11-05 06:58:55,2017-11-05 06:00:51
1579052,2017-11-05 06:56:08,2017-11-05 06:16:14
2145251,2014-11-02 06:56:03,2014-11-02 06:11:24


In [43]:
# For these anomalies, we decide to swap the 'created_at' and 'updated_at' features.
# The right-hand side is the same set of rows with the two column labels exchanged. Assigning
# a dataframe through '.loc' matches columns by label, so every anomalous row receives its old
# 'updated_at' as 'created_at' and vice versa; all the other columns are written back unchanged.
shots.loc[anomalies.index] = shots.loc[anomalies.index].rename(columns = {"created_at": "updated_at", "updated_at": "created_at"})

In [44]:
# Recompute the differences to verify that no shot is updated before its creation anymore.
timediff = shots.updated_at - shots.created_at
# Vectorized count (the built-in 'sum' would iterate over every row in Python).
(timediff < pd.Timedelta(0)).sum()

0

In [45]:
# Check if the 'team_updated_at' feature is always greater in time than 'team_created_at' feature.
# The subtraction aligns the two Series on the row index; rows missing from either side give
# a missing difference, which compares as False below and is therefore not counted.
timediff = shots.team_updated_at.dropna() - shots.team_created_at.dropna()
# Vectorized count (the built-in 'sum' would iterate over every row in Python).
(timediff < pd.Timedelta(0)).sum()

0

We want to check whether the teams defined in the *users* table match the teams defined in the current *shots* table.

In [46]:
# Read 'users' table.
users = pd.read_sql("SELECT username, type FROM users", conn)

In [47]:
# Check if the 'team_username' is a subset of teams according to users.
set(shots.team_username.dropna().unique()).issubset(set(users[users.type == "Team"]["username"].unique()))

False

In [48]:
# Difference. This user is not considered as team into 'users' table but as 'Player'.
list(set(shots.team_username.dropna().unique()) - set(users[users.type == "Team"]["username"].unique()))

['BrodieP']

In [49]:
len(shots)

3146120

In [50]:
# We decide to not consider this user.
shots = shots[(shots.author_shot != "BrodieP") & (shots.team_username != "BrodieP")]

In [51]:
len(shots)

3146112

## Tags

We have to extract the tag information, which is stored as a list of strings for each shot. We decide to create an independent dataframe for this information.

In [52]:
tags = shots[["shot_id", "tag", "author_shot"]].set_index(["shot_id", "author_shot"]).explode("tag").reset_index()
tags

Unnamed: 0,shot_id,author_shot,tag
0,3549658.0,max_palyvoda,app
1,3549658.0,max_palyvoda,cinema
2,3549658.0,max_palyvoda,material
3,3549658.0,max_palyvoda,mobile
4,3549658.0,max_palyvoda,movie
...,...,...,...
16632763,,tangduoduo,
16632764,,ioanacioc,
16632765,,DawnGarrett,
16632766,,SmallMili,


In [53]:
# Check duplicated rows.
tags.duplicated().sum()

13413

In [54]:
# Remove duplicated rows.
tags.drop_duplicates(inplace = True)

In [55]:
len(tags)

16619355

In [56]:
len(tags.shot_id.dropna().unique())

2480527

In [57]:
len(tags.author_shot.dropna().unique())

770544

In [58]:
# Save the dataframe into the sql database.
# 'if_exists = "replace"' drops and recreates the table when it is already present, so that the
# notebook can be re-run from scratch; the default ("fail") raises ValueError on a second run.
tags.to_sql("tags", conn, index = False, if_exists = "replace", dtype = {"shot_id": "INT", "author_shot": "TEXT", "tag": "TEXT"})

## Shots

In [59]:
# Remove 'tags' feature.
shots.drop("tag", axis = 1, inplace = True)

In [60]:
# Check duplicated rows.
shots.duplicated().sum()

574

In [61]:
# Remove duplicated rows.
shots.drop_duplicates(inplace = True)

In [62]:
len(shots.shot_id.dropna().unique())

2480527

In [63]:
shots.author_shot.isna().sum()

0

In [64]:
len(shots.author_shot.unique())

770544

Let's look for a unique key for this table. Let's try the key 'author_shot' + 'team_username' + 'shot_id'.

In [65]:
shots.set_index(["author_shot", "team_username", "shot_id"], inplace = True)

In [66]:
shots.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,title,description,width,height,views_count,likes_count,comments_count,attachments_count,rebounds_count,buckets_count,...,team_likes_received_count,team_projects_count,team_rebounds_received_count,team_shots_count,team_can_upload_shot,team_type,team_pro,team_created_at,team_updated_at,team_members_count
author_shot,team_username,shot_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
max_palyvoda,,3549658,Cinema Festival App Part.1,<p>What's up Dribbblers!</...,400,300,1611,51,5,0,0,5,...,,,,,,,,NaT,NaT,
max_palyvoda,,3254544,Smart Home App Concept,<p>What's up Dribbblers!\n...,400,300,3263,101,7,0,0,16,...,,,,,,,,NaT,NaT,
max_palyvoda,,3153930,Pre-Order App For Cafe,"<p>Hello, Dribbblers!\n<br...",400,300,5769,192,26,0,0,19,...,,,,,,,,NaT,NaT,
miketanael,,3732805,Threenity Logo Design,<p>Hellow fellow dribbbler...,400,300,445,44,4,0,0,0,...,,,,,,,,NaT,NaT,
citrusbyte,citrusbyte,2686725,Real Estate collaboration ...,<p>ProTitle360 let lawyers...,400,300,1309,27,0,1,0,1,...,40.0,5.0,11.0,121.0,True,Team,False,2014-10-17 01:51:16,2017-12-18 17:09:03,18.0


In [67]:
shots.index.is_unique

False

In [68]:
len(shots.index)

3145538

In [69]:
len(shots.index.unique())

3143908

In [70]:
# Let's isolate the rows that prevent the key ('author_shot', 'team_username', 'shot_id') from being unique.
# Count how many rows share each key (the key is the current index of 'shots').
not_unique = shots.index.value_counts()
# Number of keys that occur more than once.
print(sum(not_unique > 1))

# Keep only the counts of the repeated keys.
not_unique = not_unique[not_unique > 1]

# 'unique': the rows left after dropping every row of a repeated key.
unique = shots.drop(not_unique.index, axis = 0)
# 'not_unique': all the rows of the repeated keys. Note that the name is rebound here
# from the Series of counts to a dataframe.
not_unique = shots.loc[not_unique.index]

1630


In [71]:
not_unique.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,title,description,width,height,views_count,likes_count,comments_count,attachments_count,rebounds_count,buckets_count,...,team_likes_received_count,team_projects_count,team_rebounds_received_count,team_shots_count,team_can_upload_shot,team_type,team_pro,team_created_at,team_updated_at,team_members_count
author_shot,team_username,shot_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
ogbeniseyi,,4015709,Lockup for Wellsprings,,400,300,107,17,2,0,0,1,...,,,,,,,,NaT,NaT,
ogbeniseyi,,4015709,Lockup for Wellsprings,,400,300,130,17,2,0,0,1,...,,,,,,,,NaT,NaT,
Kim_Chen,,3374117,Business icons,<p>Some icons that I did n...,400,300,273,21,3,0,0,0,...,,,,,,,,NaT,NaT,
Kim_Chen,,3374117,Business icons,<p>Some icons that I did n...,400,300,274,21,3,0,0,0,...,,,,,,,,NaT,NaT,
stavangr,,3793638,Product icon concept for n...,<p>Icon idea for an Androi...,400,300,215,16,0,0,0,0,...,,,,,,,,NaT,NaT,


In [72]:
len(not_unique)

3260

Let's try to reduce this subset of rows with a repeated key to a single row per key, keeping the most recently updated rows according to the two temporal features 'team_updated_at' and 'updated_at'.

In [73]:
not_unique.reset_index(inplace = True)

In [74]:
# Let's keep the rows more updated regarding 'team_updated_at' (valid for only teams and team members).
update = not_unique.groupby(["author_shot", "team_username", "shot_id"], dropna = False)["team_updated_at"].transform("max")

In [75]:
# Update.
not_unique = not_unique[(not_unique["team_updated_at"] == update) | (update.isna())]

In [76]:
# Let's keep the rows more updated regarding 'updated_at' (valid for all users).
update = not_unique.groupby(["author_shot", "team_username", "shot_id"], dropna = False)["updated_at"].transform("max")

In [77]:
# Update.
not_unique = not_unique[not_unique["updated_at"] == update]

In [78]:
# Check the key.
not_unique.set_index(["author_shot", "team_username", "shot_id"], inplace = True)

In [79]:
not_unique.index.is_unique

False

In [80]:
len(not_unique.index)

2937

In [81]:
len(not_unique.index.unique())

1630

In [82]:
not_unique.loc[not_unique.index.value_counts() > 1]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,title,description,width,height,views_count,likes_count,comments_count,attachments_count,rebounds_count,buckets_count,...,team_likes_received_count,team_projects_count,team_rebounds_received_count,team_shots_count,team_can_upload_shot,team_type,team_pro,team_created_at,team_updated_at,team_members_count
author_shot,team_username,shot_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
ogbeniseyi,,4015709,Lockup for Wellsprings,,400,300,107,17,2,0,0,1,...,,,,,,,,NaT,NaT,
ogbeniseyi,,4015709,Lockup for Wellsprings,,400,300,130,17,2,0,0,1,...,,,,,,,,NaT,NaT,
Kim_Chen,,3374117,Business icons,<p>Some icons that I did n...,400,300,273,21,3,0,0,0,...,,,,,,,,NaT,NaT,
Kim_Chen,,3374117,Business icons,<p>Some icons that I did n...,400,300,274,21,3,0,0,0,...,,,,,,,,NaT,NaT,
stavangr,,3793638,Product icon concept for n...,<p>Icon idea for an Androi...,400,300,215,16,0,0,0,0,...,,,,,,,,NaT,NaT,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
upensh,,2886712,Pinterest,<p>I like this style of Pi...,400,300,372,20,0,0,0,0,...,,,,,,,,NaT,NaT,
Kim_Chen,,3402864,404 Page,<p>#daily ui day8 404 page...,400,300,197,10,0,0,1,1,...,,,,,,,,NaT,NaT,
Kim_Chen,,3402864,404 Page,<p>#daily ui day8 404 page...,400,300,198,10,0,0,1,1,...,,,,,,,,NaT,NaT,
akaHomebody,,3037781,$k8rGuRl,"<p>Took a bit of a tumble,...",400,300,3894,105,4,0,0,2,...,,,,,,,,NaT,NaT,


In [83]:
# Let's check what are the features (columns) that change for these profiles.
def features_not_unique(x):
    """Count, for every column of 'x', the distinct values beyond the first one.

    Parameters
    ----------
    x : dataframe whose columns are inspected one by one.

    Returns
    -------
    Series indexed by the column names of 'x'. Each entry is the number of distinct
    non-missing values of the column minus one, or 0 when the column contains only
    missing values.
    """
    def extra_distinct_values(column):
        # A column made only of missing values has nothing that can differ.
        if column.isnull().all():
            return 0
        return column.nunique() - 1

    return x.apply(extra_distinct_values)

# For every column: average, over the keys that are still duplicated, of the number of extra
# distinct values found within a key (0 means the column never differs between the rows of a key).
# It can be read as a share of keys only when no key has more than two distinct values in a column.
not_unique.loc[not_unique.index.value_counts() > 1].reset_index().groupby(["author_shot", "team_username", "shot_id"], dropna = False).apply(features_not_unique).mean(axis = 0).round(3)

author_shot                     0.000
team_username                   0.000
shot_id                         0.000
title                           0.000
description                     0.000
width                           0.000
height                          0.000
views_count                     0.990
likes_count                     0.265
comments_count                  0.018
attachments_count               0.000
rebounds_count                  0.004
buckets_count                   0.026
created_at                      0.000
updated_at                      0.000
animated                        0.000
team                            0.000
team_id                         0.000
team_name                       0.000
team_bio                        0.000
team_location                   0.000
team_buckets_count              0.000
team_comments_received_count    0.000
team_followers_count            0.000
team_followings_count           0.000
team_likes_count                0.000
team_likes_r

In [84]:
not_unique.reset_index(inplace = True)

In [85]:
# Select the rows with more views of the shot.
not_unique = not_unique[not_unique["views_count"] == not_unique.groupby(["author_shot", "team_username", "shot_id"], dropna = False)["views_count"].transform("max")]

In [86]:
not_unique.set_index(["author_shot", "team_username", "shot_id"]).index.is_unique

False

In [87]:
# Select the rows with more likes of the shot.
not_unique = not_unique[not_unique["likes_count"] == not_unique.groupby(["author_shot", "team_username", "shot_id"], dropna = False)["likes_count"].transform("max")]

In [88]:
not_unique.set_index(["author_shot", "team_username", "shot_id"]).index.is_unique

True

In [89]:
shots = pd.concat([unique.reset_index(), not_unique], ignore_index = False).reset_index(drop = True)

In [90]:
len(shots)

3143908

In [91]:
shots.set_index(["author_shot", "team_username", "shot_id"]).index.is_unique

True

In [92]:
shots.head()

Unnamed: 0,author_shot,team_username,shot_id,title,description,width,height,views_count,likes_count,comments_count,...,team_likes_received_count,team_projects_count,team_rebounds_received_count,team_shots_count,team_can_upload_shot,team_type,team_pro,team_created_at,team_updated_at,team_members_count
0,max_palyvoda,,3549658.0,Cinema Festival App Part.1,<p>What's up Dribbblers!</...,400,300,1611,51,5,...,,,,,,,,NaT,NaT,
1,max_palyvoda,,3254544.0,Smart Home App Concept,<p>What's up Dribbblers!\n...,400,300,3263,101,7,...,,,,,,,,NaT,NaT,
2,max_palyvoda,,3153930.0,Pre-Order App For Cafe,"<p>Hello, Dribbblers!\n<br...",400,300,5769,192,26,...,,,,,,,,NaT,NaT,
3,miketanael,,3732805.0,Threenity Logo Design,<p>Hellow fellow dribbbler...,400,300,445,44,4,...,,,,,,,,NaT,NaT,
4,citrusbyte,citrusbyte,2686725.0,Real Estate collaboration ...,<p>ProTitle360 let lawyers...,400,300,1309,27,0,...,40.0,5.0,11.0,121.0,True,Team,False,2014-10-17 01:51:16,2017-12-18 17:09:03,18.0


In [93]:
# Check duplicated rows.
shots.duplicated().sum()

0

In [94]:
len(shots.shot_id.dropna().unique())

2480527

In [95]:
len(shots.author_shot.unique())

770544

In [96]:
shots.set_index(["shot_id", "author_shot"]).index.is_unique

True

In [97]:
shots.drop("team", axis = 1, inplace = True)

In [98]:
shots.columns

Index(['author_shot', 'team_username', 'shot_id', 'title', 'description',
       'width', 'height', 'views_count', 'likes_count', 'comments_count',
       'attachments_count', 'rebounds_count', 'buckets_count', 'created_at',
       'updated_at', 'animated', 'team_id', 'team_name', 'team_bio',
       'team_location', 'team_buckets_count', 'team_comments_received_count',
       'team_followers_count', 'team_followings_count', 'team_likes_count',
       'team_likes_received_count', 'team_projects_count',
       'team_rebounds_received_count', 'team_shots_count',
       'team_can_upload_shot', 'team_type', 'team_pro', 'team_created_at',
       'team_updated_at', 'team_members_count'],
      dtype='object')

In [99]:
# Save the type for the sqlite table.
dtype = {"shot_id": "INT", "title": "TEXT", "description": "TEXT", "width": "INT", "height": "INT", "views_count": "INT",
         "likes_count": "INT", "comments_count": "INT", "attachments_count": "INT", "rebounds_count": "INT", "buckets_count": "INT",  
         "created_at": "TIMESTAMP", "updated_at": "TIMESTAMP", "animated": "BOOLEAN", "team_username": "TEXT", "author_shot": "TEXT", 
         "team_id": "INT", "team_name": "TEXT", "team_bio": "TEXT", "team_location": "TEXT", "team_buckets_count": "INT", 
         "team_comments_received_count": "INT", "team_followers_count": "INT", "team_followings_count": "INT", 
         "team_likes_count": "INT", "team_likes_received_count": "INT", "team_projects_count": "INT",
         "team_rebounds_received_count": "INT", "team_shots_count": "INT", "team_can_upload_shot": "BOOLEAN", 
         "team_type": "TEXT", "team_pro": "BOOLEAN", "team_created_at": "TIMESTAMP", "team_updated_at": "TIMESTAMP", 
         "team_members_count": "INT"}

In [100]:
# Save the dataframe into the sql database.
# 'if_exists = "replace"' drops and recreates the table when it is already present, so that the
# notebook can be re-run from scratch; the default ("fail") raises ValueError on a second run.
shots.to_sql("shots", conn, index = False, if_exists = "replace", dtype = dtype)

In [101]:
conn.close()

### Brief analysis

We analyze the cumulative *_count* features, checking whether their values are positive or negative (negative values are anomalies).

In [102]:
features = ["views_count", "likes_count", "comments_count", "attachments_count", "rebounds_count", 
            "buckets_count"]

In [103]:
# For every cumulative feature, report how many rows hold a negative value.
total_rows = len(shots)
for feature in features:
    is_negative = shots[feature] < 0
    negatives = int(is_negative.sum())
    print("Negative values for '%s': %d of %d" % (feature, negatives, total_rows))

Negative values for 'views_count': 0 of 3143908
Negative values for 'likes_count': 0 of 3143908
Negative values for 'comments_count': 11 of 3143908
Negative values for 'attachments_count': 7 of 3143908
Negative values for 'rebounds_count': 0 of 3143908
Negative values for 'buckets_count': 0 of 3143908


In [104]:
# Get the subset dataframe with at least one anomaly.
anomalies = shots.loc[(shots[features] < 0).any(axis = 1)]
anomalies.head()

Unnamed: 0,author_shot,team_username,shot_id,title,description,width,height,views_count,likes_count,comments_count,...,team_likes_received_count,team_projects_count,team_rebounds_received_count,team_shots_count,team_can_upload_shot,team_type,team_pro,team_created_at,team_updated_at,team_members_count
721,marcoyu,,2033009.0,Beat Leukemia,<p>A project of Leukemia C...,400,300,1431,55,-1,...,,,,,,,,NaT,NaT,
65918,orthonormai,,2327786.0,Free PSD - Bree Products l...,,400,300,3305,39,-1,...,,,,,,,,NaT,NaT,
76204,rgarcia,,1167203.0,Curupaco Landing Page Prop...,,400,300,1722,45,2,...,,,,,,,,NaT,NaT,
285386,DavidSilberb,,715104.0,King Throne 4 Web,<p>busy doing this illustr...,400,300,139,5,-1,...,,,,,,,,NaT,NaT,
319422,Varsano,,2119172.0,Asia To Go UI Design,,400,300,816,31,-1,...,,,,,,,,NaT,NaT,


In [105]:
len(anomalies)

18