In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import json
import sqlite3

from pathlib import Path # Use this universal path converter for all types of operatying systems (Windows, Linux, etc).

In [3]:
# Create (or open) the SQLite database file that will store the tables built from the Dribbble dataset.
conn = sqlite3.connect("dribbble_temporary.db") 
# Cursor on the connection; it is not used by the cells visible in this part of the notebook.
c = conn.cursor()

# Users

This file contains information about Dribbble's **users**.

The corresponding raw data is the file *users.tsv*.

### Raw data visualization

In [4]:
# Load the raw tab-separated file (no header row): a username plus a JSON payload per line.
raw_users_path = Path("raw_data/users.tsv")
users = pd.read_csv(
    raw_users_path,
    sep="\t",
    header=None,
    names=["username", "info"],
)
users

Unnamed: 0,username,info
0,cmaffuccio,"{""id"": 2005984, ""name"": ""Chris Maffuccio"", ""us..."
1,arestov_design,"{""id"": 695302, ""name"": ""Leonid Arestov"", ""user..."
2,space307,"{""id"": 1946520, ""name"": ""~/.space307"", ""userna..."
3,Hido,"{""id"": 1875583, ""name"": ""Hadeer"", ""username"": ..."
4,Avagana,"{""id"": 1386566, ""name"": ""Avagana"", ""username"":..."
...,...,...
771529,hisevaly,"{""teams_count"": 0, ""likes_url"": ""https://api.d..."
771530,bonehaus,"{""pro"": true, ""likes_received_count"": 41724, ""..."
771531,jonhanlan,"{""pro"": false, ""likes_received_count"": 1049, ""..."
771532,stephenandrewmurrill,"{""pro"": true, ""likes_received_count"": 6022, ""f..."


In [5]:
# Keep only the 'info' column: its JSON payload already carries a 'username' field,
# so the separate 'username' column is redundant.
users = users[["info"]]
users

Unnamed: 0,info
0,"{""id"": 2005984, ""name"": ""Chris Maffuccio"", ""us..."
1,"{""id"": 695302, ""name"": ""Leonid Arestov"", ""user..."
2,"{""id"": 1946520, ""name"": ""~/.space307"", ""userna..."
3,"{""id"": 1875583, ""name"": ""Hadeer"", ""username"": ..."
4,"{""id"": 1386566, ""name"": ""Avagana"", ""username"":..."
...,...
771529,"{""teams_count"": 0, ""likes_url"": ""https://api.d..."
771530,"{""pro"": true, ""likes_received_count"": 41724, ""..."
771531,"{""pro"": false, ""likes_received_count"": 1049, ""..."
771532,"{""pro"": true, ""likes_received_count"": 6022, ""f..."


In [6]:
len(users)

771534

### Unpack information from raw data

In [7]:
# Parse the JSON string stored in each row of 'info' into a Python dict.
# `assign` returns a new frame instead of writing into a column of the frame obtained
# by column selection, so no chained assignment takes place (such warnings are silenced
# globally at the top of this notebook and would otherwise go unnoticed).
users = users.assign(info=users["info"].apply(json.loads))

In [8]:
# We normalize semi-structured JSON data into a flat table.
users = pd.json_normalize(users["info"])
users.head()

Unnamed: 0,id,name,username,html_url,avatar_url,bio,location,buckets_count,comments_received_count,followers_count,...,teams_url,created_at,updated_at,links.twitter,members_count,members_url,team_shots_url,links.web,error,message
0,2005984.0,Chris Maffuccio,cmaffuccio,https://dribbble.com/cmaffuccio,https://cdn.dribbble.com/assets/avatar-default...,,"New York, NY",0.0,0.0,0.0,...,https://api.dribbble.com/v1/users/2005984/teams,2017-11-27T02:42:13Z,2017-11-27T02:42:47Z,,,,,,,
1,695302.0,Leonid Arestov,arestov_design,https://dribbble.com/arestov_design,https://cdn.dribbble.com/users/695302/avatars/...,Web &amp; UX/UI designer arestov.design@gmail.com,"Moscow, Russia",5.0,101.0,1106.0,...,https://api.dribbble.com/v1/users/695302/teams,2014-11-17T10:35:07Z,2017-12-12T07:37:24Z,https://twitter.com/arestov_lv,,,,,,
2,1946520.0,~/.space307,space307,https://dribbble.com/space307,https://cdn.dribbble.com/users/1946520/avatars...,"The big, friendly, talented Space307 team is d...","Russia, Saint Petersburg",0.0,0.0,435.0,...,,2017-10-16T15:03:02Z,2017-12-12T11:42:59Z,,8.0,https://api.dribbble.com/v1/teams/1946520/members,https://api.dribbble.com/v1/teams/1946520/shots,http://space307.com,,
3,1875583.0,Hadeer,Hido,https://dribbble.com/Hido,https://cdn.dribbble.com/users/1875583/avatars...,,eygpt,0.0,0.0,0.0,...,https://api.dribbble.com/v1/users/1875583/teams,2017-08-25T13:12:51Z,2017-10-29T11:27:38Z,,,,,,,
4,1386566.0,Avagana,Avagana,https://dribbble.com/Avagana,https://cdn.dribbble.com/users/1386566/avatars...,,,0.0,0.0,0.0,...,https://api.dribbble.com/v1/users/1386566/teams,2016-09-22T12:29:06Z,2017-04-30T15:25:23Z,,,,,,,


### Preprocessing

In [9]:
# Every timestamp is expected to end with 'Z' (Zulu time, i.e. UTC: zero offset from GMT).
for timestamp_column in ("created_at", "updated_at"):
    print(users[timestamp_column].str.endswith("Z").all())

True
True


In [10]:
# Show the rows whose username contains non-printable characters.
is_printable = users["username"].map(lambda name: str(name).isprintable())
users[~is_printable]

Unnamed: 0,id,name,username,html_url,avatar_url,bio,location,buckets_count,comments_received_count,followers_count,...,teams_url,created_at,updated_at,links.twitter,members_count,members_url,team_shots_url,links.web,error,message
543299,15384.0,,,https://dribbble.com/%C2%9A%C2%9A,https://cdn.dribbble.com/users/15384/avatars/o...,,,0.0,0.0,0.0,...,https://api.dribbble.com/v1/users/15384/teams,2011-02-02T05:59:51Z,2015-11-05T14:40:07Z,,,,,,,


In [11]:
# Discard the user whose username is not printable (avoids issues later on).
printable_mask = users["username"].map(lambda name: str(name).isprintable())
users = users[printable_mask]

In [12]:
users.columns

Index(['id', 'name', 'username', 'html_url', 'avatar_url', 'bio', 'location',
       'buckets_count', 'comments_received_count', 'followers_count',
       'followings_count', 'likes_count', 'likes_received_count',
       'projects_count', 'rebounds_received_count', 'shots_count',
       'teams_count', 'can_upload_shot', 'type', 'pro', 'buckets_url',
       'followers_url', 'following_url', 'likes_url', 'projects_url',
       'shots_url', 'teams_url', 'created_at', 'updated_at', 'links.twitter',
       'members_count', 'members_url', 'team_shots_url', 'links.web', 'error',
       'message'],
      dtype='object')

In [13]:
# Remove the features corresponding to links information (not of interest).
# The result is rebound to `users` instead of using inplace=True: `users` is a filtered
# frame at this point, and mutating it in place is exactly what pandas warns against.
link_columns = [
    "html_url", "avatar_url", "buckets_url", "followers_url", "following_url", "likes_url",
    "projects_url", "shots_url", "teams_url", "links.twitter", "members_url", "team_shots_url",
    "links.web",
]
users = users.drop(columns=link_columns)

In [14]:
# Select the rows where at least one of the two fields 'error' and 'message' is filled in.
has_error_info = users["error"].notnull() | users["message"].notnull()
errors = users[has_error_info]
errors.head()

Unnamed: 0,id,name,username,bio,location,buckets_count,comments_received_count,followers_count,followings_count,likes_count,...,shots_count,teams_count,can_upload_shot,type,pro,created_at,updated_at,members_count,error,message
635404,,,,,,,,,,,...,,,,,,,,,404.0,"{\n ""message"": ""Not found.""\n}"
635406,,,,,,,,,,,...,,,,,,,,,404.0,"{\n ""message"": ""Not found.""\n}"
635407,,,,,,,,,,,...,,,,,,,,,404.0,"{\n ""message"": ""Not found.""\n}"
635408,,,,,,,,,,,...,,,,,,,,,404.0,"{\n ""message"": ""Not found.""\n}"
635411,,,,,,,,,,,...,,,,,,,,,404.0,"{\n ""message"": ""Not found.""\n}"


In [15]:
# Drop columns that contain all NaN values.
errors.dropna(axis = 1, how = "all")

Unnamed: 0,error,message
635404,404.0,"{\n ""message"": ""Not found.""\n}"
635406,404.0,"{\n ""message"": ""Not found.""\n}"
635407,404.0,"{\n ""message"": ""Not found.""\n}"
635408,404.0,"{\n ""message"": ""Not found.""\n}"
635411,404.0,"{\n ""message"": ""Not found.""\n}"
...,...,...
635565,404.0,"{\n ""message"": ""Not found.""\n}"
635569,404.0,"{\n ""message"": ""Not found.""\n}"
635572,404.0,"{\n ""message"": ""Not found.""\n}"
635573,404.0,"{\n ""message"": ""Not found.""\n}"


In [16]:
errors.dropna(axis = 1, how = "all").isna().sum()

error      0
message    0
dtype: int64

In [17]:
# Check 'error' column.
errors.error.value_counts()

404.0    85
Name: error, dtype: int64

In [18]:
# Check 'message' column.
errors.message.value_counts()

{\n  "message": "Not found."\n}    85
Name: message, dtype: int64

Whenever 'error' and 'message' have a value, all the other columns are empty. For this reason, we delete these two columns and then remove the rows that are left completely empty.

In [19]:
# Drop the two API-error columns; rebinding (rather than inplace=True) avoids mutating
# a filtered frame in place.
users = users.drop(columns=["error", "message"])

In [20]:
# Delete the rows in which every value is NaN.
# Rebinding (rather than inplace=True) avoids mutating the frame in place.
users = users.dropna(how="all", axis=0)

In [21]:
users.columns

Index(['id', 'name', 'username', 'bio', 'location', 'buckets_count',
       'comments_received_count', 'followers_count', 'followings_count',
       'likes_count', 'likes_received_count', 'projects_count',
       'rebounds_received_count', 'shots_count', 'teams_count',
       'can_upload_shot', 'type', 'pro', 'created_at', 'updated_at',
       'members_count'],
      dtype='object')

In [22]:
len(users)

771448

In [23]:
# Check duplicated rows.
users.duplicated().sum()

10

In [24]:
# Remove the fully duplicated rows (the first occurrence of each is kept).
# Rebinding (rather than inplace=True) avoids mutating the frame in place.
users = users.drop_duplicates()

In [25]:
# Parse the ISO 8601 timestamp strings (all ending in 'Z', i.e. UTC) into datetimes.
# The UTC offset (+00:00) is dropped on purpose, leaving timezone-naive values.
for timestamp_column in ("updated_at", "created_at"):
    parsed_utc = pd.to_datetime(users[timestamp_column], utc=True)
    users[timestamp_column] = parsed_utc.dt.tz_convert(None)

In [26]:
# Count the rows where 'updated_at' is earlier than 'created_at' (expected: none).
timediff = users["updated_at"] - users["created_at"]
int((timediff < pd.Timedelta(0)).sum())

2

In [27]:
anomalies = users.loc[timediff < pd.Timedelta(0)]
anomalies

Unnamed: 0,id,name,username,bio,location,buckets_count,comments_received_count,followers_count,followings_count,likes_count,...,projects_count,rebounds_received_count,shots_count,teams_count,can_upload_shot,type,pro,created_at,updated_at,members_count
364538,986676.0,Sachin Babu,Mercer006,,"India, Kerala, Ernakulam",0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,False,User,False,2015-11-01 06:54:32,2015-11-01 06:05:19,
659373,1454063.0,夏天Bella,sunshine0195,,,0.0,0.0,0.0,0.0,10.0,...,0.0,0.0,0.0,0.0,False,User,False,2016-11-06 06:48:56,2016-11-06 06:11:35,


In [28]:
# For these anomalies, we decide to swap the 'created_at' and 'updated_at' features.
# How the swap works: the right-hand side is the same set of rows with the two column
# labels exchanged; the `.loc` assignment matches columns by label, so each timestamp
# column receives the other one's values while every other column is written back
# unchanged. The recheck in the next cell (0 negative differences) confirms the result.
users.loc[anomalies.index] = users.loc[anomalies.index].rename(columns = {"created_at": "updated_at", "updated_at": "created_at"})

In [29]:
# Recheck: after the swap no row should have 'updated_at' earlier than 'created_at'.
timediff = users["updated_at"] - users["created_at"]
int((timediff < pd.Timedelta(0)).sum())

0

Now, let's see what is the unique key of this dataframe.

In [30]:
users.id.is_unique

False

Some user IDs have multiple 'profiles'.

In [31]:
# Flag every ID that occurs more than once, then count how many such IDs there are.
id_multiple_profiles = users["id"].value_counts() > 1
int(id_multiple_profiles.sum())

689

We want to identify what is the cause of this multiplicity.

In [32]:
# Rows of the IDs with several profiles, indexed by (id, updated_at) and sorted in
# descending order on both levels, so each ID's newest profile comes first.
users_multiple_profiles = (
    users.set_index("id")[id_multiple_profiles]
    .set_index("updated_at", append=True)
    .sort_index(axis=0, level=[0, 1], ascending=False)
)
users_multiple_profiles.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,name,username,bio,location,buckets_count,comments_received_count,followers_count,followings_count,likes_count,likes_received_count,projects_count,rebounds_received_count,shots_count,teams_count,can_upload_shot,type,pro,created_at,members_count
id,updated_at,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2047343.0,2017-12-24 04:00:50,Ngoc T,Ngoc,,"Phnom Penh, Cambodia",0.0,0.0,0.0,7.0,99.0,0.0,0.0,0.0,0.0,0.0,False,User,False,2017-12-23 20:44:41,
2047343.0,2017-12-23 20:45:20,Rith,ngoc,,"Singapore, Singapore",0.0,0.0,0.0,7.0,98.0,0.0,0.0,0.0,0.0,0.0,False,User,False,2017-12-23 20:44:41,
2043767.0,2017-12-23 19:38:58,Alex Austin,alexaustin,Dabbling in design.,"Portland, OR",2.0,0.0,0.0,60.0,25.0,0.0,0.0,0.0,0.0,0.0,False,User,False,2017-12-21 02:19:02,
2043767.0,2017-12-21 02:20:51,Alexandria Austin,alexandriaaustin,,"Portland, OR",2.0,0.0,0.0,53.0,9.0,0.0,0.0,0.0,0.0,0.0,False,User,False,2017-12-21 02:19:02,
2039039.0,2017-12-22 08:08:18,QIUPING,qiuping,,"Nerima, Japan",1.0,0.0,3.0,3.0,41.0,0.0,0.0,0.0,0.0,0.0,True,Player,False,2017-12-18 08:45:17,


Now, we want to verify if the cause of this multiplicity is due to the 'updated_at' field. In other words, we want to verify if the key 'id' + 'updated_at' is unique. 

In [33]:
users_multiple_profiles.index.is_unique

True

In [34]:
# Check the users with > 2 updates (i.e. at least three profile rows for the same ID).
# `groupby(level=0)` groups on the 'id' index level; the `axis` keyword is omitted
# because it is deprecated in recent pandas and axis 0 is the default anyway.
profiles_per_id = users_multiple_profiles.groupby(level=0)["username"].count()
ids_over_two_updates = profiles_per_id[profiles_per_id > 2].index
sub_users_multiple_profiles = users_multiple_profiles[
    users_multiple_profiles.index.get_level_values(0).isin(ids_over_two_updates)
]
sub_users_multiple_profiles

Unnamed: 0_level_0,Unnamed: 1_level_0,name,username,bio,location,buckets_count,comments_received_count,followers_count,followings_count,likes_count,likes_received_count,projects_count,rebounds_received_count,shots_count,teams_count,can_upload_shot,type,pro,created_at,members_count
id,updated_at,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1967609.0,2017-12-28 10:23:31,Λpex,_Apex,T&#39;is the season to be jolly,"Perth, Australia",0.0,0.0,0.0,24.0,57.0,0.0,0.0,0.0,0.0,0.0,False,Prospect,False,2017-10-31 11:49:34,
1967609.0,2017-12-15 11:40:24,Apex,_apex,T&#39;is the season to be jolly,"Perth, Australia",0.0,0.0,0.0,21.0,34.0,0.0,0.0,0.0,0.0,0.0,False,Prospect,False,2017-10-31 11:49:34,
1967609.0,2017-12-09 09:09:22,Poti,Poti,T&#39;is the season to be jolly,"Perth, Australia",0.0,0.0,0.0,20.0,29.0,0.0,0.0,0.0,0.0,0.0,False,Prospect,False,2017-10-31 11:49:34,
1469023.0,2018-01-02 06:57:49,Max,l792836,,,0.0,-2.0,37.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,Player,False,2016-11-15 18:38:56,
1469023.0,2017-12-13 15:45:49,Max,taiomori,,,1.0,-2.0,41.0,194.0,782.0,25.0,0.0,0.0,1.0,0.0,True,Player,False,2016-11-15 18:38:56,
1469023.0,2017-12-12 05:04:12,Max,Taiomori,,,1.0,-1.0,38.0,181.0,763.0,122.0,0.0,0.0,3.0,0.0,True,Player,False,2016-11-15 18:38:56,
1435936.0,2017-12-26 16:51:12,SAY BOND,saybond,Sergei Bondarenko\n,,8.0,0.0,0.0,283.0,521.0,0.0,0.0,0.0,0.0,0.0,False,User,False,2016-10-25 07:30:21,
1435936.0,2017-12-14 21:48:35,SERGEI BOND,sergei_bond,,,5.0,0.0,0.0,259.0,511.0,0.0,0.0,0.0,0.0,0.0,False,User,False,2016-10-25 07:30:21,
1435936.0,2017-12-01 22:00:49,Sergey Bondarenko,say_bond,,"Kharkiv, Ukraine",2.0,0.0,0.0,249.0,494.0,0.0,0.0,0.0,0.0,0.0,False,User,False,2016-10-25 07:30:21,
1336043.0,2017-12-17 17:58:08,Ycaro Design,ycarodesign,Design,,0.0,-40.0,46.0,46.0,0.0,0.0,0.0,0.0,0.0,0.0,True,Player,False,2016-08-20 11:41:15,


In [35]:
users.columns

Index(['id', 'name', 'username', 'bio', 'location', 'buckets_count',
       'comments_received_count', 'followers_count', 'followings_count',
       'likes_count', 'likes_received_count', 'projects_count',
       'rebounds_received_count', 'shots_count', 'teams_count',
       'can_upload_shot', 'type', 'pro', 'created_at', 'updated_at',
       'members_count'],
      dtype='object')

We resolve these multiple profiles by keeping, for each ID, only the most recently updated one. While doing so, we must keep track of the old 'username' values, because the tables of the other datasets (followers, likes, etc.) are keyed on 'username'. For some of these non-unique profiles, the records in the other datasets may refer to an old username; if we deleted the old profiles without recording their usernames, we would lose useful information later on. We therefore save a mapping from old usernames to current ones, to be applied to the usernames of the other tables.

In [36]:
# For every ID keep only the row carrying that ID's latest 'updated_at'.
latest_update_per_id = users.groupby("id")["updated_at"].transform("max")
users = users[users["updated_at"] == latest_update_per_id]
users.head()

Unnamed: 0,id,name,username,bio,location,buckets_count,comments_received_count,followers_count,followings_count,likes_count,...,projects_count,rebounds_received_count,shots_count,teams_count,can_upload_shot,type,pro,created_at,updated_at,members_count
0,2005984.0,Chris Maffuccio,cmaffuccio,,"New York, NY",0.0,0.0,0.0,8.0,0.0,...,0.0,0.0,0.0,0.0,False,User,False,2017-11-27 02:42:13,2017-11-27 02:42:47,
1,695302.0,Leonid Arestov,arestov_design,Web &amp; UX/UI designer arestov.design@gmail.com,"Moscow, Russia",5.0,101.0,1106.0,746.0,5456.0,...,4.0,7.0,65.0,0.0,True,Player,True,2014-11-17 10:35:07,2017-12-12 07:37:24,
2,1946520.0,~/.space307,space307,"The big, friendly, talented Space307 team is d...","Russia, Saint Petersburg",0.0,0.0,435.0,1991.0,2447.0,...,0.0,0.0,39.0,,True,Team,False,2017-10-16 15:03:02,2017-12-12 11:42:59,8.0
3,1875583.0,Hadeer,Hido,,eygpt,0.0,0.0,0.0,75.0,82.0,...,0.0,0.0,0.0,0.0,False,User,False,2017-08-25 13:12:51,2017-10-29 11:27:38,
4,1386566.0,Avagana,Avagana,,,0.0,0.0,0.0,93.0,38.0,...,0.0,0.0,0.0,0.0,False,User,False,2016-09-22 12:29:06,2017-04-30 15:25:23,


In [37]:
users.id.is_unique

True

In [38]:
users.username.is_unique

True

In [39]:
users.username.isna().sum()

0

In [40]:
def mapper_username(x):
    """Map every outdated username of one user to that user's current username.

    Parameters
    ----------
    x : pandas.DataFrame
        The profile rows of a single user; must provide the columns
        'updated_at' and 'username'.

    Returns
    -------
    pandas.Series
        Indexed by the old usernames, each one pointing to the username of the
        row with the greatest 'updated_at'. Empty when all rows share the same
        username.
    """
    # Positional lookup: the index labels (the user ID) are identical across the rows.
    newest_position = x["updated_at"].argmax()
    current_username = x.iloc[newest_position]["username"]

    renames = {}
    for old_username in x["username"].unique():
        if current_username != old_username:
            renames[old_username] = current_username
    return pd.Series(renames)

# Build the {old username -> current username} mapping over all IDs with several profiles.
# `groupby(level=0)` replaces `groupby(axis=0, level=0)`: the `axis` keyword is deprecated
# in recent pandas and axis 0 is the default.
map_users_multiple_profiles = (
    users_multiple_profiles
    .reset_index(level=1)   # move 'updated_at' back to a column; 'id' stays as the index
    .groupby(level=0)       # one group per user ID
    .apply(mapper_username)
)
# Drop the 'id' index level so that only the old usernames remain as dictionary keys.
map_users_multiple_profiles = map_users_multiple_profiles.reset_index(level=0, drop=True).to_dict()
map_users_multiple_profiles

{'SebKay': 'SebKayDesign',
 'c187': 'ipushpixels',
 'tobbe': 'tobiaslundin',
 'Magicsoul': 'arturorey',
 'Scorpion_Blood': 'sergioalves',
 'jonvlasach': 'jonway',
 'bysusanlin': 'mintlodica',
 'zooind': 'madebychaun',
 'artingraphics': 'artin_',
 'AngieRoxyE': 'AngelaEWingard',
 'breo': 'breoworx',
 'erseltanir': 'etanir',
 'millersmith': 'briandmiller',
 'stphn': 'stephenandrewmurrill',
 'trzown': 'bonehaus',
 'HelloCiqo': 'hisevaly',
 'hbaumann': 'hilarybaumann',
 'hecsedli': 'petervaro',
 'rkayaaa': 'moveworks',
 'rishishah': 'iamrishishah',
 'jbarros': 'gbarros',
 'mrmcguire': 'calebmcguire',
 'EfrenRascon': 'efrespmx',
 'goldee': 'luzby',
 'shts258': 'seahuang',
 'JackSprout': 'JacksonCole',
 'solarhringur': 'yesidanderfer',
 'JustRLax': 'JLax86',
 'sayedgfx': 'sayed',
 'meta_punk': '8bit_labs',
 'SuperN8': 'nateryansmith',
 'usefulcoding': 'mcampello',
 'IlanDray': 'CBR_Labs',
 'inkod': 'CBR_Labs',
 'cosminn': 'vareo',
 'Oakillustrations': 'oakillustrations',
 'Jamushroom': 'Jero

In [41]:
set(map_users_multiple_profiles.keys()).intersection(map_users_multiple_profiles.values())

set()

In [42]:
# Save the type for the sqlite table.
dtype = {"id": "INT", "name": "TEXT", "username": "TEXT", "bio": "TEXT", "location": "TEXT", "buckets_count": "INT",
         "comments_received_count": "INT", "followers_count": "INT", "followings_count": "INT", "likes_count": "INT", 
         "likes_received_count": "INT", "projects_count": "INT", "rebounds_received_count": "INT", "shots_count": "INT", 
         "teams_count": "INT", "can_upload_shot": "BOOLEAN", "type": "TEXT", "pro": "BOOLEAN", "created_at": "TIMESTAMP", 
         "updated_at": "TIMESTAMP", "members_count": "INT"}

In [43]:
# Save the dataframe into the sql database.
# if_exists="replace" keeps the notebook re-runnable: with the default ("fail"), running
# this cell again raises ValueError because the 'users' table already exists in the file.
users.to_sql("users", conn, index=False, dtype=dtype, if_exists="replace")

In [44]:
# Save the map for users with multiple profiles.
# The context manager closes the file, so the JSON is fully written to disk
# and no file handle is left open.
with open("users_mapper_username.json", "w") as mapper_file:
    json.dump(map_users_multiple_profiles, mapper_file)

In [45]:
conn.close()

### Brief analysis

We analyze the cumulative *_count* features checking for their positive or negative (anomalies) values.

In [46]:
features = ["comments_received_count", "followers_count", "followings_count", "shots_count", "teams_count", 
            "members_count", "likes_count", "likes_received_count", "buckets_count", "projects_count", 
            "rebounds_received_count"]

In [47]:
# Report, for each cumulative count feature, how many users have a negative value.
for feature in features:
    negative_count = (users[feature] < 0).sum()
    print("Negative values for '%s': %d of %d" % (feature, negative_count, len(users)))

Negative values for 'comments_received_count': 3192 of 770732
Negative values for 'followers_count': 33 of 770732
Negative values for 'followings_count': 3 of 770732
Negative values for 'shots_count': 1 of 770732
Negative values for 'teams_count': 0 of 770732
Negative values for 'members_count': 0 of 770732
Negative values for 'likes_count': 10 of 770732
Negative values for 'likes_received_count': 11 of 770732
Negative values for 'buckets_count': 0 of 770732
Negative values for 'projects_count': 0 of 770732
Negative values for 'rebounds_received_count': 1 of 770732


In [48]:
# Rows that have a negative value in at least one of the count features.
has_negative_count = users[features].lt(0).any(axis=1)
anomalies = users.loc[has_negative_count]
anomalies.head()

Unnamed: 0,id,name,username,bio,location,buckets_count,comments_received_count,followers_count,followings_count,likes_count,...,projects_count,rebounds_received_count,shots_count,teams_count,can_upload_shot,type,pro,created_at,updated_at,members_count
39,194419.0,Adam Zelinski,adamzelinski,Product Designer,"Brisbane, Australia",0.0,-13.0,158.0,390.0,542.0,...,2.0,0.0,13.0,1.0,True,Player,True,2012-08-21 07:02:39,2017-11-30 06:59:02,
91,198829.0,Oleg Gasioshyn,Gasioshyn,Founding Partner &amp; Design Director @ The G...,"Lviv, Ukraine",0.0,-11.0,122.0,226.0,188.0,...,0.0,0.0,4.0,0.0,True,Player,False,2012-08-30 08:53:42,2017-11-02 14:40:05,
97,744044.0,Florian Schulte,schlute,,Germany,2.0,-9.0,150.0,118.0,788.0,...,6.0,1.0,6.0,0.0,True,Player,False,2015-01-24 09:36:36,2017-12-03 15:34:41,
283,798622.0,Gordon Cains,gordoncains,I&#39;m a Product Designer.,"Victoria, BC",0.0,-5.0,57.0,210.0,1113.0,...,1.0,0.0,5.0,0.0,True,Player,True,2015-03-24 06:53:54,2017-12-12 08:39:58,
325,355926.0,Vlad Moroz,vladmoroz,Don&#39;t blow your mind with why’s,Global,0.0,-4.0,42.0,37.0,203.0,...,5.0,0.0,6.0,0.0,True,Player,False,2013-06-23 12:48:01,2017-11-30 10:58:19,


In [49]:
len(anomalies)

3238