In [1]:
import warnings
# Silence every Python warning for the rest of the session: no category or
# module filter is given, so this also hides deprecation and pandas warnings.
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import sqlite3

In [3]:
# Open the SQLite database that stores the Dribbble dataset.
# PARSE_DECLTYPES asks sqlite3 to convert values according to each column's
# declared type when reading.
conn = sqlite3.connect(
    "dribbble_temporary.db",
    detect_types = sqlite3.PARSE_DECLTYPES,
)
c = conn.cursor()

# Teams

We want to create a table containing information about the **teams**. From the 'shots' table we keep only the rows where 'team_username' is available (not NaN). In this way we can find the members of each team (this information is not available in the 'users' table). 

Building the team structures with this procedure implies that a user who really belongs to a team but has never published a shot will not be included in the 'teams' table built below. Such users may appear in the list of authors with no published shots, but we cannot know which team they belong to.

In [4]:
# Load the shot-level columns together with every team_* attribute from the
# 'shots' table into a DataFrame.
teams = pd.read_sql(
    "SELECT author_shot, team_username, shot_id, created_at, updated_at, "
    "team_id, team_name, team_bio, team_location, team_buckets_count, "
    "team_comments_received_count, team_followers_count, team_followings_count, "
    "team_likes_count, team_likes_received_count, team_projects_count, "
    "team_rebounds_received_count, team_shots_count, team_can_upload_shot, "
    "team_type, team_pro, team_created_at, team_updated_at, team_members_count, "
    "id_author_shot, id_team_username FROM shots",
    conn,
)
teams.head()

Unnamed: 0,author_shot,team_username,shot_id,created_at,updated_at,team_id,team_name,team_bio,team_location,team_buckets_count,...,team_rebounds_received_count,team_shots_count,team_can_upload_shot,team_type,team_pro,team_created_at,team_updated_at,team_members_count,id_author_shot,id_team_username
0,max_palyvoda,,3549658,2017-06-07 07:01:59,2017-06-07 11:46:56,,,,,,...,,,,,,NaT,NaT,,1364989,
1,max_palyvoda,,3254544,2017-01-31 08:47:31,2017-01-31 09:47:32,,,,,,...,,,,,,NaT,NaT,,1364989,
2,max_palyvoda,,3153930,2016-12-13 10:00:26,2016-12-13 11:07:05,,,,,,...,,,,,,NaT,NaT,,1364989,
3,miketanael,,3732805,2017-08-14 04:10:20,2017-08-14 06:19:32,,,,,,...,,,,,,NaT,NaT,,358178,
4,citrusbyte,citrusbyte,2686725,2016-05-02 04:24:29,2016-09-08 08:32:15,673130.0,Citrusbyte,"We are a global team of strategy, design and d...",Los Angeles,0.0,...,11.0,121.0,1.0,Team,0.0,2014-10-17 01:51:16,2017-12-18 17:09:03,18.0,673130,673130.0


In [5]:
# Keep only the shots published on behalf of a team, i.e. rows whose
# team_username is not NaN. Filtering with dropna(subset=...) works on the rows
# directly instead of round-tripping through index labels, so it stays correct
# even if the index ever contains duplicate labels.
teams = teams.dropna(subset = ["team_username"])

In [6]:
# Count the rows that are exact copies of an earlier row.
duplicate_rows = teams.duplicated()
duplicate_rows.sum()

0

In [7]:
# Rename the shot-level columns so they describe the team member and the shot
# timestamps. The frame is rebound rather than mutated with inplace=True, so
# the cell is a plain "input -> output" step and can be re-run safely.
teams = teams.rename(columns = {"created_at": "shot_publication_ts",
                                "updated_at": "shot_last_modification_ts",
                                "author_shot": "member_username",
                                "id_author_shot": "id_member_username"})

In [8]:
# Check that (member, shot, team) identifies every row uniquely.
member_shot_team_index = teams.set_index(["member_username", "shot_id", "team_username"]).index
member_shot_team_index.is_unique

True

In [9]:
# Check that (member, shot) alone already identifies every row uniquely.
member_shot_index = teams.set_index(["member_username", "shot_id"]).index
member_shot_index.is_unique

True

In [10]:
def add_features(x):
    """Summarise the shots that one member published for one team.

    Parameters
    ----------
    x : pandas.DataFrame
        The rows of one group. It must provide the columns
        'shot_publication_ts', 'shot_last_modification_ts',
        'id_member_username' and 'id_team_username'.

    Returns
    -------
    pandas.Series
        'n_of_shots' (number of rows in the group), 'first_shot' (earliest
        publication timestamp), 'last_shot' (latest last-modification
        timestamp), and the two ids taken from the first row of the group.
    """
    # Series.min()/.max() skip missing timestamps; the builtin min()/max() are
    # order-dependent when NaT is present (a leading NaT would be returned).
    # NOTE(review): 'last_shot' is derived from the last-modification timestamp,
    # not from the publication timestamp -- confirm this is intended.
    # The ids are assumed to be constant within a group, so the first row's
    # value is used (same result as unique()[0], without scanning the column).
    new_features = pd.Series({"n_of_shots": len(x),
                              "first_shot": x.shot_publication_ts.min(),
                              "last_shot": x.shot_last_modification_ts.max(),
                              "id_member_username": x.id_member_username.iloc[0],
                              "id_team_username": x.id_team_username.iloc[0]})
    return new_features

# Build one summary row per (member, team) pair and turn the group keys back
# into ordinary columns.
member_team_groups = teams.groupby(["member_username", "team_username"])
teams_feature = member_team_groups.apply(add_features).reset_index()
teams_feature.head()

Unnamed: 0,member_username,team_username,n_of_shots,first_shot,last_shot,id_member_username,id_team_username
0,-jk-,brandsprogram,9,2015-11-23 15:22:33,2016-10-03 09:15:54,506623,938258.0
1,09ui,hi09ui,2,2015-08-06 02:37:23,2016-06-18 08:16:03,900091,759443.0
2,100shapes,100shapes,4,2015-02-12 11:00:06,2015-05-15 14:42:21,760435,760435.0
3,10Clouds,10Clouds,7,2015-05-12 10:25:27,2017-11-10 10:31:29,845019,845019.0
4,10up,10up,1,2017-12-12 19:34:43,2017-12-12 20:35:43,5473,5473.0


In [11]:
# SQL column types used when the summary frame is written to the database.
dtype = {
    "member_username": "TEXT",
    "team_username": "TEXT",
    "n_of_shots": "INT",
    "first_shot": "TIMESTAMP",
    "last_shot": "TIMESTAMP",
    "id_member_username": "INT",
    "id_team_username": "INT",
}

In [12]:
# Save the dataframe into the SQL database as table 'teams'.
# if_exists="replace" makes the cell re-runnable: with the default ("fail"),
# a second run raises ValueError because the table already exists.
teams_feature.to_sql("teams", conn, index = False, dtype = dtype, if_exists = "replace")

In [13]:
# Close the SQLite connection now that the 'teams' table has been written.
conn.close()