# Week 8: Advanced Data Manipulation – Instructor-led Lab  
Michelle Calderwood | BGEN 632 – Spring 2025

This lab demonstrates advanced filtering, sorting, and transformation techniques using the piping approach in pandas. Each step mirrors operations you'd find in R's tidyverse.

In [9]:
import pandas as pd
import numpy as np
import os

# Data directory. Defaults to the author's local checkout; set the
# WEEK8_DATA_DIR environment variable to point somewhere else without
# having to edit this cell.
DATA_DIR = os.environ.get(
    "WEEK8_DATA_DIR",
    "/Users/michellecalderwood/Documents/GitHub/week8labs/data",  # Replace with your path
)

# Set working directory so the relative CSV filename below resolves.
# os.chdir raises if DATA_DIR does not exist.
os.chdir(DATA_DIR)
print(os.getcwd())

# Load dataset and show column names, non-null counts, and dtypes
github_df = pd.read_csv("github_teams.csv")
github_df.info()

/Users/michellecalderwood/Documents/GitHub/week8labs/data
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 608 entries, 0 to 607
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   name_h                    608 non-null    object 
 1   Team_type                 608 non-null    object 
 2   Team_size_class           608 non-null    object 
 3   human_members_count       608 non-null    int64  
 4   bot_members_count         608 non-null    int64  
 5   human_work                608 non-null    int64  
 6   work_per_human            608 non-null    float64
 7   human_gini                608 non-null    float64
 8   human_Push                608 non-null    int64  
 9   human_IssueComments       608 non-null    int64  
 10  human_PRReviewComment     608 non-null    int64  
 11  human_MergedPR            608 non-null    int64  
 12  bot_work                  608 non-null    int64  
 13  bot_Pus

### Task 1: Select columns `Team_type`, `human_work`, and `work_per_human`

In [12]:
# Keep only the three named columns, then preview the first five rows.
(
    github_df
    .filter(
        items=["Team_type", "human_work", "work_per_human"],
        axis="columns",
    )
    .head(n=5)
)

Unnamed: 0,Team_type,human_work,work_per_human
0,human-bot,66,33.0
1,human,62,31.0
2,human,211,30.142857
3,human-bot,14579,62.303419
4,human-bot,1625,42.763158


### Task 2: Select columns that end in the letter `t` using regex

In [15]:
# `t$` anchors the match at the end of the label, so only columns whose
# name ends in "t" are kept.
(
    github_df
    .filter(regex="t$", axis="columns")
    .head(n=5)
)

Unnamed: 0,human_members_count,bot_members_count,human_PRReviewComment,bot_PRReviewComment,issues_count
0,2,1,4,0,8.0
1,2,0,0,0,
2,7,0,1,0,46.0
3,234,12,1170,0,4757.0
4,38,8,152,0,777.0


### Task 3: Sort by `Team_size_class`, `human_work`, `work_per_human` in descending order

In [22]:
# Sort on all three keys, each in descending order (first key has priority).
# NOTE(review): Team_size_class is an object (string) column, so it appears to
# sort alphabetically rather than by size -- "Small" rows come first.
(
    github_df
    .sort_values(
        by=["Team_size_class", "human_work", "work_per_human"],
        ascending=[False, False, False],
    )
    .head(n=5)
)

Unnamed: 0,name_h,Team_type,Team_size_class,human_members_count,bot_members_count,human_work,work_per_human,human_gini,human_Push,human_IssueComments,human_PRReviewComment,human_MergedPR,bot_work,bot_Push,bot_IssueComments,bot_PRReviewComment,bot_MergedPR,eval_survival_day_median,issues_count
559,WLaEz_1Nf-YWzHZa8bBgAA/pLoAhZ1cbPT38VYoSdXGmg,human,Small,3,0,3040,1013.333333,0.292105,434,2606,0,0,0,0,0,0,0,3.0,365.0
209,eIPosZ68E2LjtaixYK65EQ/0Rp6D1ZR1w4YspfD1H-PfA,human-bot,Small,3,1,1639,546.333333,0.41326,1114,283,223,19,156,0,156,0,0,21.0,204.0
410,pxIFRbeuXsUzHtQB3vDwVA/JlkO4fgLE3bD5QjXcU-0CQ,human-bot,Small,3,1,913,304.333333,0.549836,130,507,271,5,53,0,53,0,0,50.0,108.0
268,hXoZRbHPbVxh--funPXSiw/iNU0l_SpKVjGfHOp8vUt8w,human-bot,Small,2,2,910,455.0,0.065934,207,369,334,0,114,0,114,0,0,4.0,64.0
603,zTj5tlMWgotzJmQl7BP8wQ/iQ914_smScbUO8BI9JlE6A,human-bot,Small,3,1,855,285.0,0.474854,423,59,373,0,26,0,26,0,0,,


### Task 4: Filter for `human-bot` teams where `bot_members_count` >= 3

In [27]:
# Both conditions must hold: the team is human-bot AND has at least 3 bots.
(
    github_df
    .query(expr="Team_type == 'human-bot' and bot_members_count >= 3")
    .head(n=5)
)

Unnamed: 0,name_h,Team_type,Team_size_class,human_members_count,bot_members_count,human_work,work_per_human,human_gini,human_Push,human_IssueComments,human_PRReviewComment,human_MergedPR,bot_work,bot_Push,bot_IssueComments,bot_PRReviewComment,bot_MergedPR,eval_survival_day_median,issues_count
3,_l5u7I5p4thtW5SjR_9_4w/aZNCdVXta7fh7eCMzZP1CA,human-bot,Large,234,12,14579,62.303419,0.738342,1942,11430,1170,37,1972,0,1972,0,0,1.0,4757.0
4,_l5u7I5p4thtW5SjR_9_4w/m_FpD7PKQHqVXHn2bh7u2g,human-bot,Large,38,8,1625,42.763158,0.666607,203,1270,152,0,302,0,302,0,0,2.0,777.0
89,5Is-_ie16OEGmW1arZm8qg/8UeSk2P76pTG7pPLtxsHTQ,human-bot,Large,17,4,7412,436.0,0.439621,4182,1257,1917,56,358,5,202,151,0,2.0,495.0


### Task 5: Filter for `human` teams that are `Large` with `human_gini` >= 0.75

In [32]:
# All three conditions must hold: human-only team, Large size class,
# and human_gini of at least 0.75.
(
    github_df
    .query(expr="Team_type == 'human' and Team_size_class == 'Large' and human_gini >= 0.75")
    .head(n=5)
)

Unnamed: 0,name_h,Team_type,Team_size_class,human_members_count,bot_members_count,human_work,work_per_human,human_gini,human_Push,human_IssueComments,human_PRReviewComment,human_MergedPR,bot_work,bot_Push,bot_IssueComments,bot_PRReviewComment,bot_MergedPR,eval_survival_day_median,issues_count
138,ASYGR96YA91p3z7MNKjZCA/IB2pZ8ygcvNnlxUdysjSFA,human,Large,12,0,1655,137.916667,0.799446,793,684,178,0,0,0,0,0,0,4.0,190.0
285,IiUao8vA_zm_uEIVVLI-Sw/91ya8vlSP8qgwCllH_6BSw,human,Large,25,0,3599,143.96,0.863507,1249,2350,0,0,0,0,0,0,0,0.0,1245.0
505,uLHPO58cQefwrJUbyhYOKQ/7YWOP8uDEeKDHQMWKqOoYA,human,Large,48,0,5748,119.75,0.78204,1715,3891,142,0,0,0,0,0,0,0.0,1200.0
582,y8Jw59EHVSrsluSuhR5okg/V5vb074jNkzg4YCKforX1Q,human,Large,8,0,277,34.625,0.781137,275,2,0,0,0,0,0,0,0,,


### Task 6: Count teams in `Small` or `Large` size class

In [35]:
# Keep rows whose size class is either listed value, then count the rows.
(
    github_df
    .query(expr="Team_size_class in ['Small', 'Large']")
    .pipe(len)
)

428

### Task 7: Count `Small` or `Large` teams with `human_gini` <= 0.25

In [40]:
# Same size-class filter as the previous task, additionally requiring
# human_gini of at most 0.25; the result is the number of matching rows.
(
    github_df
    .query(expr="Team_size_class in ['Small', 'Large'] and human_gini <= 0.25")
    .pipe(len)
)

89

### Task 8: Count number of `human` teams in `Medium` size class

In [45]:
# Count rows that are human-only teams in the Medium size class.
(
    github_df
    .query(expr="Team_type == 'human' and Team_size_class == 'Medium'")
    .pipe(len)
)

96

### Task 9: Save `Team_size_class` and `work_per_human` as new DataFrame

In [50]:
# Two-column subset of github_df, stored under its own name for later cells.
new_df = github_df.filter(
    items=["Team_size_class", "work_per_human"],
    axis="columns",
)
new_df.head(n=5)

Unnamed: 0,Team_size_class,work_per_human
0,Small,33.0
1,Small,31.0
2,Large,30.142857
3,Large,62.303419
4,Large,42.763158


### Task 10: Rename `work_per_human` to `work_inequality`

In [53]:
# rename returns a new frame, so new_df keeps its original column names.
renamed_df = new_df.rename(columns={"work_per_human": "work_inequality"})
renamed_df.head(n=5)

Unnamed: 0,Team_size_class,work_inequality
0,Small,33.0
1,Small,31.0
2,Large,30.142857
3,Large,62.303419
4,Large,42.763158


### Additional: Rename `eval_survival_day_median` to `issue_resolution_time`

In [58]:
# Select three columns from the full dataset, then rename two of them
# in the same chain; Team_size_class keeps its name.
renamed_full_df = (
    github_df
    .filter(
        items=["Team_size_class", "work_per_human", "eval_survival_day_median"],
        axis="columns",
    )
    .rename(
        columns={
            "work_per_human": "work_inequality",
            "eval_survival_day_median": "issue_resolution_time",
        }
    )
)
renamed_full_df.head(n=5)

Unnamed: 0,Team_size_class,work_inequality,issue_resolution_time
0,Small,33.0,87.0
1,Small,31.0,
2,Large,30.142857,37.0
3,Large,62.303419,1.0
4,Large,42.763158,2.0


## References:


[ChatGPT_Instructor.ipynb](ChatGPT_Instructor.ipynb)