Week 7 Instructor-led Lab: Data Manipulation <br>

Author: Parker Munsey <br>
Course: BGEN 632 Grad. Intro. to Python <br>
Term: Spring 2025 <br>

In [1]:
# Import necessary libraries
import pandas as pd
import os 

# Load the GitHub teams dataset.
# The original absolute path stays as the default so the notebook behaves the same on the
# author's machine; setting the GH_TEAMS_CSV environment variable points it at another copy.
GH_TEAMS_CSV = os.environ.get(
    "GH_TEAMS_CSV",
    r"C:\Users\TechnellogicPC\UTW\Python\week7labs\github_teams.csv",
)
gh_teams = pd.read_csv(GH_TEAMS_CSV)


# === Inspect Columns ===

# Column headers
# A bare expression is only echoed when it is the last statement in a cell; in the middle of
# a cell its value is thrown away, so it has to be printed explicitly to show up.
print(gh_teams.columns)

# Number of columns and rows (shape is a (rows, columns) tuple)
print("Number of columns:", gh_teams.shape[1])
print("Number of rows:", gh_teams.shape[0])

# Print each column name on its own line
for column_name in gh_teams.columns:
    print(column_name)

# General dataset info (dtypes and non-null counts); info() writes its report directly
gh_teams.info()

# === Convert object columns to categorical ===
# Collect the object-dtype column labels, then recast those columns one at a time.
object_columns = gh_teams.select_dtypes(include='object').columns
for object_column in object_columns:
    gh_teams[object_column] = gh_teams[object_column].astype('category')

# === Unique values in 'Team_type' and 'Team_size_class' ===
# nunique() reports how many distinct values each column holds.
team_type_unique = gh_teams['Team_type'].nunique()
team_size_class_unique = gh_teams['Team_size_class'].nunique()
print("Unique Team_type values:", team_type_unique)
print("Unique Team_size_class values:", team_size_class_unique)

# === Specific Row and Column Queries ===
# Positions are zero-based, so the 63rd row / 6th column is position (62, 5)
# and the 300th row is position 299.
value_63_6 = gh_teams.iat[62, 5]
print("63rd row, 6th column:", value_63_6)
row_300 = gh_teams.iloc[299]
print("300th row values:\n", row_300)

# Row with index 595 and first 3 columns using three methods
first_three_columns = gh_teams.columns[:3]
print(gh_teams.iloc[595, :3])                                    # by position
print(gh_teams.loc[595, first_three_columns])                    # by label
print(gh_teams.loc[gh_teams.index == 595, first_three_columns])  # by boolean mask on the index

# Row with index 46 and 3rd and 7th columns using two methods
third_and_seventh = gh_teams.columns[[2, 6]]
print(gh_teams.iloc[46, [2, 6]])             # by position
print(gh_teams.loc[46, third_and_seventh])   # by label

# === Create New DataFrame from bot_work Column ===

# Method 1: Series style
# Selecting a single column by label hands back a Series.
just_bot_work = gh_teams['bot_work']
print(just_bot_work)

# Method 2: Create actual DataFrame
# Promote that Series to a one-column DataFrame.
just_bot_work_df = gh_teams['bot_work'].to_frame()
print(just_bot_work_df)

# === Sorting and Filtering Data ===
# Build the row masks once, then combine them per question.
is_human_bot = gh_teams['Team_type'] == 'human-bot'
is_human = gh_teams['Team_type'] == 'human'
size_class = gh_teams['Team_size_class']

# Human-bot teams with bot_members_count >= 2
has_two_plus_bots = gh_teams['bot_members_count'].ge(2)
human_bot_2plus = gh_teams.loc[is_human_bot & has_two_plus_bots]

# Human teams that are Large and human_gini >= 0.75
is_large = size_class == 'Large'
high_gini = gh_teams['human_gini'].ge(0.75)
large_gini = gh_teams.loc[is_human & is_large & high_gini]

# Teams that are Small or Large
small_large_teams = gh_teams.loc[size_class.isin(['Small', 'Large'])]

# Small or Large teams with human_gini <= 0.20
low_gini = small_large_teams['human_gini'].le(0.20)
small_large_low_gini = small_large_teams.loc[low_gini]

# Human-bot teams in the Medium category
is_medium = size_class == 'Medium'
medium_human_bot = gh_teams.loc[is_human_bot & is_medium]

# === Sampling ===

# 50% subsample; the fixed random_state makes the same rows come back on every run
subsample_50 = gh_teams.sample(frac=0.5, random_state=42)

# 8-fold cross-validation
# Slicing with only len // 8 drops up to 7 trailing rows whenever the row count is not a
# multiple of 8. Hand the leftover rows to the first folds, one each, so every row lands in
# exactly one fold (the first `leftover` folds have fold_size + 1 rows, the rest fold_size).
# Folds are contiguous slices in the DataFrame's current row order; nothing is shuffled.
n_folds = 8
fold_size, leftover = divmod(len(gh_teams), n_folds)
fold_bounds = [i * fold_size + min(i, leftover) for i in range(n_folds + 1)]
kfolds = [gh_teams.iloc[fold_bounds[i]:fold_bounds[i + 1]] for i in range(n_folds)]

# Select numeric columns
numeric_df = gh_teams.select_dtypes('number')

# Remove bot_PRReviewComment and bot_MergedPR
unwanted_columns = ['bot_PRReviewComment', 'bot_MergedPR']
numeric_df_cleaned = numeric_df.drop(unwanted_columns, axis='columns')

# Create new DataFrame with selected columns and rename
# rename() returns a new frame, so the selection and renaming happen in one chained step.
renamed_labels = {
    'Team_size_class': 'Size_Class',
    'human_members_count': 'Human_Count',
}
subset_df = gh_teams[['Team_size_class', 'human_members_count']].rename(columns=renamed_labels)

# Display final subset (first five rows)
print(subset_df.head(5))


Number of columns: 19
Number of rows: 608
name_h
Team_type
Team_size_class
human_members_count
bot_members_count
human_work
work_per_human
human_gini
human_Push
human_IssueComments
human_PRReviewComment
human_MergedPR
bot_work
bot_Push
bot_IssueComments
bot_PRReviewComment
bot_MergedPR
eval_survival_day_median
issues_count
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 608 entries, 0 to 607
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   name_h                    608 non-null    object 
 1   Team_type                 608 non-null    object 
 2   Team_size_class           608 non-null    object 
 3   human_members_count       608 non-null    int64  
 4   bot_members_count         608 non-null    int64  
 5   human_work                608 non-null    int64  
 6   work_per_human            608 non-null    float64
 7   human_gini                608 non-null    float64
 8   human_Push     

References: <br>

(Converting object columns to category) <br>
https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.astype.html <br>

(Row/Column Selection with iloc and loc) <br>
https://realpython.com/pandas-python-explore-dataset/#accessing-data-in-pandas <br>

(Creating dataframe from single column) <br>
https://www.w3schools.com/python/pandas/pandas_dataframes.asp <br>

(Pandas DataFrame.sample()) <br>
https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sample.html <br>

Scikit-Learn Documentation – KFold Cross-Validation <br>
https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html <br>
