In [202]:
import pandas as pd
import networkx as nx
import numpy as np
from collections import defaultdict
from scipy.stats import linregress
from datetime import timedelta
from joblib import Parallel, delayed

In [203]:
project = 'freecad_freecad'

In [204]:
issues = list()
issues_comments = list()
pull_requests = list()
pull_request_comments = list()

temp_df = pd.read_excel(f'Files/{project}_issues.xlsx')
temp_df['project'] = project
issues.append(temp_df)

temp_df = pd.read_excel(f'Files/{project}_issues_comments.xlsx')
temp_df['project'] = project
issues_comments.append(temp_df)

temp_df = pd.read_excel(f'Files/{project}_pull_requests.xlsx')
temp_df['project'] = project
pull_requests.append(temp_df)

temp_df = pd.read_excel(f'Files/{project}_pull_request_comments.xlsx')
temp_df['project'] = project
pull_request_comments.append(temp_df)

In [205]:
df_issues = pd.concat(issues)
df_issues_comments = pd.concat(issues_comments)
df_pull_requests = pd.concat(pull_requests)
df_pull_requests_comments = pd.concat(pull_request_comments)

In [206]:
df_issues['created_by'] = df_issues['created_by']\
    .str.replace('https://api.github.com/users/', '', regex = False)

df_pull_requests['created_by'] = df_pull_requests['created_by']\
    .str.replace('https://api.github.com/users/', '', regex = False)

df_issues_comments['created_by'] = df_issues_comments['created_by']\
    .str.extract(r'login="([^"]+)"')

df_pull_requests_comments['created_by'] = df_pull_requests_comments['created_by']\
    .str.extract(r'login="([^"]+)"')

In [207]:
dim_users = pd.concat([
        df_issues[['created_by', 'project']],
        df_pull_requests[['created_by', 'project']],
        df_issues_comments[['created_by', 'project']],
        df_pull_requests_comments[['created_by', 'project']]
    ])\
    .drop_duplicates()\
    .reset_index()\
    [['created_by', 'project']]

In [208]:
pr_reviewers_by_month = df_pull_requests_comments[['created_by', 'created_at', 'pull_request_id', 'project']]
pr_reviewers_by_month['created_at'] = pd.to_datetime(pr_reviewers_by_month['created_at'], errors='coerce').dt.strftime('%Y-%m')
pr_reviewers_by_month = pr_reviewers_by_month.drop_duplicates()
pr_reviewers_by_month = pr_reviewers_by_month.groupby(['created_at', 'created_by', 'project']).count().reset_index()
pr_reviewers_by_month = pr_reviewers_by_month.rename(columns={'pull_request_id': 'number_of_revisions'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pr_reviewers_by_month['created_at'] = pd.to_datetime(pr_reviewers_by_month['created_at'], errors='coerce').dt.strftime('%Y-%m')


In [209]:
issues_commented = df_issues_comments[['created_by', 'created_at', 'issue_id', 'project']]
issues_commented['created_at'] = pd.to_datetime(issues_commented['created_at'], errors='coerce').dt.strftime('%Y-%m')
issues_commented = issues_commented.drop_duplicates()
issues_commented = issues_commented.groupby(['created_at', 'created_by', 'project']).count().reset_index()
issues_commented = issues_commented.rename(columns={'issue_id': 'number_of_comments_issues'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  issues_commented['created_at'] = pd.to_datetime(issues_commented['created_at'], errors='coerce').dt.strftime('%Y-%m')


In [210]:
issues_created = df_issues[['created_by', 'created_at', 'id', 'project']]
issues_created['created_at'] = pd.to_datetime(issues_created['created_at'], errors='coerce').dt.strftime('%Y-%m')
issues_created = issues_created.drop_duplicates()
issues_created = issues_created.groupby(['created_at', 'created_by', 'project']).count().reset_index()
issues_created = issues_created.rename(columns={'id': 'number_of_issues'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  issues_created['created_at'] = pd.to_datetime(issues_created['created_at'], errors='coerce').dt.strftime('%Y-%m')


In [211]:
pull_request_created = df_pull_requests[['created_by', 'created_at', 'id', 'project']]
pull_request_created['created_at'] = pd.to_datetime(pull_request_created['created_at'], errors='coerce').dt.strftime('%Y-%m')
pull_request_created = pull_request_created.drop_duplicates()
pull_request_created = pull_request_created.groupby(['created_at', 'created_by', 'project']).count().reset_index()
pull_request_created = pull_request_created.rename(columns={'id': 'number_of_pr'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pull_request_created['created_at'] = pd.to_datetime(pull_request_created['created_at'], errors='coerce').dt.strftime('%Y-%m')


In [212]:
df_issues_interaction = pd.concat([
        df_issues_comments[['issue_id', 'created_by', 'created_at', 'project']],
        df_issues[['id', 'created_by', 'created_at', 'project']].rename(columns = {'id': 'issue_id'})
    ], ignore_index=True)

df_pr_interaction = pd.concat(
    [
        df_pull_requests_comments[['pull_request_id', 'created_by', 'created_at', 'project']],
        df_pull_requests[['id', 'created_by', 'created_at', 'project']].rename(columns = {'id': 'pull_request_id'})
    ],
    ignore_index=True
)

df_issues_interaction['object'] = 'Issue'
df_pr_interaction['object'] = 'PullRequest'

df_interaction = pd.concat([
        df_pr_interaction.rename(columns = {'pull_request_id': 'id'}),
        df_issues_interaction.rename(columns = {'issue_id': 'id'})
    ],
    ignore_index=True
)

df_interaction['created_at'] = pd.to_datetime(df_interaction['created_at']).dt.strftime('%Y-%m')

In [213]:
df_interaction = df_interaction\
    .sort_values(by = ['object', 'created_at'])\
    .reset_index(drop=True)

interactions = []
interaction_points = defaultdict(int)

for project in df_interaction['project'].unique():
    temp_df = df_interaction[df_interaction['project'] == project]
    
    for obj in temp_df['object'].unique():
        obj_df = temp_df[temp_df['object'] == obj]

        for page in obj_df['id'].unique():
            page_df = obj_df[obj_df['id'] == page]
            
            previous_users = set()

            for _, row in page_df.iterrows():
                current_user = row['created_by']
                created_at = row['created_at']
                
                for user in previous_users:
                    interaction_points[(current_user, user, created_at, project)] += 1
                
                previous_users.add(current_user)

In [214]:
interactions_df = pd.DataFrame(
    [(dev_a, dev_b, created_at, project, points) for (dev_a, dev_b, created_at, project), points in interaction_points.items()],
    columns=['Developer_A', 'Interacted_With', 'Created_At', 'Project', 'Points']
)

In [215]:
aggregated = interactions_df\
    .groupby(['Developer_A', 'Interacted_With', 'Created_At', 'Project'])\
    .agg(Total_Interactions_A_to_B=('Points', 'sum'))\
    .reset_index()

reverse_interactions = interactions_df\
    .rename(columns={'Developer_A': 'Interacted_With', 'Interacted_With': 'Developer_A'})\
    .groupby(['Developer_A', 'Interacted_With', 'Created_At', 'Project'])\
    .agg(Total_Interactions_B_to_A = ('Points', 'sum'))\
    .reset_index()

df = pd.merge(
        aggregated, 
        reverse_interactions, 
        on = ['Developer_A', 'Interacted_With', 'Created_At', 'Project'], 
        how = 'outer'
    ).fillna(0)

df['Relationship_Strength'] = df[['Total_Interactions_A_to_B', 'Total_Interactions_B_to_A']].min(axis=1)
df = df[df['Developer_A'] != df['Interacted_With']]
df = df.rename(columns={'Interacted_With': 'Developer_B', 'Created_At': 'Date'})

In [216]:
'''monthly_metrics = []

for project in df['Project'].drop_duplicates():
    
    temp_df = df[df['Project'] == project]
    
    for month, month_df in temp_df.groupby(temp_df['Date']):
        
        G = nx.Graph()
        
        for _, row in month_df.iterrows():
            G.add_edge(
                row['Developer_A'], 
                row['Developer_B'], 
                weight=row['Relationship_Strength']
            )
        
        degree_centrality = nx.degree_centrality(G)
        betweenness_centrality = nx.betweenness_centrality(G, weight='weight')
        closeness_centrality = nx.closeness_centrality(G)

        for user in G.nodes():
            user_edges = list(G.edges(user, data=True))
            num_relationships = len(user_edges)
            
            avg_strength = (
                sum(edge_data['weight'] for _, _, edge_data in user_edges) / num_relationships
                if num_relationships > 0 else 0
            )

            monthly_metrics.append({
                'user': user,
                'month': month,
                'degree_centrality': degree_centrality.get(user, 0),
                'betweenness_centrality': betweenness_centrality.get(user, 0),
                'closeness_centrality': closeness_centrality.get(user, 0),
                'num_relationships': num_relationships,
                'avg_strength': avg_strength,
                'project': project,
            })


    network_df = pd.DataFrame(monthly_metrics)'''

"monthly_metrics = []\n\nfor project in df['Project'].drop_duplicates():\n    \n    temp_df = df[df['Project'] == project]\n    \n    for month, month_df in temp_df.groupby(temp_df['Date']):\n        \n        G = nx.Graph()\n        \n        for _, row in month_df.iterrows():\n            G.add_edge(\n                row['Developer_A'], \n                row['Developer_B'], \n                weight=row['Relationship_Strength']\n            )\n        \n        degree_centrality = nx.degree_centrality(G)\n        betweenness_centrality = nx.betweenness_centrality(G, weight='weight')\n        closeness_centrality = nx.closeness_centrality(G)\n\n        for user in G.nodes():\n            user_edges = list(G.edges(user, data=True))\n            num_relationships = len(user_edges)\n            \n            avg_strength = (\n                sum(edge_data['weight'] for _, _, edge_data in user_edges) / num_relationships\n                if num_relationships > 0 else 0\n            )\n\n   

In [217]:
def process_month(project, month, month_df):
    G = nx.Graph()
    for _, row in month_df.iterrows():
        G.add_edge(
            row['Developer_A'], 
            row['Developer_B'], 
            weight=row['Relationship_Strength']
        )
    
    degree_centrality = nx.degree_centrality(G)
    betweenness_centrality = nx.betweenness_centrality(G, weight='weight')
    closeness_centrality = nx.closeness_centrality(G)
    
    month_metrics = []
    for user in G.nodes():
        user_edges = list(G.edges(user, data=True))
        num_relationships = len(user_edges)
        
        avg_strength = (
            sum(edge_data['weight'] for _, _, edge_data in user_edges) / num_relationships
            if num_relationships > 0 else 0
        )

        month_metrics.append({
            'user': user,
            'month': month,
            'degree_centrality': degree_centrality.get(user, 0),
            'betweenness_centrality': betweenness_centrality.get(user, 0),
            'closeness_centrality': closeness_centrality.get(user, 0),
            'num_relationships': num_relationships,
            'avg_strength': avg_strength,
            'project': project,
        })
    
    return month_metrics

def process_project_months(project, temp_df):
    results = Parallel(n_jobs=3)(
        delayed(process_month)(project, month, month_df)
        for month, month_df in temp_df.groupby(temp_df['Date'])
    )
    return [item for sublist in results for item in sublist]

projects = df['Project'].drop_duplicates()
results = Parallel(n_jobs=6)(
    delayed(process_project_months)(project, df[df['Project'] == project])
    for project in projects
)

monthly_metrics = [item for sublist in results for item in sublist]
network_df = pd.DataFrame(monthly_metrics)

In [218]:
df = dim_users\
    .merge(
        pr_reviewers_by_month,
        on=['created_by', 'project'], 
        how='outer'
    )\
    .merge(
        issues_created,
        on=['created_by', 'project', 'created_at'], 
        how='outer'
    )\
    .merge(
        issues_commented,
        on=['created_by', 'project', 'created_at'], 
        how='outer'
    )\
    .merge(
        pull_request_created,
        on=['created_by', 'project', 'created_at'], 
        how='outer'
    )\
    .rename(
        columns = {
            'created_by': 'user',
            'created_at': 'month'
        }
    )\
    .merge(
        network_df,
        on=['user', 'month', 'project']
    )

In [219]:
columns_to_fill = [
    'number_of_revisions',
    'number_of_issues',
    'number_of_comments_issues',
    'number_of_pr',
    'degree_centrality',
    'betweenness_centrality',
    'closeness_centrality',
    'num_relationships',
    'avg_strength',
]

df[columns_to_fill] = df[columns_to_fill].fillna(0)
df['inactive_month'] = False

In [220]:
all_rows = []

for (developer, project), group in df.groupby(['user', 'project']):
    group = group.sort_values(by='month', ascending=True)
    group['month'] = pd.to_datetime(group['month'])
    
    start_month = group['month'].iloc[0]
    end_month = group['month'].iloc[-1] + pd.DateOffset(months=12)
    
    full_months = pd.date_range(start=start_month, end=end_month, freq='MS')
    full_months_period = full_months.to_period('M')
        
    existing_months_period = group['month'].dt.to_period('M')
    missing_months = full_months_period.difference(existing_months_period)
    
    if not missing_months.empty:
        for month in missing_months:
            all_rows.append({
                'user': developer,
                'project': project,
                'month': month.start_time.strftime('%Y-%m'),
                'number_of_revisions': 0,
                'number_of_issues': 0,
                'number_of_comments_issues': 0,
                'number_of_pr': 0,
                'degree_centrality': 0,
                'betweenness_centrality': 0,
                'closeness_centrality': 0,
                'num_relationships': 0,
                'avg_strength': 0,
                'inactive_month': True
            })

if all_rows:
    missing_months_df = pd.DataFrame(all_rows)
    df = pd.concat([df, missing_months_df], ignore_index=True).sort_values(by='month', ascending=True)

In [221]:
df = df[df['month'] < '2024-01']

In [222]:
'''df['month_year'] = pd.to_datetime(df['month']).dt.to_period('M')
df = df.sort_values(by='month', ascending=True)
regression_results = list()

for (developer, project), group in df.groupby(['user', 'project']):
    for idx, current_month in enumerate(group['month']):
        
        current_month = pd.to_datetime(current_month)
        start_month = current_month - pd.DateOffset(months=12)

        full_months = pd.date_range(start=start_month, end=current_month, freq='MS')
        full_months_period = full_months.to_period('M')
        
        group_filtered = group[
            (group['month_year'] >= start_month.to_period('M')) & 
            (group['month_year'] <= current_month.to_period('M'))
        ]
        missing_months = full_months_period.difference(group_filtered['month_year'])
        
        list_missing_months = list() 

        for missing_month in missing_months:
            missing_data = {
                'month': pd.to_datetime(str(missing_month)).strftime('%Y-%m'),
                'month_year': missing_month,
                'user': developer,
                'project': project,
                'inactive_month': True,
                'number_of_revisions': 0,
                'number_of_issues': 0,
                'number_of_comments_issues': 0,
                'number_of_pr': 0,
                'degree_centrality': 0,
                'betweenness_centrality': 0,
                'closeness_centrality': 0,
                'num_relationships': 0,
                'avg_strength': 0,
            }
            list_missing_months.append(missing_data)
        
        current_month_analysis = pd\
            .concat([group_filtered, pd.DataFrame(list_missing_months)], ignore_index=True)\
            .sort_values(by = 'month_year', ascending= False)
        
        for window in [3, 6, 9, 12]:
            window_data = current_month_analysis.iloc[0 : window].reset_index(drop=True)
            x = window_data.index
            
            for metric in [
                'number_of_revisions', 
                'number_of_issues', 
                'number_of_comments_issues', 
                'number_of_pr', 
                'degree_centrality', 
                'betweenness_centrality', 
                'closeness_centrality',
                'num_relationships', 
                'avg_strength'
            ]:
                y = window_data[metric]
                slope, intercept, r_value, p_value, std_err = linregress(x, y)
                
                predicted_y = slope * x + intercept
                residuals = y - predicted_y
                std_dev = np.std(residuals)
                
                result = {
                    'user': developer,
                    'project': project,
                    'current_month': current_month,
                    f'{metric}_{window}_slope': slope,
                    f'{metric}_{window}_intercept': intercept,
                    f'{metric}_{window}_std_dev': std_dev
                }
                
                regression_results.append(result)'''

"df['month_year'] = pd.to_datetime(df['month']).dt.to_period('M')\ndf = df.sort_values(by='month', ascending=True)\nregression_results = list()\n\nfor (developer, project), group in df.groupby(['user', 'project']):\n    for idx, current_month in enumerate(group['month']):\n        \n        current_month = pd.to_datetime(current_month)\n        start_month = current_month - pd.DateOffset(months=12)\n\n        full_months = pd.date_range(start=start_month, end=current_month, freq='MS')\n        full_months_period = full_months.to_period('M')\n        \n        group_filtered = group[\n            (group['month_year'] >= start_month.to_period('M')) & \n            (group['month_year'] <= current_month.to_period('M'))\n        ]\n        missing_months = full_months_period.difference(group_filtered['month_year'])\n        \n        list_missing_months = list() \n\n        for missing_month in missing_months:\n            missing_data = {\n                'month': pd.to_datetime(str(missin

In [223]:
def process_month(developer, project, group, current_month):
    regression_results = []
    current_month = pd.to_datetime(current_month)
    start_month = current_month - pd.DateOffset(months=12)

    full_months = pd.date_range(start=start_month, end=current_month, freq='MS')
    full_months_period = full_months.to_period('M')
    
    group_filtered = group[
        (group['month_year'] >= start_month.to_period('M')) & 
        (group['month_year'] <= current_month.to_period('M'))
    ]
    missing_months = full_months_period.difference(group_filtered['month_year'])
    
    list_missing_months = []
    for missing_month in missing_months:
        missing_data = {
            'month': pd.to_datetime(str(missing_month)).strftime('%Y-%m'),
            'month_year': missing_month,
            'user': developer,
            'project': project,
            'inactive_month': True,
            'number_of_revisions': 0,
            'number_of_issues': 0,
            'number_of_comments_issues': 0,
            'number_of_pr': 0,
            'degree_centrality': 0,
            'betweenness_centrality': 0,
            'closeness_centrality': 0,
            'num_relationships': 0,
            'avg_strength': 0,
        }
        list_missing_months.append(missing_data)
    
    current_month_analysis = pd\
        .concat([group_filtered, pd.DataFrame(list_missing_months)], ignore_index=True)\
        .sort_values(by='month_year', ascending=False)
    
    for window in [3, 6, 9, 12]:
        window_data = current_month_analysis.iloc[0:window].reset_index(drop=True)
        x = window_data.index
        
        for metric in [
            'number_of_revisions', 
            'number_of_issues', 
            'number_of_comments_issues', 
            'number_of_pr', 
            'degree_centrality', 
            'betweenness_centrality', 
            'closeness_centrality',
            'num_relationships', 
            'avg_strength'
        ]:
            y = window_data[metric]
            slope, intercept, r_value, p_value, std_err = linregress(x, y)
            
            predicted_y = slope * x + intercept
            residuals = y - predicted_y
            std_dev = np.std(residuals)
            
            result = {
                'user': developer,
                'project': project,
                'current_month': current_month,
                f'{metric}_{window}_slope': slope,
                f'{metric}_{window}_intercept': intercept,
                f'{metric}_{window}_std_dev': std_dev
            }
            
            regression_results.append(result)
    return regression_results

def process_group(developer, project, group):
    group['month_year'] = pd.to_datetime(group['month']).dt.to_period('M')
    group = group.sort_values(by='month', ascending=True)
    
    results = Parallel(n_jobs=2)(
        delayed(process_month)(developer, project, group, current_month)
        for current_month in group['month']
    )
    return [item for sublist in results for item in sublist]

In [224]:
groups = df.groupby(['user', 'project'])

results = Parallel(n_jobs=6)(
    delayed(process_group)(developer, project, group)
    for (developer, project), group in groups
)

regression_results = [result for group_results in results for result in group_results]

In [225]:
regression_df = pd.DataFrame(regression_results)

regression_df_pivot = regression_df.pivot_table(
    index=['user', 'project', 'current_month'],
    aggfunc='first'
)

regression_df_pivot.columns = [f'{col}' for col in regression_df_pivot.columns]
regression_df_pivot.reset_index(inplace=True)

In [226]:
activity_columns = [
    'avg_strength_3_intercept', 'avg_strength_3_slope', 'avg_strength_3_std_dev', 
    'avg_strength_6_intercept', 'avg_strength_6_slope', 'avg_strength_6_std_dev', 
    'avg_strength_9_intercept', 'avg_strength_9_slope', 'avg_strength_9_std_dev', 
    'avg_strength_12_intercept', 'avg_strength_12_slope', 'avg_strength_12_std_dev', 
    'betweenness_centrality_3_intercept', 'betweenness_centrality_3_slope', 'betweenness_centrality_3_std_dev', 
    'betweenness_centrality_6_intercept', 'betweenness_centrality_6_slope', 'betweenness_centrality_6_std_dev', 
    'betweenness_centrality_9_intercept', 'betweenness_centrality_9_slope', 'betweenness_centrality_9_std_dev', 
    'betweenness_centrality_12_intercept', 'betweenness_centrality_12_slope', 'betweenness_centrality_12_std_dev', 
    'closeness_centrality_3_intercept', 'closeness_centrality_3_slope', 'closeness_centrality_3_std_dev', 
    'closeness_centrality_6_intercept', 'closeness_centrality_6_slope', 'closeness_centrality_6_std_dev', 
    'closeness_centrality_9_intercept', 'closeness_centrality_9_slope', 'closeness_centrality_9_std_dev', 
    'closeness_centrality_12_intercept', 'closeness_centrality_12_slope', 'closeness_centrality_12_std_dev', 
    'degree_centrality_3_intercept', 'degree_centrality_3_slope', 'degree_centrality_3_std_dev', 
    'degree_centrality_6_intercept', 'degree_centrality_6_slope', 'degree_centrality_6_std_dev', 
    'degree_centrality_9_intercept', 'degree_centrality_9_slope', 'degree_centrality_9_std_dev', 
    'degree_centrality_12_intercept', 'degree_centrality_12_slope', 'degree_centrality_12_std_dev', 
    'num_relationships_3_intercept', 'num_relationships_3_slope', 'num_relationships_3_std_dev', 
    'num_relationships_6_intercept', 'num_relationships_6_slope', 'num_relationships_6_std_dev', 
    'num_relationships_9_intercept', 'num_relationships_9_slope', 'num_relationships_9_std_dev', 
    'num_relationships_12_intercept', 'num_relationships_12_slope', 'num_relationships_12_std_dev', 
    'number_of_comments_issues_12_intercept', 'number_of_comments_issues_12_slope', 'number_of_comments_issues_12_std_dev', 
    'number_of_comments_issues_3_intercept', 'number_of_comments_issues_3_slope', 'number_of_comments_issues_3_std_dev', 
    'number_of_comments_issues_6_intercept', 'number_of_comments_issues_6_slope', 'number_of_comments_issues_6_std_dev', 
    'number_of_comments_issues_9_intercept', 'number_of_comments_issues_9_slope', 'number_of_comments_issues_9_std_dev', 
    'number_of_issues_3_intercept', 'number_of_issues_3_slope', 'number_of_issues_3_std_dev', 
    'number_of_issues_6_intercept', 'number_of_issues_6_slope', 'number_of_issues_6_std_dev', 
    'number_of_issues_9_intercept', 'number_of_issues_9_slope', 'number_of_issues_9_std_dev', 
    'number_of_issues_12_intercept', 'number_of_issues_12_slope', 'number_of_issues_12_std_dev', 
    'number_of_pr_3_intercept', 'number_of_pr_3_slope', 'number_of_pr_3_std_dev', 
    'number_of_pr_6_intercept', 'number_of_pr_6_slope', 'number_of_pr_6_std_dev', 
    'number_of_pr_9_intercept', 'number_of_pr_9_slope', 'number_of_pr_9_std_dev', 
    'number_of_pr_12_intercept', 'number_of_pr_12_slope', 'number_of_pr_12_std_dev', 
    'number_of_revisions_3_intercept', 'number_of_revisions_3_slope', 'number_of_revisions_3_std_dev', 
    'number_of_revisions_6_intercept', 'number_of_revisions_6_slope', 'number_of_revisions_6_std_dev', 
    'number_of_revisions_9_intercept', 'number_of_revisions_9_slope', 'number_of_revisions_9_std_dev',
    'number_of_revisions_12_intercept', 'number_of_revisions_12_slope', 'number_of_revisions_12_std_dev'
]

regression_df_pivot['turnover_num'] = regression_df_pivot[activity_columns].sum(axis=1)
regression_df_pivot['turnover'] = regression_df_pivot['turnover_num'].apply(lambda x: 'dead' if x == 0 else 'active')

In [227]:
turnover_stats_list = list() 

for (developer, project), group in regression_df_pivot.groupby(['user', 'project']):

    group = group.sort_values(by='current_month')
    active = True 
    modified_group = []

    for idx, row in group.iterrows():
        if row['turnover'] == 'active': 
            active = True
            modified_group.append(row) 

        elif row['turnover'] == 'dead' and active: 
            row['turnover'] = 'dead'
            modified_group.append(row)
            active = False 

        elif row['turnover'] == 'dead' and not active: 
            pass
    
    turnover_stats_list.append(modified_group)

df = pd.concat([pd.DataFrame(group) for group in turnover_stats_list], ignore_index=True)

In [228]:
turnover_stats_list = list()

for (developer, project), group in df.groupby(['user', 'project']):
    group = group.sort_values(by='current_month', ascending=False)
    active = True
    count = 0
    modified_group = []

    for idx, row in group.iterrows():
        if (row['turnover'] == 'active' and count == 0) or (count > 24):
            active = True
            row['time_to_stop_activity'] = 12
            modified_group.append(row)
        
        elif row['turnover'] == 'dead':
            active = False 
            count = 24
            row['turnover'] = 'dead' 
            row['time_to_stop_activity'] = 0
            modified_group.append(row)

        elif active == False:
            if count > 12:
                row['turnover'] = 'pre-death'
                row['time_to_stop_activity'] = 0
            
            elif count == 12:
                row['turnover'] = 'last-worked-month'
                row['time_to_stop_activity'] = 1
            
            else:
                row['turnover'] = 'last-worked-year'
                row['time_to_stop_activity'] = 13 + (count * -1)

            count = count - 1
            modified_group.append(row)

    turnover_stats_list.append(modified_group)

df = pd.concat([pd.DataFrame(group) for group in turnover_stats_list], ignore_index=True)

In [229]:
df.to_parquet(f'ModelTraining/Data/{project}_metrics.parquet')